├── .editorconfig ├── .github ├── ISSUE_TEMPLATE.md └── workflows │ └── python-publish.yml ├── .gitignore ├── AUTHORS.rst ├── CONTRIBUTING.rst ├── HISTORY.rst ├── LICENSE ├── MANIFEST.in ├── Makefile ├── README.rst ├── docs ├── Makefile ├── authors.rst ├── conf.py ├── contributing.rst ├── dashboard.rst ├── history.rst ├── index.rst ├── installation.rst ├── make.bat ├── modules.rst ├── optimization.rst ├── readme.rst └── usage.rst ├── examples ├── OCTIS_LDA_training_only.ipynb ├── OCTIS_Optimizing_CTM.ipynb └── README.rst ├── logo.png ├── octis ├── __init__.py ├── cli.py ├── configuration │ ├── __init__.py │ ├── citations.py │ └── defaults.py ├── dashboard │ ├── __init__.py │ ├── experimentManager.py │ ├── frameworkScanner.py │ ├── queueManager.py │ ├── server.py │ ├── static │ │ ├── bootstrap.min.js │ │ ├── images │ │ │ ├── batch.png │ │ │ ├── batch_selected.png │ │ │ ├── dataset.png │ │ │ ├── dataset_selected.png │ │ │ ├── down-arrow.png │ │ │ ├── favicon.png │ │ │ ├── logo.png │ │ │ ├── model.png │ │ │ ├── model_selected.png │ │ │ ├── pic1.png │ │ │ └── up-arrow.png │ │ ├── jquery.easing.min.js │ │ ├── jquery.min.js │ │ ├── plotly-latest.min.js │ │ ├── popper.min.js │ │ ├── slick.js │ │ └── styles │ │ │ ├── bootstrap.css │ │ │ ├── slick-theme.css │ │ │ ├── slick.css │ │ │ └── styles.css │ └── templates │ │ ├── CreateExperiments.html │ │ ├── ManageExperiments.html │ │ ├── SingleExperiment.html │ │ ├── VisualizeExperiments.html │ │ ├── index.html │ │ └── serverClosed.html ├── dataset │ ├── __init__.py │ ├── dataset.py │ └── downloader.py ├── evaluation_metrics │ ├── __init__.py │ ├── classification_metrics.py │ ├── coherence_metrics.py │ ├── diversity_metrics.py │ ├── metrics.py │ ├── rbo.py │ ├── similarity_metrics.py │ ├── topic_significance_metrics.py │ ├── word_embeddings_rbo.py │ └── word_embeddings_rbo_centroid.py ├── models │ ├── CTM.py │ ├── DETM.py │ ├── DETM_model │ │ ├── __init__.py │ │ ├── data.py │ │ └── detm.py │ ├── ETM.py │ ├── ETM_model │ │ ├── LICENSE │ │ ├── README.md │ │ ├── __init__.py │ │ ├── data.py │ │ ├── data │ │ │ └── 20ng │ │ │ │ ├── bow_tr_counts.mat │ │ │ │ ├── bow_tr_tokens.mat │ │ │ │ ├── bow_ts_counts.mat │ │ │ │ ├── bow_ts_h1_counts.mat │ │ │ │ ├── bow_ts_h1_tokens.mat │ │ │ │ ├── bow_ts_h2_counts.mat │ │ │ │ ├── bow_ts_h2_tokens.mat │ │ │ │ ├── bow_ts_tokens.mat │ │ │ │ ├── bow_va_counts.mat │ │ │ │ ├── bow_va_tokens.mat │ │ │ │ └── vocab.pkl │ │ ├── etm.py │ │ ├── skipgram.py │ │ └── utils.py │ ├── HDP.py │ ├── LDA.py │ ├── LDA_tomopy.py │ ├── LSI.py │ ├── NMF.py │ ├── NMF_scikit.py │ ├── NeuralLDA.py │ ├── ProdLDA.py │ ├── __init__.py │ ├── base_etm.py │ ├── contextualized_topic_models │ │ ├── LICENSE │ │ ├── __init__.py │ │ ├── contextualized_topic_models.py │ │ ├── datasets │ │ │ ├── __init__.py │ │ │ └── dataset.py │ │ ├── models │ │ │ ├── __init__.py │ │ │ └── ctm.py │ │ ├── networks │ │ │ ├── __init__.py │ │ │ ├── decoding_network.py │ │ │ └── inference_network.py │ │ └── utils │ │ │ ├── __init__.py │ │ │ ├── data_preparation.py │ │ │ └── preprocessing.py │ ├── early_stopping │ │ ├── .gitignore │ │ ├── CODE_OF_CONDUCT.md │ │ ├── LICENSE │ │ ├── MNIST_Early_Stopping_example.ipynb │ │ ├── README.md │ │ ├── __init__.py │ │ ├── checkpoint.pt │ │ ├── loss_plot.png │ │ ├── pytorchtools.py │ │ └── requirements.txt │ ├── model.py │ └── pytorchavitm │ │ ├── AVITM.py │ │ ├── __init__.py │ │ ├── avitm │ │ ├── __init__.py │ │ ├── avitm_model.py │ │ ├── decoder_network.py │ │ └── inference_network.py │ │ └── datasets │ │ ├── __init__.py │ │ └── bow.py ├── 
octis.py ├── optimization │ ├── README.md │ ├── __init__.py │ ├── optimizer.py │ ├── optimizer_evaluation.py │ └── optimizer_tool.py └── preprocessing │ ├── __init__.py │ ├── preprocessing.py │ ├── sources │ ├── M10.py │ ├── custom_dataset.py │ ├── dblp.py │ ├── newsgroup.py │ ├── reuters.py │ ├── source_tools.py │ └── wikipedia.py │ └── stopwords │ └── english.txt ├── preprocessed_datasets ├── 20NewsGroup │ ├── corpus.tsv │ ├── corpus.txt │ ├── labels.txt │ ├── metadata.json │ └── vocabulary.txt ├── BBC_News │ ├── corpus.tsv │ ├── corpus.txt │ ├── labels.txt │ ├── metadata.json │ └── vocabulary.txt ├── DBLP │ ├── corpus.tsv │ ├── corpus.txt │ ├── labels.txt │ ├── metadata.json │ └── vocabulary.txt ├── DBPedia_IT │ ├── corpus.tsv │ ├── indexes.txt │ ├── metadata.json │ └── vocabulary.txt ├── Europarl_IT │ ├── corpus.tsv │ ├── indexes.txt │ ├── metadata.json │ └── vocabulary.txt ├── M10 │ ├── corpus.tsv │ ├── corpus.txt │ ├── labels.txt │ ├── metadata.json │ └── vocabulary.txt ├── README.rst ├── sample_dataset │ ├── corpus.tsv │ ├── metadata.json │ └── vocabulary.txt └── sample_texts │ └── unprepr_docs.txt ├── requirements.txt ├── requirements_dev.txt ├── setup.cfg ├── setup.py ├── tests ├── __init__.py ├── test_datasets.py ├── test_evaluation_metrics.py ├── test_octis.py └── test_optimization.py └── trained_embeddings └── test_example ├── example.bin ├── example.keyedvectors ├── example.pickle ├── example.txt └── headerless_example.txt /.editorconfig: -------------------------------------------------------------------------------- 1 | # http://editorconfig.org 2 | 3 | root = true 4 | 5 | [*] 6 | indent_style = space 7 | indent_size = 4 8 | trim_trailing_whitespace = true 9 | insert_final_newline = true 10 | charset = utf-8 11 | end_of_line = lf 12 | 13 | [*.bat] 14 | indent_style = tab 15 | end_of_line = crlf 16 | 17 | [LICENSE] 18 | insert_final_newline = false 19 | 20 | [Makefile] 21 | indent_style = tab 22 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | * OCTIS version: 2 | * Python version: 3 | * Operating System: 4 | 5 | ### Description 6 | 7 | Describe what you were trying to get done. 8 | Tell us what happened, what went wrong, and what you expected to happen. 9 | 10 | ### What I Did 11 | 12 | ``` 13 | Paste the command(s) you ran and the output. 14 | If there was a crash, please include the traceback here. 15 | ``` 16 | -------------------------------------------------------------------------------- /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | name: Python package 2 | 3 | on: 4 | push: 5 | paths-ignore: 6 | - '**.rst' 7 | pull_request: 8 | branches: ['master'] 9 | 10 | jobs: 11 | build: 12 | 13 | runs-on: ubuntu-latest 14 | strategy: 15 | matrix: 16 | python-version: [3.8, 3.9, "3.10"] 17 | 18 | steps: 19 | - uses: actions/checkout@v2 20 | - name: Set up Python ${{ matrix.python-version }} 21 | uses: actions/setup-python@v2 22 | with: 23 | python-version: ${{ matrix.python-version }} 24 | - name: Install dependencies 25 | run: | 26 | python -m pip install --upgrade pip 27 | pip install flake8 pytest pytest-xdist 28 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 29 | - name: Lint with flake8 30 | run: | 31 | # stop the build if there are Python syntax errors or undefined names 32 | flake8 . 
--count --select=E9,F63,F7,F82 --show-source --statistics 33 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 34 | flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 35 | - name: Test with pytest 36 | run: | 37 | pytest -v 38 | - name: Install dependencies 39 | run: pip install wheel 40 | - name: Build package 41 | run: python setup.py sdist bdist_wheel 42 | - name: Publish package 43 | if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags') 44 | uses: pypa/gh-action-pypi-publish@master 45 | with: 46 | user: __token__ 47 | password: ${{ secrets.pypi_password_octis}} 48 | skip_existing: true 49 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | 58 | # Flask stuff: 59 | instance/ 60 | .webassets-cache 61 | 62 | # Scrapy stuff: 63 | .scrapy 64 | 65 | # Sphinx documentation 66 | docs/_build/ 67 | 68 | # PyBuilder 69 | target/ 70 | 71 | # Jupyter Notebook 72 | .ipynb_checkpoints 73 | 74 | # pyenv 75 | .python-version 76 | 77 | # celery beat schedule file 78 | celerybeat-schedule 79 | 80 | # SageMath parsed files 81 | *.sage.py 82 | 83 | # dotenv 84 | .env 85 | 86 | # virtualenv 87 | .venv 88 | venv/ 89 | ENV/ 90 | 91 | # Spyder project settings 92 | .spyderproject 93 | .spyproject 94 | 95 | # Rope project settings 96 | .ropeproject 97 | 98 | # mkdocs documentation 99 | /site 100 | 101 | # mypy 102 | .mypy_cache/ 103 | 104 | # IDE settings 105 | .vscode/ 106 | 107 | # IDE Pycharm 108 | .idea -------------------------------------------------------------------------------- /AUTHORS.rst: -------------------------------------------------------------------------------- 1 | ======= 2 | Credits 3 | ======= 4 | 5 | Project and Development Lead 6 | ------------------------------ 7 | 8 | * Silvia Terragni 9 | * Elisabetta Fersini University of Milano-Bicocca 10 | * Antonio Candelieri University of Milano-Bicocca 11 | 12 | 13 | Contributors 14 | ------------ 15 | 16 | * Pietro Tropeano Framework architecture, Preprocessing, Topic Models, Evaluation metrics and Web Dashboard 17 | * Bruno Galuzzi Bayesian Optimization 18 | * Silvia Terragni Overall project 19 | 20 | Past Contributors 21 | ------------------ 22 | * Lorenzo Famiglini Neural models integration 23 | * Davide Pietrasanta Bayesian Optimization 24 | -------------------------------------------------------------------------------- /CONTRIBUTING.rst: 
-------------------------------------------------------------------------------- 1 | .. highlight:: shell 2 | 3 | ============ 4 | Contributing 5 | ============ 6 | 7 | Contributions are welcome, and they are greatly appreciated! Every little bit 8 | helps, and credit will always be given. 9 | 10 | You can contribute in many ways: 11 | 12 | Types of Contributions 13 | ---------------------- 14 | 15 | Report Bugs 16 | ~~~~~~~~~~~ 17 | 18 | Report bugs at https://github.com/MIND-Lab/OCTIS/issues. 19 | 20 | If you are reporting a bug, please include: 21 | 22 | * Your operating system name and version. 23 | * Any details about your local setup that might be helpful in troubleshooting. 24 | * Detailed steps to reproduce the bug. 25 | 26 | Fix Bugs 27 | ~~~~~~~~ 28 | 29 | Look through the GitHub issues for bugs. Anything tagged with "bug" and "help 30 | wanted" is open to whoever wants to implement it. 31 | 32 | Implement Features 33 | ~~~~~~~~~~~~~~~~~~ 34 | 35 | Look through the GitHub issues for features. Anything tagged with "enhancement" 36 | and "help wanted" is open to whoever wants to implement it. 37 | 38 | Write Documentation 39 | ~~~~~~~~~~~~~~~~~~~ 40 | 41 | OCTIS could always use more documentation, whether as part of the 42 | official OCTIS docs, in docstrings, or even on the web in blog posts, 43 | articles, and such. 44 | 45 | Submit Feedback 46 | ~~~~~~~~~~~~~~~ 47 | 48 | The best way to send feedback is to file an issue at https://github.com/MIND-Lab/OCTIS/issues. 49 | 50 | If you are proposing a feature: 51 | 52 | * Explain in detail how it would work. 53 | * Keep the scope as narrow as possible, to make it easier to implement. 54 | * Remember that this is a volunteer-driven project, and that contributions 55 | are welcome :) 56 | 57 | Get Started! 58 | ------------ 59 | 60 | Ready to contribute? Here's how to set up `OCTIS` for local development. 61 | 62 | 1. Fork the `OCTIS` repo on GitHub. 63 | 2. Clone your fork locally:: 64 | 65 | $ git clone git@github.com:your_name_here/OCTIS.git 66 | 67 | 3. Install your local copy into a virtualenv. Assuming you have virtualenvwrapper installed, this is how you set up your fork for local development:: 68 | 69 | $ mkvirtualenv OCTIS 70 | $ cd OCTIS/ 71 | $ python setup.py develop 72 | 73 | 4. Create a branch for local development:: 74 | 75 | $ git checkout -b name-of-your-bugfix-or-feature 76 | 77 | Now you can make your changes locally. 78 | 79 | 5. When you're done making changes, check that your changes pass flake8 and the 80 | tests, including testing other Python versions with tox:: 81 | 82 | $ flake8 octis tests 83 | $ python setup.py test or pytest 84 | $ tox 85 | 86 | To get flake8 and tox, just pip install them into your virtualenv. 87 | 88 | 6. Commit your changes and push your branch to GitHub:: 89 | 90 | $ git add . 91 | $ git commit -m "Your detailed description of your changes." 92 | $ git push origin name-of-your-bugfix-or-feature 93 | 94 | 7. Submit a pull request through the GitHub website. 95 | 96 | Pull Request Guidelines 97 | ----------------------- 98 | 99 | Before you submit a pull request, check that it meets these guidelines: 100 | 101 | 1. The pull request should include tests. 102 | 2. If the pull request adds functionality, the docs should be updated. Put 103 | your new functionality into a function with a docstring, and add the 104 | feature to the list in README.rst. 105 | 3. The pull request should work for Python 3.6, 3.7 and 3.8, and for PyPI. 
106 | Make sure you have `enabled workflow actions for your GitHub fork 107 | `_ 108 | and that the tests pass for all supported Python versions. 109 | 110 | Tips 111 | ---- 112 | 113 | To run a subset of tests:: 114 | 115 | $ pytest tests/test_octis.py 116 | 117 | 118 | Deploying 119 | --------- 120 | 121 | A reminder for the maintainers on how to deploy. 122 | Make sure all your changes are committed (including an entry in HISTORY.rst). 123 | Then run:: 124 | 125 | $ bump2version patch # possible: major / minor / patch 126 | $ git push 127 | $ git push --tags 128 | 129 | GitHub Actions will then deploy to PyPI if tests pass. 130 | -------------------------------------------------------------------------------- /HISTORY.rst: -------------------------------------------------------------------------------- 1 | ======= 2 | History 3 | ======= 4 | 5 | 1.14.0 6 | -------------- 7 | * FIX dependencies #117 #123 8 | 9 | 10 | 1.13.1 11 | -------------- 12 | * FIX #106 Fix scikit-learn version 13 | 14 | 15 | 1.13.0 16 | -------------- 17 | * FIX #96 Fix preprocessing with num_processes not None 18 | * FIX #104 fix numpy version 19 | 20 | 1.12.1 21 | -------------- 22 | * FIX #102 fix requirements 23 | 24 | 1.12.0 25 | --------------- 26 | * fix #91 add parameter for setting num of processes for gensim coherence 27 | * FIX pandas error 28 | 29 | 30 | 1.11.1 31 | --------------- 32 | * fix gensim requirements #87 33 | 34 | 35 | 1.11.0 36 | --------------- 37 | * Improve preprocessing #70 38 | * Bug fix CTM num_topics #76 39 | * Add top_words parameter to CTM model #84 40 | * Add seed parameter to CTM #65 41 | * Update some requirements 42 | * Add testing for python 3.9 and remove 3.6 43 | * Minor fixes 44 | 45 | 46 | 1.10.4 (2022-05-20) 47 | -------------------- 48 | * Update metadata Italian datasets 49 | * Fix dataset encoding (#57) 50 | * Fix word embeddings topic coherence (#58) 51 | * Fix dataset name BBC_News (#59) 52 | 53 | 54 | 1.10.3 (2022-02-20) 55 | -------------------- 56 | * Fix KL Divergence in diversity metrics (#51, #52) 57 | 58 | 1.10.2 (2021-12-20) 59 | -------------------- 60 | * Bug fix optimizer evaluation with additional metrics (#46) 61 | 62 | 1.10.1 (2021-12-08) 63 | -------------------- 64 | * Bug fix Coherence with word embeddings (#43, #45) 65 | 66 | 1.10.0 (2021-11-21) 67 | -------------------- 68 | * ETM now supports different formats of word embeddings (#36) 69 | * Bug fix similarity measures (#41) 70 | * Minor fixes 71 | 72 | 1.9.0 (2021-09-27) 73 | ------------------ 74 | * Bug fix preprocessing (#26) 75 | * Bug fix ctm (#28) 76 | * Bug fix weirbo_centroid (#31) 77 | * Added new Italian datasets 78 | * Minor fixes 79 | 80 | 1.8.3 (2021-07-26) 81 | ------------------ 82 | * Gensim migration from 3.8 to >=4.0.0 83 | 84 | 1.8.2 (2021-07-25) 85 | ------------------ 86 | * Fixed unwanted sorting of documents 87 | 88 | 1.8.1 (2021-07-08) 89 | ------------------ 90 | * Fixed gensim version (#22) 91 | 92 | 1.8.0 (2021-06-18) 93 | ------------------ 94 | * Added per-topic kl-uniform significance 95 | 96 | 97 | 1.7.1 (2021-06-09) 98 | ------------------ 99 | * Handling multilabel classification 100 | * Fixed preprocessing when dataset is not split (#17) 101 | 102 | 1.6.0 (2021-05-20) 103 | ------------------ 104 | * Added regularization hyperparameter to NMF_scikit 105 | * Added similarity metrics 106 | * Fixed handling of stopwords in preprocessing 107 | * Fixed coherence and diversity metrics 108 | * Added new metrics tests 109 | 110 | 1.4.0 (2021-05-12) 111 | 
------------------ 112 | * Fixed CTM training when only training dataset is used 113 | * Dashboard bugs fixed 114 | * Minor bug fixes 115 | * Added new tests for TM training 116 | 117 | 1.3.0 (2021-04-25) 118 | ------------------ 119 | * Added parameter num_samples to CTM, NeuralLDA and ProdLDA 120 | * Bug fix AVITM 121 | 122 | 1.2.1 (2021-04-21) 123 | ------------------ 124 | * Bug fix info dataset 125 | 126 | 1.2.0 (2021-04-20) 127 | ------------------ 128 | * Tomotopy LDA's implementation should work now 129 | 130 | 1.1.1 (2021-04-19) 131 | ------------------ 132 | * bug fix dataset download 133 | * CTM is no longer verbose 134 | 135 | 136 | 1.1.0 (2021-04-18) 137 | ------------------ 138 | * New classification metrics 139 | * Vocabulary downloader fix 140 | 141 | 1.0.2 (2021-04-16) 142 | ------------------ 143 | * Dataset downloader fix 144 | 145 | 1.0.0 (2021-04-16) 146 | ------------------ 147 | * New metrics initialization (do not support dictionaries as input anymore) 148 | * Optimization, dataset and dashboard bug fixes 149 | * Refactoring 150 | * Updated README and documentation 151 | 152 | 0.4.0 (2021-04-15) 153 | ------------------ 154 | * Dataset preprocessing produces also an indexes.txt file containing the indexes of the documents 155 | * Eval metrics bug fixes 156 | * BBC news added in the correct format 157 | 158 | 0.3.0 (2021-04-10) 159 | ------------------ 160 | * Bug fixes 161 | 162 | 0.2.0 (2021-03-30) 163 | ------------------ 164 | 165 | * New dataset format 166 | 167 | 168 | 0.1.0 (2021-03-11) 169 | ------------------ 170 | 171 | * First release on PyPI. 172 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020, Silvia Terragni 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | 23 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include AUTHORS.rst 2 | include CONTRIBUTING.rst 3 | include HISTORY.rst 4 | include LICENSE 5 | include README.rst 6 | 7 | recursive-include tests * 8 | recursive-exclude * __pycache__ 9 | recursive-exclude * *.py[co] 10 | 11 | recursive-include docs *.rst conf.py Makefile make.bat *.jpg *.png *.gif 12 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: clean clean-test clean-pyc clean-build docs help 2 | .DEFAULT_GOAL := help 3 | 4 | define BROWSER_PYSCRIPT 5 | import os, webbrowser, sys 6 | 7 | from urllib.request import pathname2url 8 | 9 | webbrowser.open("file://" + pathname2url(os.path.abspath(sys.argv[1]))) 10 | endef 11 | export BROWSER_PYSCRIPT 12 | 13 | define PRINT_HELP_PYSCRIPT 14 | import re, sys 15 | 16 | for line in sys.stdin: 17 | match = re.match(r'^([a-zA-Z_-]+):.*?## (.*)$$', line) 18 | if match: 19 | target, help = match.groups() 20 | print("%-20s %s" % (target, help)) 21 | endef 22 | export PRINT_HELP_PYSCRIPT 23 | 24 | BROWSER := python -c "$$BROWSER_PYSCRIPT" 25 | 26 | help: 27 | @python -c "$$PRINT_HELP_PYSCRIPT" < $(MAKEFILE_LIST) 28 | 29 | clean: clean-build clean-pyc clean-test ## remove all build, test, coverage and Python artifacts 30 | 31 | clean-build: ## remove build artifacts 32 | rm -fr build/ 33 | rm -fr dist/ 34 | rm -fr .eggs/ 35 | find . -name '*.egg-info' -exec rm -fr {} + 36 | find . -name '*.egg' -exec rm -f {} + 37 | 38 | clean-pyc: ## remove Python file artifacts 39 | find . -name '*.pyc' -exec rm -f {} + 40 | find . -name '*.pyo' -exec rm -f {} + 41 | find . -name '*~' -exec rm -f {} + 42 | find . -name '__pycache__' -exec rm -fr {} + 43 | 44 | clean-test: ## remove test and coverage artifacts 45 | rm -fr .tox/ 46 | rm -f .coverage 47 | rm -fr htmlcov/ 48 | rm -fr .pytest_cache 49 | 50 | lint: ## check style with flake8 51 | flake8 octis tests 52 | 53 | test: ## run tests quickly with the default Python 54 | pytest 55 | 56 | test-all: ## run tests on every Python version with tox 57 | tox 58 | 59 | coverage: ## check code coverage quickly with the default Python 60 | coverage run --source octis -m pytest 61 | coverage report -m 62 | coverage html 63 | $(BROWSER) htmlcov/index.html 64 | 65 | docs: ## generate Sphinx HTML documentation, including API docs 66 | rm -f docs/octis.rst 67 | rm -f docs/modules.rst 68 | sphinx-apidoc -o docs/ octis 69 | $(MAKE) -C docs clean 70 | $(MAKE) -C docs html 71 | $(BROWSER) docs/_build/html/index.html 72 | 73 | servedocs: docs ## compile the docs watching for changes 74 | watchmedo shell-command -p '*.rst' -c '$(MAKE) -C docs html' -R -D . 75 | 76 | release: dist ## package and upload a release 77 | twine upload dist/* 78 | 79 | dist: clean ## builds source and wheel package 80 | python setup.py sdist 81 | python setup.py bdist_wheel 82 | ls -l dist 83 | 84 | install: clean ## install the package to the active Python's site-packages 85 | python setup.py install 86 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 
5 | SPHINXOPTS = 6 | SPHINXBUILD = python -msphinx 7 | SPHINXPROJ = octis 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/authors.rst: -------------------------------------------------------------------------------- 1 | .. include:: ../AUTHORS.rst 2 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # octis documentation build configuration file, created by 4 | # sphinx-quickstart on Fri Jun 9 13:47:02 2017. 5 | # 6 | # This file is execfile()d with the current directory set to its 7 | # containing dir. 8 | # 9 | # Note that not all possible configuration values are present in this 10 | # autogenerated file. 11 | # 12 | # All configuration values have a default; values that are commented out 13 | # serve to show the default. 14 | 15 | # If extensions (or modules to document with autodoc) are in another 16 | # directory, add these directories to sys.path here. If the directory is 17 | # relative to the documentation root, use os.path.abspath to make it 18 | # absolute, like shown here. 19 | # 20 | import os 21 | import sys 22 | sys.path.insert(0, os.path.abspath('..')) 23 | 24 | import octis 25 | 26 | # -- General configuration --------------------------------------------- 27 | 28 | # If your documentation needs a minimal Sphinx version, state it here. 29 | # 30 | # needs_sphinx = '1.0' 31 | 32 | # Add any Sphinx extension module names here, as strings. They can be 33 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones. 34 | extensions = ['sphinx.ext.autodoc', 'sphinx.ext.viewcode'] 35 | 36 | # Add any paths that contain templates here, relative to this directory. 37 | templates_path = ['_templates'] 38 | 39 | # The suffix(es) of source filenames. 40 | # You can specify multiple suffix as a list of string: 41 | # 42 | # source_suffix = ['.rst', '.md'] 43 | source_suffix = '.rst' 44 | 45 | # The master toctree document. 46 | master_doc = 'index' 47 | 48 | # General information about the project. 49 | project = 'octis' 50 | copyright = "2020, Silvia Terragni" 51 | author = "Silvia Terragni" 52 | 53 | # The version info for the project you're documenting, acts as replacement 54 | # for |version| and |release|, also used in various other places throughout 55 | # the built documents. 56 | # 57 | # The short X.Y version. 58 | version = octis.__version__ 59 | # The full version, including alpha/beta/rc tags. 60 | release = octis.__version__ 61 | 62 | # The language for content autogenerated by Sphinx. Refer to documentation 63 | # for a list of supported languages. 64 | # 65 | # This is also used if you do content translation via gettext catalogs. 66 | # Usually you set "language" from the command line for these cases. 67 | language = None 68 | 69 | # List of patterns, relative to source directory, that match files and 70 | # directories to ignore when looking for source files. 
71 | # This patterns also effect to html_static_path and html_extra_path 72 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 73 | 74 | # The name of the Pygments (syntax highlighting) style to use. 75 | pygments_style = 'sphinx' 76 | 77 | # If true, `todo` and `todoList` produce output, else they produce nothing. 78 | todo_include_todos = False 79 | 80 | 81 | # -- Options for HTML output ------------------------------------------- 82 | 83 | # The theme to use for HTML and HTML Help pages. See the documentation for 84 | # a list of builtin themes. 85 | # 86 | html_theme = 'sphinx_rtd_theme' 87 | 88 | # Theme options are theme-specific and customize the look and feel of a 89 | # theme further. For a list of options available for each theme, see the 90 | # documentation. 91 | # 92 | # html_theme_options = {} 93 | 94 | # Add any paths that contain custom static files (such as style sheets) here, 95 | # relative to this directory. They are copied after the builtin static files, 96 | # so a file named "default.css" will overwrite the builtin "default.css". 97 | html_static_path = ['_static'] 98 | 99 | 100 | # -- Options for HTMLHelp output --------------------------------------- 101 | 102 | # Output file base name for HTML help builder. 103 | htmlhelp_basename = 'octisdoc' 104 | 105 | 106 | # -- Options for LaTeX output ------------------------------------------ 107 | 108 | latex_elements = { 109 | # The paper size ('letterpaper' or 'a4paper'). 110 | # 111 | # 'papersize': 'letterpaper', 112 | 113 | # The font size ('10pt', '11pt' or '12pt'). 114 | # 115 | # 'pointsize': '10pt', 116 | 117 | # Additional stuff for the LaTeX preamble. 118 | # 119 | # 'preamble': '', 120 | 121 | # Latex figure (float) alignment 122 | # 123 | # 'figure_align': 'htbp', 124 | } 125 | 126 | # Grouping the document tree into LaTeX files. List of tuples 127 | # (source start file, target name, title, author, documentclass 128 | # [howto, manual, or own class]). 129 | latex_documents = [ 130 | (master_doc, 'octis.tex', 131 | 'octis Documentation', 132 | 'Silvia Terragni', 'manual'), 133 | ] 134 | 135 | 136 | # -- Options for manual page output ------------------------------------ 137 | 138 | # One entry per manual page. List of tuples 139 | # (source start file, name, description, authors, manual section). 140 | man_pages = [ 141 | (master_doc, 'octis', 142 | 'octis Documentation', 143 | [author], 1) 144 | ] 145 | 146 | 147 | # -- Options for Texinfo output ---------------------------------------- 148 | 149 | # Grouping the document tree into Texinfo files. List of tuples 150 | # (source start file, target name, title, author, 151 | # dir menu entry, description, category) 152 | texinfo_documents = [ 153 | (master_doc, 'octis', 154 | 'octis Documentation', 155 | author, 156 | 'octis', 157 | 'One line description of project.', 158 | 'Miscellaneous'), 159 | ] 160 | 161 | 162 | 163 | -------------------------------------------------------------------------------- /docs/contributing.rst: -------------------------------------------------------------------------------- 1 | .. include:: ../CONTRIBUTING.rst 2 | -------------------------------------------------------------------------------- /docs/dashboard.rst: -------------------------------------------------------------------------------- 1 | Local dashboard 2 | ================ 3 | 4 | The local dashboard is a user-friendly graphical interface for creating, monitoring, and viewing experiments. 
5 | Following the implementation standards of datasets, models, and metrics, the dashboard will automatically update and allow you to use your custom implementations. 6 | 7 | To run the dashboard, you need to clone the repo. 8 | While in the project directory, run the following command: 9 | 10 | .. code-block:: bash 11 | 12 | python OCTIS/dashboard/server.py --port [port number] --dashboardState [path to dashboard state file] 13 | 14 | The port parameter is optional and the selected port number will be used to host the dashboard server; the default port is 5000. 15 | The dashboardState parameter is optional and the selected JSON file will be used to save the information needed to launch and find the experiments; the default dashboardState path is the current directory. 16 | 17 | The browser will open and you will be redirected to the dashboard. 18 | In the dashboard you can: 19 | 20 | * Create new experiments organized in batches 21 | * Visualize and compare all the experiments 22 | * Visualize a custom experiment 23 | * Manage the experiment queue 24 | 25 | 26 | Using the Dashboard 27 | ------------------- 28 | 29 | When the dashboard opens, the home page will be automatically loaded in your browser. 30 | 31 | Create new experiments 32 | ^^^^^^^^^^^^^^^^^^^^^^ 33 | To create a new experiment, click on the ``CREATE EXPERIMENTS`` tab. 34 | In this tab you have to choose: 35 | 36 | * The folder in which you want to save the experiment results 37 | * The name of the experiment 38 | * The name of the batch of experiments in which the experiment is contained 39 | * The dataset 40 | * The model to optimize 41 | * Hyperparameters of the model to optimize 42 | * Search space of the hyperparameters to optimize 43 | * The metric to optimize 44 | * Parameters of the metric 45 | * Metrics to track [optional] 46 | * Parameters of the metrics to track [optional] 47 | * Optimization parameters 48 | 49 | After that, you can click on ``Start Experiment`` and the experiment will be added to the queue. 50 | 51 | Visualize and compare all the experiments 52 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 53 | To visualize the experiments, click on the ``VISUALIZE EXPERIMENTS`` tab. 54 | In this tab, you can choose which batch (or set of batches) to visualize. 55 | 56 | A plot of each experiment, showing the best-seen evaluation at each iteration, is visualized in a grid. 57 | Alternatively, you can visualize a box plot at each iteration to understand if a given hyper-parameter configuration is noisy (high variance) or not. 58 | 59 | You can interact with the plot of a single experiment, or have a closer look at that experiment by clicking on ``Click here to inspect the results``. 60 | 61 | It is possible to decide in which order to show the experiments and to apply some filters for a more intuitive visualization of the experiments. 62 | 63 | 64 | Visualize a custom experiment 65 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 66 | In the ``VISUALIZE EXPERIMENTS`` tab, after clicking on the ``Click here to inspect the results`` button, you will be redirected to the single experiment tab. 67 | In this tab, you can visualize all the information and statistics related to the experiment, including the best hyper-parameter configuration and the best value of the optimized metric. You can also get an overview of the statistics of the tracked metrics. 
68 | 69 | It is also possible to have a look at a word cloud obtained from the most relevant words of a given topic, scaled by their probability; the topic distribution on each document (and a preview of the document), and the weight of each word of the vocabulary for each topic. 70 | 71 | 72 | Manage the experiment queue 73 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^ 74 | To manage the experiment queue click on the ``MANAGE EXPERIMENTS`` tab. 75 | In this tab, you can pause or resume the execution of an experiment. 76 | You can also change the order of the experiments to perform or delete the ones you are no longer interested in. 77 | 78 | 79 | Frequently used terms 80 | --------------------- 81 | 82 | Batch 83 | ^^^^^ 84 | A batch of experiments is a set of related experiments that can be recognized using a keyword referred to as batch ``name``. 85 | 86 | Model runs 87 | ^^^^^^^^^^ 88 | In the optimization context of the framework, since the performance estimated by the evaluation metrics can be affected by noise, the objective function is computed as the median of a given number of ``model runs`` (i.e., topic models run with the same hyperparameter configuration) -------------------------------------------------------------------------------- /docs/history.rst: -------------------------------------------------------------------------------- 1 | .. include:: ../HISTORY.rst 2 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | Welcome to OCTIS's documentation! 2 | ====================================== 3 | 4 | .. toctree:: 5 | :maxdepth: 2 6 | :caption: Contents: 7 | 8 | readme 9 | installation 10 | usage 11 | modules 12 | optimization 13 | dashboard 14 | contributing 15 | authors 16 | history 17 | 18 | Indices and tables 19 | ================== 20 | * :ref:`genindex` 21 | * :ref:`modindex` 22 | * :ref:`search` 23 | -------------------------------------------------------------------------------- /docs/installation.rst: -------------------------------------------------------------------------------- 1 | .. highlight:: shell 2 | 3 | ============ 4 | Installation 5 | ============ 6 | 7 | 8 | Stable release 9 | -------------- 10 | 11 | To install OCTIS, run this command in your terminal: 12 | 13 | .. code-block:: console 14 | 15 | $ pip install octis 16 | 17 | This is the preferred method to install OCTIS, as it will always install the most recent stable release. 18 | 19 | If you don't have `pip`_ installed, this `Python installation guide`_ can guide 20 | you through the process. 21 | 22 | .. _pip: https://pip.pypa.io 23 | .. _Python installation guide: http://docs.python-guide.org/en/latest/starting/installation/ 24 | 25 | 26 | From sources 27 | ------------ 28 | 29 | The sources for OCTIS can be downloaded from the `Github repo`_. 30 | 31 | You can either clone the public repository: 32 | 33 | .. code-block:: console 34 | 35 | $ git clone git://github.com/mind-lab/octis 36 | 37 | Or download the `tarball`_: 38 | 39 | .. code-block:: console 40 | 41 | $ curl -OJL https://github.com/mind-lab/octis/tarball/master 42 | 43 | Once you have a copy of the source, you can install it with: 44 | 45 | .. code-block:: console 46 | 47 | $ python setup.py install 48 | 49 | 50 | .. _Github repo: https://github.com/mind-lab/octis 51 | .. 
_tarball: https://github.com/mind-lab/octis/tarball/master 52 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=python -msphinx 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | set SPHINXPROJ=octis 13 | 14 | if "%1" == "" goto help 15 | 16 | %SPHINXBUILD% >NUL 2>NUL 17 | if errorlevel 9009 ( 18 | echo. 19 | echo.The Sphinx module was not found. Make sure you have Sphinx installed, 20 | echo.then set the SPHINXBUILD environment variable to point to the full 21 | echo.path of the 'sphinx-build' executable. Alternatively you may add the 22 | echo.Sphinx directory to PATH. 23 | echo. 24 | echo.If you don't have Sphinx installed, grab it from 25 | echo.http://sphinx-doc.org/ 26 | exit /b 1 27 | ) 28 | 29 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 30 | goto end 31 | 32 | :help 33 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 34 | 35 | :end 36 | popd 37 | -------------------------------------------------------------------------------- /docs/modules.rst: -------------------------------------------------------------------------------- 1 | ================ 2 | Modules 3 | ================ 4 | 5 | 6 | .. highlight:: shell 7 | 8 | ---------------- 9 | Dataset 10 | ---------------- 11 | 12 | .. automodule:: octis.dataset.dataset 13 | :members: 14 | 15 | ------------------------ 16 | Data Preprocessing 17 | ------------------------ 18 | 19 | .. automodule:: octis.preprocessing.preprocessing 20 | :members: 21 | 22 | ------------------------ 23 | Evaluation Measures 24 | ------------------------ 25 | 26 | .. automodule:: octis.evaluation_metrics.metrics 27 | :members: 28 | 29 | 30 | .. automodule:: octis.evaluation_metrics.coherence_metrics 31 | :members: 32 | 33 | .. automodule:: octis.evaluation_metrics.diversity_metrics 34 | :members: 35 | 36 | .. automodule:: octis.evaluation_metrics.classification_metrics 37 | :members: 38 | 39 | .. automodule:: octis.evaluation_metrics.topic_significance_metrics 40 | :members: 41 | ------------------------ 42 | Optimization 43 | ------------------------ 44 | 45 | .. automodule:: octis.optimization.optimizer 46 | :members: 47 | 48 | .. automodule:: octis.optimization.optimizer_tool 49 | :members: 50 | 51 | 52 | ------------------------ 53 | Models 54 | ------------------------ 55 | 56 | .. automodule:: octis.models.model 57 | :members: 58 | 59 | .. automodule:: octis.models.LDA 60 | :members: 61 | 62 | .. automodule:: octis.models.NMF 63 | :members: 64 | 65 | .. automodule:: octis.models.NMF_scikit 66 | :members: 67 | 68 | .. automodule:: octis.models.CTM 69 | :members: 70 | 71 | .. automodule:: octis.models.ETM 72 | :members: 73 | -------------------------------------------------------------------------------- /docs/optimization.rst: -------------------------------------------------------------------------------- 1 | ================================ 2 | Hyper-parameter optimization 3 | ================================ 4 | 5 | The core of the OCTIS framework consists of an efficient and user-friendly way to select the best hyper-parameters for a Topic Model 6 | using Bayesian Optimization. 7 | 8 | To initialize an optimization, create an instance of the Optimizer class: 9 | 10 | .. 
code-block:: python 11 | 12 | from octis.optimization.optimizer import Optimizer 13 | optimizer = Optimizer() 14 | 15 | Choose the dataset you want to analyze. 16 | 17 | .. code-block:: python 18 | 19 | from octis.dataset.dataset import Dataset 20 | dataset = Dataset() 21 | dataset.load("octis/preprocessed_datasets/M10") 22 | 23 | Choose a Topic Model. 24 | 25 | .. code-block:: python 26 | 27 | from octis.models.LDA import LDA 28 | model = LDA() 29 | model.hyperparameters.update({"num_topics": 25}) 30 | 31 | Choose the metric function to optimize. 32 | 33 | .. code-block:: python 34 | 35 | from octis.evaluation_metrics.coherence_metrics import Coherence 36 | metric_parameters = { 37 | 'texts': dataset.get_corpus(), 38 | 'topk': 10, 39 | 'measure': 'c_npmi' 40 | } 41 | npmi = Coherence(metric_parameters) 42 | 43 | Create the search space for the optimization. 44 | 45 | .. code-block:: python 46 | 47 | from skopt.space.space import Real 48 | search_space = { 49 | "alpha": Real(low=0.001, high=5.0), 50 | "eta": Real(low=0.001, high=5.0) 51 | } 52 | 53 | Finally, launch the optimization. 54 | 55 | .. code-block:: python 56 | 57 | optimization_result=optimizer.optimize(model, 58 | dataset, 59 | npmi, 60 | search_space, 61 | number_of_call=10, 62 | n_random_starts=3, 63 | model_runs=3, 64 | save_name="result", 65 | surrogate_model="RF", 66 | acq_func="LCB" 67 | ) 68 | 69 | where: 70 | 71 | * number_of_call: int, default: 5. Number of function evaluations. 72 | * n_random_starts: int, default: 1. Number of random points used to initialize the BO. 73 | * model_runs: int, default: 3. Number of model runs. 74 | * save_name: str, default: "results". Name of the JSON file where all the results are saved. 75 | * surrogate_model: str, default: "RF". Probabilistic surrogate model used to build the prior on the objective function. Can be either: 76 | 77 | * "RF" for Random Forest regression 78 | * "GP" for Gaussian Process regression 79 | * "ET" for Extra-tree Regression 80 | 81 | * acq_func: str, default: "EI". Acquisition function used to optimize the surrogate model. Can be either: 82 | 83 | * "LCB" for lower confidence bound 84 | * "EI" for expected improvement 85 | * "PI" for probability of improvement 86 | 87 | The results of the optimization are saved in the JSON file by default. However, you can also save the results of the optimization in a user-friendly CSV file: 88 | 89 | .. code-block:: python 90 | 91 | optimization_result.save_to_csv("results.csv") 92 | 93 | Resume the optimization 94 | ------------------------- 95 | 96 | Optimization runs can be interrupted for several reasons. With the help of ``resume_optimization`` you can restart the optimization run from the last saved iteration. 97 | 98 | .. code-block:: python 99 | 100 | optimizer = Optimizer() 101 | optimizer.resume_optimization(json_path) 102 | 103 | where ``json_path`` is the path of the JSON file with the previous results. 104 | 105 | Continue the optimization 106 | ------------------------- 107 | 108 | Suppose that, after an optimization process, you want to perform three extra evaluations. 109 | You can do this using the method ``resume_optimization``. 110 | 111 | .. code-block:: python 112 | 113 | optimizer = Optimizer() 114 | optimizer.resume_optimization(json_path, extra_evaluations=3) 115 | 116 | where ``extra_evaluations`` (int, default 0) is the number of extra evaluations to perform. 
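If you just want to peek at what a previous run produced before resuming or extending it, keep in mind that the results file is plain JSON, so it can be read with the Python standard library alone. The exact fields stored in the file depend on the OCTIS version, so the sketch below makes no assumptions about the schema and simply lists whatever keys are present.

.. code-block:: python

    import json

    # json_path is the same results file you would pass to resume_optimization
    with open(json_path) as results_file:
        results = json.load(results_file)

    # list the available fields instead of assuming a fixed schema
    for key, value in results.items():
        print(key, type(value).__name__)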
117 | 118 | Inspect an extra-metric 119 | ------------------------- 120 | 121 | Suppose that, during the optimization process, you want to inspect the value of another metric. 122 | For example, suppose that you want to check the value of another coherence metric: 123 | 124 | .. code-block:: python 125 | 126 | metric_parameters = { 127 | 'texts': dataset.get_corpus(), 128 | 'topk': 10, 129 | 'measure': 'c_npmi' 130 | } 131 | npmi2 = Coherence(metric_parameters) 132 | 133 | You can add it through the ``extra_metrics`` parameter. 134 | 135 | .. code-block:: python 136 | 137 | optimization_result=optimizer.optimize(model, 138 | dataset, 139 | npmi, 140 | search_space, 141 | number_of_call=10, 142 | n_random_starts=3, 143 | extra_metrics=[npmi2] 144 | ) 145 | 146 | where ``extra_metrics`` (list, default None) is the list of extra metrics to inspect. 147 | 148 | Early stopping 149 | --------------- 150 | 151 | Suppose that you want to terminate the optimization process if there is no improvement after a certain number of iterations. You can apply an early stopping criterion during the optimization. 152 | 153 | 154 | .. code-block:: python 155 | 156 | optimization_result=optimizer.optimize(model, 157 | dataset, 158 | npmi, 159 | search_space, 160 | number_of_call=10, 161 | n_random_starts=3, 162 | early_stop=True, 163 | early_step=5, 164 | ) 165 | 166 | where ``early_step`` (int, default 5) is the number of function evaluations without improvement after which the optimization process is stopped. 167 | -------------------------------------------------------------------------------- /docs/readme.rst: -------------------------------------------------------------------------------- 1 | .. include:: ../README.rst 2 | -------------------------------------------------------------------------------- /docs/usage.rst: -------------------------------------------------------------------------------- 1 | ===== 2 | Usage 3 | ===== 4 | 5 | To use OCTIS in a project:: 6 | 7 | import octis 8 | -------------------------------------------------------------------------------- /examples/README.rst: -------------------------------------------------------------------------------- 1 | Examples and Tutorials 2 | ---------------------- 3 | 4 | Our Colab Tutorials: 5 | 6 | 7 | .. |colab1| image:: https://colab.research.google.com/assets/colab-badge.svg 8 | :target: https://colab.research.google.com/github/MIND-Lab/OCTIS/blob/master/examples/OCTIS_LDA_training_only.ipynb 9 | :alt: Open In Colab 10 | 11 | .. 
|colab2| image:: https://colab.research.google.com/assets/colab-badge.svg 12 | :target: https://colab.research.google.com/github/MIND-Lab/OCTIS/blob/master/examples/OCTIS_Optimizing_CTM.ipynb 13 | :alt: Open In Colab 14 | 15 | 16 | 17 | +--------------------------------------------------------------------------------+------------------+ 18 | | Name | Link | 19 | +================================================================================+==================+ 20 | | How to build a topic model and evaluate the results (LDA on 20Newsgroups) | |colab1| | 21 | +--------------------------------------------------------------------------------+------------------+ 22 | | How to optimize the hyperparameters of a neural topic model (CTM on M10) | |colab2| | 23 | +--------------------------------------------------------------------------------+------------------+ 24 | -------------------------------------------------------------------------------- /logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MIND-Lab/OCTIS/8f9a0ac5a12e3b72ecd77dcba4f181c77e4595f1/logo.png -------------------------------------------------------------------------------- /octis/__init__.py: -------------------------------------------------------------------------------- 1 | """Top-level package for OCTIS.""" 2 | from octis.dashboard import * 3 | from octis.configuration import * 4 | from octis.dataset import * 5 | from octis.evaluation_metrics import * 6 | from octis.models import * 7 | from octis.optimization import * 8 | from octis.preprocessing import * 9 | 10 | __author__ = """Silvia Terragni""" 11 | __email__ = 's.terragni4@campus.unimib.it' 12 | __version__ = '1.14.0' 13 | -------------------------------------------------------------------------------- /octis/cli.py: -------------------------------------------------------------------------------- 1 | """Console script for octis.""" 2 | import sys 3 | import click 4 | 5 | 6 | @click.command() 7 | def main(args=None): 8 | """Console script for octis.""" 9 | click.echo("Replace this message by putting your code into " 10 | "octis.cli.main") 11 | click.echo("See click documentation at https://click.palletsprojects.com/") 12 | return 0 13 | 14 | 15 | if __name__ == "__main__": 16 | sys.exit(main()) # pragma: no cover 17 | -------------------------------------------------------------------------------- /octis/configuration/__init__.py: -------------------------------------------------------------------------------- 1 | from . import defaults 2 | from . import citations -------------------------------------------------------------------------------- /octis/dashboard/__init__.py: -------------------------------------------------------------------------------- 1 | from . 
import * 2 | -------------------------------------------------------------------------------- /octis/dashboard/frameworkScanner.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | from pathlib import Path 4 | 5 | # get the path to the framework folder 6 | path = Path(os.path.dirname(os.path.realpath(__file__))) 7 | path = str(path.parent.parent) 8 | 9 | 10 | def scanDatasets(): 11 | """ 12 | Retrieves the name of each dataset present in the framework 13 | 14 | :return: list with name of each dataset as element 15 | :rtype: List 16 | """ 17 | 18 | datasets = os.listdir(str(os.path.join(path, "preprocessed_datasets"))) 19 | datasets.remove("README.rst") 20 | return datasets 21 | 22 | 23 | def getDatasetMetadata(datasetName): 24 | """ 25 | Retrieves the dataset metadata 26 | 27 | :param datasetName: name of the dataset 28 | :type datasetName: String 29 | 30 | :return: dict with metadata if dataset is found, False otherwise 31 | :rtype: Dict 32 | """ 33 | file = str(os.path.join( 34 | path, "preprocessed_datasets", datasetName, "corpus.tsv")) 35 | if os.path.isfile(file): 36 | f = open(file,) 37 | return {"total_documents": sum(1 for line in f)} 38 | return False 39 | 40 | 41 | def getDocPreview(datasetName, documentNumber): 42 | """ 43 | Retrieve the first 40 words of the selected document 44 | 45 | :param datasetName: name of the dataset in which the document is located 46 | (the dataset must be in the preprocessed_datasets folder) 47 | :type datasetName: String 48 | :param documentNumber: number of the document to retrieve 49 | :type documentNumber: Int 50 | 51 | :return: First 40 words in the document 52 | :rtype: String 53 | """ 54 | datasetPath = str(os.path.join( 55 | path, "preprocessed_datasets", datasetName, "corpus.tsv")) 56 | corpus = [] 57 | if os.path.isfile(datasetPath): 58 | with open(datasetPath, 'r') as corpus_file: 59 | for line in corpus_file: 60 | corpus.append(line.split("\t")[0]) 61 | splitted = corpus[documentNumber].split() 62 | if len(splitted) > 40: 63 | return " ".join(splitted[0:40]) 64 | return corpus[documentNumber] 65 | return False 66 | 67 | 68 | def getVocabulary(path): 69 | """ 70 | Retrieves the vocabulary from the vocabulary file of an ezxperiment 71 | 72 | :param path: path of the vocabulary 73 | :type path: String 74 | 75 | :return: a dictionary with id as a key and word as value, 76 | returns False if the vocabulary is not found 77 | :rtype: Dict 78 | """ 79 | if Path(path).is_file(): 80 | vocabulary_file = open(path,) 81 | return json.load(vocabulary_file) 82 | return False 83 | -------------------------------------------------------------------------------- /octis/dashboard/static/images/batch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MIND-Lab/OCTIS/8f9a0ac5a12e3b72ecd77dcba4f181c77e4595f1/octis/dashboard/static/images/batch.png -------------------------------------------------------------------------------- /octis/dashboard/static/images/batch_selected.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MIND-Lab/OCTIS/8f9a0ac5a12e3b72ecd77dcba4f181c77e4595f1/octis/dashboard/static/images/batch_selected.png -------------------------------------------------------------------------------- /octis/dashboard/static/images/dataset.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/MIND-Lab/OCTIS/8f9a0ac5a12e3b72ecd77dcba4f181c77e4595f1/octis/dashboard/static/images/dataset.png -------------------------------------------------------------------------------- /octis/dashboard/static/images/dataset_selected.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MIND-Lab/OCTIS/8f9a0ac5a12e3b72ecd77dcba4f181c77e4595f1/octis/dashboard/static/images/dataset_selected.png -------------------------------------------------------------------------------- /octis/dashboard/static/images/down-arrow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MIND-Lab/OCTIS/8f9a0ac5a12e3b72ecd77dcba4f181c77e4595f1/octis/dashboard/static/images/down-arrow.png -------------------------------------------------------------------------------- /octis/dashboard/static/images/favicon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MIND-Lab/OCTIS/8f9a0ac5a12e3b72ecd77dcba4f181c77e4595f1/octis/dashboard/static/images/favicon.png -------------------------------------------------------------------------------- /octis/dashboard/static/images/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MIND-Lab/OCTIS/8f9a0ac5a12e3b72ecd77dcba4f181c77e4595f1/octis/dashboard/static/images/logo.png -------------------------------------------------------------------------------- /octis/dashboard/static/images/model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MIND-Lab/OCTIS/8f9a0ac5a12e3b72ecd77dcba4f181c77e4595f1/octis/dashboard/static/images/model.png -------------------------------------------------------------------------------- /octis/dashboard/static/images/model_selected.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MIND-Lab/OCTIS/8f9a0ac5a12e3b72ecd77dcba4f181c77e4595f1/octis/dashboard/static/images/model_selected.png -------------------------------------------------------------------------------- /octis/dashboard/static/images/pic1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MIND-Lab/OCTIS/8f9a0ac5a12e3b72ecd77dcba4f181c77e4595f1/octis/dashboard/static/images/pic1.png -------------------------------------------------------------------------------- /octis/dashboard/static/images/up-arrow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MIND-Lab/OCTIS/8f9a0ac5a12e3b72ecd77dcba4f181c77e4595f1/octis/dashboard/static/images/up-arrow.png -------------------------------------------------------------------------------- /octis/dashboard/static/jquery.easing.min.js: -------------------------------------------------------------------------------- 1 | /* 2 | * jQuery Easing v1.3 - http://gsgd.co.uk/sandbox/jquery/easing/ 3 | * 4 | * Uses the built in easing capabilities added In jQuery 1.1 5 | * to offer multiple easing options 6 | * 7 | * TERMS OF USE - EASING EQUATIONS 8 | * 9 | * Open source under the BSD License. 10 | * 11 | * Copyright © 2001 Robert Penner 12 | * All rights reserved. 13 | * 14 | * TERMS OF USE - jQuery Easing 15 | * 16 | * Open source under the BSD License. 
17 | * 18 | * Copyright © 2008 George McGinley Smith 19 | * All rights reserved. 20 | * 21 | * Redistribution and use in source and binary forms, with or without modification, 22 | * are permitted provided that the following conditions are met: 23 | * 24 | * Redistributions of source code must retain the above copyright notice, this list of 25 | * conditions and the following disclaimer. 26 | * Redistributions in binary form must reproduce the above copyright notice, this list 27 | * of conditions and the following disclaimer in the documentation and/or other materials 28 | * provided with the distribution. 29 | * 30 | * Neither the name of the author nor the names of contributors may be used to endorse 31 | * or promote products derived from this software without specific prior written permission. 32 | * 33 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY 34 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 35 | * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 36 | * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 37 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE 38 | * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 39 | * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 40 | * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED 41 | * OF THE POSSIBILITY OF SUCH DAMAGE. 42 | * 43 | */ 44 | jQuery.easing.jswing=jQuery.easing.swing;jQuery.extend(jQuery.easing,{def:"easeOutQuad",swing:function(e,f,a,h,g){"use strict";return jQuery.easing[jQuery.easing.def](e,f,a,h,g)},easeInQuad:function(e,f,a,h,g){"use strict";return h*(f/=g)*f+a},easeOutQuad:function(e,f,a,h,g){"use strict";return -h*(f/=g)*(f-2)+a},easeInOutQuad:function(e,f,a,h,g){"use strict";if((f/=g/2)<1){return h/2*f*f+a}return -h/2*((--f)*(f-2)-1)+a},easeInCubic:function(e,f,a,h,g){"use strict";return h*(f/=g)*f*f+a},easeOutCubic:function(e,f,a,h,g){"use strict";return h*((f=f/g-1)*f*f+1)+a},easeInOutCubic:function(e,f,a,h,g){"use strict";if((f/=g/2)<1){return h/2*f*f*f+a}return h/2*((f-=2)*f*f+2)+a},easeInQuart:function(e,f,a,h,g){"use strict";return h*(f/=g)*f*f*f+a},easeOutQuart:function(e,f,a,h,g){"use strict";return -h*((f=f/g-1)*f*f*f-1)+a},easeInOutQuart:function(e,f,a,h,g){"use strict";if((f/=g/2)<1){return h/2*f*f*f*f+a}return -h/2*((f-=2)*f*f*f-2)+a},easeInQuint:function(e,f,a,h,g){"use strict";return h*(f/=g)*f*f*f*f+a},easeOutQuint:function(e,f,a,h,g){"use strict";return h*((f=f/g-1)*f*f*f*f+1)+a},easeInOutQuint:function(e,f,a,h,g){"use strict";if((f/=g/2)<1){return h/2*f*f*f*f*f+a}return h/2*((f-=2)*f*f*f*f+2)+a},easeInSine:function(e,f,a,h,g){"use strict";return -h*Math.cos(f/g*(Math.PI/2))+h+a},easeOutSine:function(e,f,a,h,g){"use strict";return h*Math.sin(f/g*(Math.PI/2))+a},easeInOutSine:function(e,f,a,h,g){"use strict";return -h/2*(Math.cos(Math.PI*f/g)-1)+a},easeInExpo:function(e,f,a,h,g){"use strict";return(f==0)?a:h*Math.pow(2,10*(f/g-1))+a},easeOutExpo:function(e,f,a,h,g){"use strict";return(f==g)?a+h:h*(-Math.pow(2,-10*f/g)+1)+a},easeInOutExpo:function(e,f,a,h,g){"use strict";if(f==0){return a}if(f==g){return a+h}if((f/=g/2)<1){return h/2*Math.pow(2,10*(f-1))+a}return h/2*(-Math.pow(2,-10*--f)+2)+a},easeInCirc:function(e,f,a,h,g){"use strict";return 
-h*(Math.sqrt(1-(f/=g)*f)-1)+a},easeOutCirc:function(e,f,a,h,g){"use strict";return h*Math.sqrt(1-(f=f/g-1)*f)+a},easeInOutCirc:function(e,f,a,h,g){"use strict";if((f/=g/2)<1){return -h/2*(Math.sqrt(1-f*f)-1)+a}return h/2*(Math.sqrt(1-(f-=2)*f)+1)+a},easeInElastic:function(f,h,e,l,k){"use strict";var i=1.70158;var j=0;var g=l;if(h==0){return e}if((h/=k)==1){return e+l}if(!j){j=k*0.3}if(g 2 | 3 | 4 | 5 | 6 | 7 | 8 | OCTIS - Home 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 |
What is OCTIS?

OCTIS is an open-source evaluation and optimization framework that allows you to optimize the hyperparameters of state-of-the-art topic models and compare their performance with respect to several evaluation metrics on several datasets.

Why optimize the hyperparameters of a topic model?

Topic models are usually controlled by hyperparameters that have a huge impact on the performance of the model itself. The values of these hyperparameters depend on your task and on your dataset. To find an optimal hyperparameter configuration of a topic model, we can run an optimization algorithm (in our case, Bayesian Optimization) that automatically and efficiently discovers a good configuration without much manual effort. Just select the hyperparameters of the model you want to optimize, your objective evaluation metric and the number of iterations of the Bayesian Optimization algorithm, and OCTIS will do the rest of the job! A minimal code sketch of this workflow is shown at the end of this overview.

Main Features

OCTIS allows you to:

  • define and start your optimization experiments by selecting a dataset, a model, its hyperparameters and an evaluation metric to optimize;
  • compare the optimization progress of your designed experiments;
  • easily inspect the output of each topic model and the summary of the optimization;
  • manage the queue of the designed experiments, by changing their priority or pausing/restarting their execution.

Open-sourceness

OCTIS has been realized for research purposes and it will be freely released to the NLP community. We collected open-source implementations of topic models, we used open-source libraries and freely available data. NOTE: We do not own the data. We just downloaded and prepared public datasets. We do not host or distribute these datasets, vouch for their quality or fairness, or claim that you have a license to use them. If you're a dataset owner and wish to update any part of a dataset, or do not want your dataset to be included in this library, please get in touch through a GitHub issue.
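For readers who prefer code to prose, the optimization workflow described above can be sketched with the OCTIS Python API (as documented in the project README). The dataset name, model, hyperparameter ranges, iteration budget and output paths below are illustrative assumptions, not recommendations:

```
from octis.dataset.dataset import Dataset
from octis.models.LDA import LDA
from octis.evaluation_metrics.coherence_metrics import Coherence
from octis.optimization.optimizer import Optimizer
from skopt.space.space import Real

# Load one of the preprocessed datasets shipped with the framework
dataset = Dataset()
dataset.fetch_dataset("20NewsGroup")

# Model to tune and metric to use as the optimization objective
model = LDA(num_topics=20)
coherence = Coherence(texts=dataset.get_corpus())

# Hyperparameters to optimize and their search space (illustrative ranges)
search_space = {
    "alpha": Real(low=0.001, high=5.0),
    "eta": Real(low=0.001, high=5.0),
}

# Run Bayesian Optimization for a fixed budget of evaluations
optimizer = Optimizer()
result = optimizer.optimize(
    model, dataset, coherence, search_space,
    number_of_call=30,   # iterations of Bayesian Optimization
    model_runs=3,        # model re-trainings per tested configuration
    save_path="results/lda_coherence/")

result.save_to_csv("results.csv")
```

Each candidate configuration is evaluated by retraining the model `model_runs` times, which reduces the influence of run-to-run variance due to random initialization on the objective.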
96 | 97 | 98 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | -------------------------------------------------------------------------------- /octis/dashboard/templates/serverClosed.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | OCTIS - Server Closed 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 23 | 24 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 48 | 49 | 50 | 51 | -------------------------------------------------------------------------------- /octis/dataset/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MIND-Lab/OCTIS/8f9a0ac5a12e3b72ecd77dcba4f181c77e4595f1/octis/dataset/__init__.py -------------------------------------------------------------------------------- /octis/dataset/downloader.py: -------------------------------------------------------------------------------- 1 | from os import environ, makedirs 2 | from os.path import exists, expanduser, join, splitext 3 | import pickle 4 | import sys 5 | import codecs 6 | import shutil 7 | import requests 8 | import json 9 | 10 | """ 11 | This code is highly inspired by the scikit-learn strategy to download datasets 12 | """ 13 | 14 | 15 | def get_data_home(data_home=None): 16 | """Return the path of the octis data dir. 17 | By default the data dir is set to a folder named 'octis_data' in the 18 | user home folder. 19 | Alternatively, it can be set by the 'OCTIS_DATA' environment 20 | variable or programmatically by giving an explicit folder path. The '~' 21 | symbol is expanded to the user home folder. 22 | If the folder does not already exist, it is automatically created. 23 | Parameters 24 | ---------- 25 | data_home : str | None 26 | The path to octis data dir. 27 | """ 28 | if data_home is None: 29 | data_home = environ.get('OCTIS_DATA', join('~', 'octis_data')) 30 | data_home = expanduser(data_home) 31 | if not exists(data_home): 32 | makedirs(data_home) 33 | return data_home 34 | 35 | 36 | def _pkl_filepath(*args, **kwargs): 37 | """Ensure different filenames for Python 2 and Python 3 pickles 38 | An object pickled under Python 3 cannot be loaded under Python 2. An object 39 | pickled under Python 2 can sometimes not be loaded correctly under Python 3 40 | because some Python 2 strings are decoded as Python 3 strings which can be 41 | problematic for objects that use Python 2 strings as byte buffers for 42 | numerical data instead of "real" strings. 43 | Therefore, dataset loaders in octis use different files for pickles 44 | manages by Python 2 and Python 3 in the same OCTIS_DATA folder so as 45 | to avoid conflicts. 46 | args[-1] is expected to be the ".pkl" filename. 
Under Python 3, a suffix is 47 | inserted before the extension to s 48 | _pkl_filepath('/path/to/folder', 'filename.pkl') returns: 49 | - /path/to/folder/filename.pkl under Python 2 50 | - /path/to/folder/filename_py3.pkl under Python 3+ 51 | """ 52 | py3_suffix = kwargs.get("py3_suffix", "_py3") 53 | basename, ext = splitext(args[-1]) 54 | if sys.version_info[0] >= 3: 55 | basename += py3_suffix 56 | new_args = args[:-1] + (basename + ext,) 57 | return join(*new_args) 58 | 59 | 60 | def download_dataset(dataset_name, target_dir, cache_path): 61 | """Download the 20 newsgroups data and stored it as a zipped pickle.""" 62 | corpus_path = join(target_dir, "corpus.tsv") 63 | metadata_path = join(target_dir, "metadata.json") 64 | vocabulary_path = join(target_dir, "vocabulary.txt") 65 | 66 | if not exists(target_dir): 67 | makedirs(target_dir) 68 | 69 | dataset_url = "https://raw.githubusercontent.com/MIND-Lab/OCTIS/master/preprocessed_datasets/" + dataset_name 70 | 71 | corpus = requests.get(dataset_url + "/corpus.tsv") 72 | metadata = requests.get(dataset_url + "/metadata.json") 73 | vocabulary = requests.get(dataset_url + "/vocabulary.txt") 74 | 75 | if corpus and metadata and vocabulary: 76 | with open(corpus_path, 'w', encoding='utf8') as f: 77 | f.write(corpus.text) 78 | with open(metadata_path, 'w', encoding='utf8') as f: 79 | f.write(metadata.text) 80 | with open(vocabulary_path, 'w', encoding='utf8') as f: 81 | f.write(vocabulary.text) 82 | 83 | only_docs, labels, partition = [], [], [] 84 | for d in corpus.text.split("\n"): 85 | if len(d.strip()) > 0: 86 | dsplit = d.strip().split("\t") 87 | only_docs.append(dsplit[0]) 88 | if len(dsplit) > 1: 89 | partition.append(dsplit[1]) 90 | if len(dsplit) > 2: 91 | labels.append(dsplit[2]) 92 | 93 | vocab = [word for word in vocabulary.text.split("\n") if len(word) > 0] 94 | metadata = json.loads(metadata.text) 95 | metadata["info"] = {} 96 | 97 | metadata["info"]["name"] = dataset_name 98 | 99 | # Store a zipped pickle 100 | cache = dict(corpus=only_docs, labels=labels, partitions=partition, metadata=metadata, 101 | vocabulary=vocab) 102 | compressed_content = codecs.encode(pickle.dumps(cache), 'zlib_codec') 103 | with open(cache_path, 'wb') as f: 104 | f.write(compressed_content) 105 | 106 | shutil.rmtree(target_dir) 107 | return cache 108 | else: 109 | raise Exception(dataset_name + ' dataset not found') 110 | 111 | 112 | -------------------------------------------------------------------------------- /octis/evaluation_metrics/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MIND-Lab/OCTIS/8f9a0ac5a12e3b72ecd77dcba4f181c77e4595f1/octis/evaluation_metrics/__init__.py -------------------------------------------------------------------------------- /octis/evaluation_metrics/metrics.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | 3 | 4 | class AbstractMetric(ABC): 5 | """ 6 | Class structure of a generic metric implementation 7 | """ 8 | 9 | def __init__(self): 10 | """ 11 | init metric 12 | """ 13 | pass 14 | 15 | @abstractmethod 16 | def score(self, model_output): 17 | """ 18 | Retrieves the score of the metric 19 | 20 | :param model_output: output of a topic model in the form of a dictionary. 
See model for details on 21 | the model output 22 | :type model_output: dict 23 | """ 24 | pass 25 | 26 | def get_params(self): 27 | return [att for att in dir(self) if not att.startswith("_") and att != 'info' and att != 'score' and 28 | att != 'get_params'] 29 | -------------------------------------------------------------------------------- /octis/evaluation_metrics/topic_significance_metrics.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import octis.configuration.citations as citations 3 | from octis.evaluation_metrics.metrics import AbstractMetric 4 | 5 | 6 | def _KL(P, Q): 7 | """ 8 | Perform Kullback-Leibler divergence 9 | 10 | Parameters 11 | ---------- 12 | P : distribution P 13 | Q : distribution Q 14 | 15 | Returns 16 | ------- 17 | divergence : divergence from Q to P 18 | """ 19 | # add epsilon to grant absolute continuity 20 | epsilon = 0.00001 21 | P = P+epsilon 22 | Q = Q+epsilon 23 | 24 | divergence = np.sum(P*np.log(P/Q)) 25 | return divergence 26 | 27 | 28 | def _replace_zeros_lines(arr): 29 | zero_lines = np.where(~arr.any(axis=1))[0] 30 | val = 1.0 / len(arr[0]) 31 | vett = np.full(len(arr[0]), val) 32 | for zero_line in zero_lines: 33 | arr[zero_line] = vett.copy() 34 | return arr 35 | 36 | 37 | class KL_uniform(AbstractMetric): 38 | def __init__(self): 39 | """ 40 | Initialize metric 41 | """ 42 | super().__init__() 43 | 44 | def info(self): 45 | return { 46 | "citation": citations.em_topic_significance, 47 | "name": "KL_Uniform, Uniform distribution over words" 48 | } 49 | 50 | def score(self, model_output, per_topic=False): 51 | """ 52 | Retrieves the score of the metric 53 | 54 | Parameters 55 | ---------- 56 | model_output : dictionary, output of the model 57 | 'topic-word-matrix' required 58 | 59 | per_topic: if True, it returns the score for each topic 60 | 61 | Returns 62 | ------- 63 | result : score 64 | 65 | """ 66 | phi = _replace_zeros_lines(model_output["topic-word-matrix"].astype(float)) 67 | 68 | # make uniform distribution 69 | val = 1.0 / len(phi[0]) 70 | unif_distr = np.full(len(phi[0]), val) 71 | 72 | divergences = [] 73 | for topic in range(len(phi)): 74 | 75 | # normalize phi, sum up to 1 76 | P = phi[topic] / phi[topic].sum() 77 | 78 | divergence = _KL(P, unif_distr) 79 | divergences.append(divergence) 80 | 81 | # KL-uniform = mean of the divergences 82 | # between topic-word distributions and uniform distribution 83 | if per_topic: 84 | return divergences 85 | else: 86 | result = np.array(divergences).mean() 87 | return result 88 | 89 | 90 | class KL_vacuous(AbstractMetric): 91 | def __init__(self): 92 | """ 93 | Initialize metric 94 | """ 95 | super().__init__() 96 | 97 | def info(self): 98 | return { 99 | "citation": citations.em_topic_significance, 100 | "name": "KL_Vacuous, Vacuous semantic distribution" 101 | } 102 | 103 | def score(self, model_output): 104 | """ 105 | Retrieves the score of the metric 106 | 107 | Parameters 108 | ---------- 109 | model_output : dictionary, output of the model 110 | 'topic-word-matrix' required 111 | 'topic-document-matrix' required 112 | 113 | Returns 114 | ------- 115 | result : score 116 | """ 117 | phi = _replace_zeros_lines(model_output["topic-word-matrix"].astype(float)) 118 | theta = _replace_zeros_lines(model_output["topic-document-matrix"].astype(float)) 119 | 120 | vacuous = np.zeros(phi.shape[1]) 121 | for topic in range(len(theta)): 122 | 123 | # get probability of the topic in the corpus 124 | p_topic = 
theta[topic].sum()/len(theta[0]) 125 | 126 | # get probability of the words: 127 | # P(Wi | vacuous_dist) = P(Wi | topic)*P(topic) 128 | vacuous += phi[topic]*p_topic 129 | 130 | divergences = [] 131 | for topic in range(len(phi)): 132 | 133 | # normalize phi, sum up to 1 134 | P = phi[topic] / phi[topic].sum() 135 | 136 | divergence = _KL(P, vacuous) 137 | divergences.append(divergence) 138 | 139 | # KL-vacuous = mean of the divergences between topic-word distributions and vacuous distribution 140 | result = np.array(divergences).mean() 141 | return result 142 | 143 | 144 | class KL_background(AbstractMetric): 145 | def __init__(self): 146 | """ 147 | Initialize metric 148 | """ 149 | super().__init__() 150 | 151 | def info(self): 152 | return { 153 | "citation": citations.em_topic_significance, 154 | "name": "KL_Background, Background distribution over documents" 155 | } 156 | 157 | def score(self, model_output): 158 | """ 159 | Retrieves the score of the metric 160 | 161 | Parameters 162 | ---------- 163 | model_output : dictionary, output of the model 164 | 'topic-document-matrix' required 165 | 166 | Returns 167 | ------- 168 | result : score 169 | """ 170 | theta = _replace_zeros_lines(model_output["topic-document-matrix"].astype(float)) 171 | 172 | # make uniform distribution 173 | val = 1.0 / len(theta[0]) 174 | unif_distr = np.full(len(theta[0]), val) 175 | 176 | divergences = [] 177 | for topic in range(len(theta)): 178 | # normalize theta, sum up to 1 179 | P = theta[topic] / theta[topic].sum() 180 | 181 | divergence = _KL(P, unif_distr) 182 | divergences.append(divergence) 183 | 184 | # KL-background = mean of the divergences 185 | # between topic-doc distributions and uniform distribution 186 | result = np.array(divergences).mean() 187 | if np.isnan(result): 188 | return 0 189 | return result 190 | -------------------------------------------------------------------------------- /octis/models/DETM_model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MIND-Lab/OCTIS/8f9a0ac5a12e3b72ecd77dcba4f181c77e4595f1/octis/models/DETM_model/__init__.py -------------------------------------------------------------------------------- /octis/models/ETM_model/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Adji B. Dieng, Francisco J. R. Ruiz, David M. Blei 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /octis/models/ETM_model/README.md: -------------------------------------------------------------------------------- 1 | # ETM 2 | 3 | This is code that accompanies the paper titled "Topic Modeling in Embedding Spaces" by Adji B. Dieng, Francisco J. R. Ruiz, and David M. Blei. (Arxiv link: https://arxiv.org/abs/1907.04907) 4 | 5 | ETM defines words and topics in the same embedding space. The likelihood of a word under ETM is a Categorical whose natural parameter is given by the dot product between the word embedding and its assigned topic's embedding. ETM is a document model that learns interpretable topics and word embeddings and is robust to large vocabularies that include rare words and stop words. 6 | 7 | ## Dependencies 8 | 9 | + python 3.6.7 10 | + pytorch 1.1.0 11 | 12 | ## Citation 13 | 14 | ``` 15 | @article{dieng2019topic, 16 | title={Topic modeling in embedding spaces}, 17 | author={Dieng, Adji B and Ruiz, Francisco J R and Blei, David M}, 18 | journal={arXiv preprint arXiv:1907.04907}, 19 | year={2019} 20 | } 21 | ``` 22 | 23 | -------------------------------------------------------------------------------- /octis/models/ETM_model/__init__.py: -------------------------------------------------------------------------------- 1 | """Init package""" 2 | -------------------------------------------------------------------------------- /octis/models/ETM_model/data.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | def get_batch(tokens, counts, ind, vocab_size, device): 5 | """fetch input data by batch.""" 6 | batch_size = len(ind) 7 | data_batch = np.zeros((batch_size, vocab_size)) 8 | for i, doc_id in enumerate(ind): 9 | doc = tokens[doc_id] 10 | count = counts[doc_id] 11 | if doc_id != -1: 12 | for j, word in enumerate(doc): 13 | data_batch[i, word] = count[j] 14 | data_batch = torch.from_numpy(data_batch).float().to(device) 15 | return data_batch 16 | -------------------------------------------------------------------------------- /octis/models/ETM_model/data/20ng/bow_tr_counts.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MIND-Lab/OCTIS/8f9a0ac5a12e3b72ecd77dcba4f181c77e4595f1/octis/models/ETM_model/data/20ng/bow_tr_counts.mat -------------------------------------------------------------------------------- /octis/models/ETM_model/data/20ng/bow_tr_tokens.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MIND-Lab/OCTIS/8f9a0ac5a12e3b72ecd77dcba4f181c77e4595f1/octis/models/ETM_model/data/20ng/bow_tr_tokens.mat -------------------------------------------------------------------------------- /octis/models/ETM_model/data/20ng/bow_ts_counts.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MIND-Lab/OCTIS/8f9a0ac5a12e3b72ecd77dcba4f181c77e4595f1/octis/models/ETM_model/data/20ng/bow_ts_counts.mat -------------------------------------------------------------------------------- /octis/models/ETM_model/data/20ng/bow_ts_h1_counts.mat: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/MIND-Lab/OCTIS/8f9a0ac5a12e3b72ecd77dcba4f181c77e4595f1/octis/models/ETM_model/data/20ng/bow_ts_h1_counts.mat -------------------------------------------------------------------------------- /octis/models/ETM_model/data/20ng/bow_ts_h1_tokens.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MIND-Lab/OCTIS/8f9a0ac5a12e3b72ecd77dcba4f181c77e4595f1/octis/models/ETM_model/data/20ng/bow_ts_h1_tokens.mat -------------------------------------------------------------------------------- /octis/models/ETM_model/data/20ng/bow_ts_h2_counts.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MIND-Lab/OCTIS/8f9a0ac5a12e3b72ecd77dcba4f181c77e4595f1/octis/models/ETM_model/data/20ng/bow_ts_h2_counts.mat -------------------------------------------------------------------------------- /octis/models/ETM_model/data/20ng/bow_ts_h2_tokens.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MIND-Lab/OCTIS/8f9a0ac5a12e3b72ecd77dcba4f181c77e4595f1/octis/models/ETM_model/data/20ng/bow_ts_h2_tokens.mat -------------------------------------------------------------------------------- /octis/models/ETM_model/data/20ng/bow_ts_tokens.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MIND-Lab/OCTIS/8f9a0ac5a12e3b72ecd77dcba4f181c77e4595f1/octis/models/ETM_model/data/20ng/bow_ts_tokens.mat -------------------------------------------------------------------------------- /octis/models/ETM_model/data/20ng/bow_va_counts.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MIND-Lab/OCTIS/8f9a0ac5a12e3b72ecd77dcba4f181c77e4595f1/octis/models/ETM_model/data/20ng/bow_va_counts.mat -------------------------------------------------------------------------------- /octis/models/ETM_model/data/20ng/bow_va_tokens.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MIND-Lab/OCTIS/8f9a0ac5a12e3b72ecd77dcba4f181c77e4595f1/octis/models/ETM_model/data/20ng/bow_va_tokens.mat -------------------------------------------------------------------------------- /octis/models/ETM_model/data/20ng/vocab.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MIND-Lab/OCTIS/8f9a0ac5a12e3b72ecd77dcba4f181c77e4595f1/octis/models/ETM_model/data/20ng/vocab.pkl -------------------------------------------------------------------------------- /octis/models/ETM_model/etm.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | import numpy as np 4 | import math 5 | 6 | from torch import nn 7 | 8 | 9 | class ETM(nn.Module): 10 | def __init__(self, num_topics, vocab_size, t_hidden_size, rho_size, emb_size, 11 | theta_act, embeddings=None, train_embeddings=True, enc_drop=0.5): 12 | super(ETM, self).__init__() 13 | self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 14 | ## define hyperparameters 15 | self.num_topics = num_topics 16 | self.vocab_size = vocab_size 17 | self.t_hidden_size = t_hidden_size 18 | self.rho_size = rho_size 19 | self.enc_drop = enc_drop 20 | self.emb_size = 
emb_size 21 | self.t_drop = nn.Dropout(enc_drop) 22 | 23 | self.theta_act = self.get_activation(theta_act) 24 | 25 | ## define the word embedding matrix \rho 26 | if train_embeddings: 27 | self.rho = nn.Linear(rho_size, vocab_size, bias=False) 28 | else: 29 | num_embeddings, emb_size = embeddings.size() 30 | rho = nn.Embedding(num_embeddings, emb_size) 31 | self.rho = embeddings.clone().float().to(self.device) 32 | 33 | ## define the matrix containing the topic embeddings 34 | self.alphas = nn.Linear(rho_size, num_topics, bias=False)#nn.Parameter(torch.randn(rho_size, num_topics)) 35 | 36 | ## define variational distribution for \theta_{1:D} via amortizartion 37 | self.q_theta = nn.Sequential( 38 | nn.Linear(vocab_size, t_hidden_size), self.theta_act, 39 | nn.Linear(t_hidden_size, t_hidden_size), self.theta_act, 40 | ) 41 | self.mu_q_theta = nn.Linear(t_hidden_size, num_topics, bias=True) 42 | self.logsigma_q_theta = nn.Linear(t_hidden_size, num_topics, bias=True) 43 | 44 | def get_activation(self, act): 45 | if act == 'tanh': 46 | act = nn.Tanh() 47 | elif act == 'relu': 48 | act = nn.ReLU() 49 | elif act == 'softplus': 50 | act = nn.Softplus() 51 | elif act == 'sigmoid': 52 | act = nn.Sigmoid() 53 | elif act == 'rrelu': 54 | act = nn.RReLU() 55 | elif act == 'leakyrelu': 56 | act = nn.LeakyReLU() 57 | elif act == 'elu': 58 | act = nn.ELU() 59 | elif act == 'selu': 60 | act = nn.SELU() 61 | elif act == 'glu': 62 | act = nn.GLU() #error using glu 63 | else: 64 | print('Defaulting to tanh activations...') 65 | act = nn.Tanh() 66 | return act 67 | 68 | def reparameterize(self, mu, logvar): 69 | """Returns a sample from a Gaussian distribution via reparameterization. 70 | """ 71 | if self.training: 72 | std = torch.exp(0.5 * logvar) 73 | eps = torch.randn_like(std) 74 | return eps.mul_(std).add_(mu) 75 | else: 76 | return mu 77 | 78 | def encode(self, bows): 79 | """Returns paramters of the variational distribution for \theta. 
80 | 81 | input: bows 82 | batch of bag-of-words...tensor of shape bsz x V 83 | output: mu_theta, log_sigma_theta 84 | """ 85 | q_theta = self.q_theta(bows) 86 | if self.enc_drop > 0: 87 | q_theta = self.t_drop(q_theta) 88 | mu_theta = self.mu_q_theta(q_theta) 89 | logsigma_theta = self.logsigma_q_theta(q_theta) 90 | kl_theta = -0.5 * torch.sum(1 + logsigma_theta - mu_theta.pow(2) - logsigma_theta.exp(), dim=-1).mean() 91 | return mu_theta, logsigma_theta, kl_theta 92 | 93 | def get_beta(self): 94 | try: 95 | logit = self.alphas(self.rho.weight) # torch.mm(self.rho, self.alphas) 96 | except: 97 | logit = self.alphas(self.rho) 98 | beta = F.softmax(logit, dim=0).transpose(1, 0) ## softmax over vocab dimension 99 | return beta 100 | 101 | def get_theta(self, normalized_bows): 102 | mu_theta, logsigma_theta, kld_theta = self.encode(normalized_bows) 103 | z = self.reparameterize(mu_theta, logsigma_theta) 104 | theta = F.softmax(z, dim=-1) 105 | return theta, kld_theta 106 | 107 | def decode(self, theta, beta): 108 | res = torch.mm(theta, beta) 109 | preds = torch.log(res+1e-6) 110 | return preds 111 | 112 | def forward(self, bows, normalized_bows, theta=None, aggregate=True): 113 | ## get \theta 114 | if theta is None: 115 | theta, kld_theta = self.get_theta(normalized_bows) 116 | else: 117 | kld_theta = None 118 | 119 | ## get \beta 120 | beta = self.get_beta() 121 | 122 | ## get prediction loss 123 | preds = self.decode(theta, beta) 124 | recon_loss = -(preds * bows).sum(1) 125 | if aggregate: 126 | recon_loss = recon_loss.mean() 127 | return recon_loss, kld_theta 128 | 129 | -------------------------------------------------------------------------------- /octis/models/ETM_model/skipgram.py: -------------------------------------------------------------------------------- 1 | import gensim 2 | import pickle 3 | import os 4 | import numpy as np 5 | import argparse 6 | 7 | parser = argparse.ArgumentParser(description='The Embedded Topic Model') 8 | 9 | ### data and file related arguments 10 | parser.add_argument('--data_file', type=str, default='', help='a .txt file containing the corpus') 11 | parser.add_argument('--emb_file', type=str, default='embeddings.txt', help='file to save the word embeddings') 12 | parser.add_argument('--dim_rho', type=int, default=300, help='dimensionality of the word embeddings') 13 | parser.add_argument('--min_count', type=int, default=2, help='minimum term frequency (to define the vocabulary)') 14 | parser.add_argument('--sg', type=int, default=1, help='whether to use skip-gram') 15 | parser.add_argument('--workers', type=int, default=25, help='number of CPU cores') 16 | parser.add_argument('--negative_samples', type=int, default=10, help='number of negative samples') 17 | parser.add_argument('--window_size', type=int, default=4, help='window size to determine context') 18 | parser.add_argument('--iters', type=int, default=50, help='number of iterationst') 19 | 20 | args = parser.parse_args() 21 | 22 | # Class for a memory-friendly iterator over the dataset 23 | class MySentences(object): 24 | def __init__(self, filename): 25 | self.filename = filename 26 | 27 | def __iter__(self): 28 | for line in open(self.filename): 29 | yield line.split() 30 | 31 | # Gensim code to obtain the embeddings 32 | sentences = MySentences(args.data_file) # a memory-friendly iterator 33 | model = gensim.models.Word2Vec(sentences, min_count=args.min_count, sg=args.sg, size=args.dim_rho, 34 | iter=args.iters, workers=args.workers, negative=args.negative_samples, window=args.window_size) 
35 | 36 | # Write the embeddings to a file 37 | with open(args.emb_file, 'w') as f: 38 | for v in list(model.wv.vocab): 39 | vec = list(model.wv.__getitem__(v)) 40 | f.write(v + ' ') 41 | vec_str = ['%.9f' % val for val in vec] 42 | vec_str = " ".join(vec_str) 43 | f.write(vec_str + '\n') 44 | -------------------------------------------------------------------------------- /octis/models/ETM_model/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def get_topic_diversity(beta, topk): 5 | num_topics = beta.shape[0] 6 | list_w = np.zeros((num_topics, topk)) 7 | for k in range(num_topics): 8 | idx = beta[k, :].argsort()[-topk:][::-1] 9 | list_w[k, :] = idx 10 | n_unique = len(np.unique(list_w)) 11 | TD = n_unique / (topk * num_topics) 12 | print('Topic diveristy is: {}'.format(TD)) 13 | 14 | 15 | def get_document_frequency(data, wi, wj=None): 16 | if wj is None: 17 | D_wi = 0 18 | for l in range(len(data)): 19 | doc = data[l].squeeze(0) 20 | if len(doc) == 1: 21 | continue 22 | else: 23 | doc = doc.squeeze() 24 | if wi in doc: 25 | D_wi += 1 26 | return D_wi 27 | D_wj = 0 28 | D_wi_wj = 0 29 | for l in range(len(data)): 30 | doc = data[l].squeeze(0) 31 | if len(doc) == 1: 32 | doc = [doc.squeeze()] 33 | else: 34 | doc = doc.squeeze() 35 | if wj in doc: 36 | D_wj += 1 37 | if wi in doc: 38 | D_wi_wj += 1 39 | return D_wj, D_wi_wj 40 | 41 | 42 | def get_topic_coherence(beta, data, vocab): 43 | D = len(data) # number of docs...data is list of documents 44 | print('D: ', D) 45 | TC = [] 46 | num_topics = len(beta) 47 | for k in range(num_topics): 48 | print('k: {}/{}'.format(k, num_topics)) 49 | top_10 = list(beta[k].argsort()[-11:][::-1]) 50 | top_words = [vocab[a] for a in top_10] 51 | TC_k = 0 52 | counter = 0 53 | for i, word in enumerate(top_10): 54 | # get D(w_i) 55 | D_wi = get_document_frequency(data, word) 56 | j = i + 1 57 | tmp = 0 58 | while j < len(top_10) and j > i: 59 | # get D(w_j) and D(w_i, w_j) 60 | D_wj, D_wi_wj = get_document_frequency(data, word, top_10[j]) 61 | # get f(w_i, w_j) 62 | if D_wi_wj == 0: 63 | f_wi_wj = -1 64 | else: 65 | f_wi_wj = -1 + ( 66 | np.log(D_wi) + np.log(D_wj) - 2.0 * np.log(D)) / ( 67 | np.log(D_wi_wj) - np.log(D)) 68 | # update tmp: 69 | tmp += f_wi_wj 70 | j += 1 71 | counter += 1 72 | # update TC_k 73 | TC_k += tmp 74 | TC.append(TC_k) 75 | print('counter: ', counter) 76 | print('num topics: ', len(TC)) 77 | TC = np.mean(TC) / counter 78 | print('Topic coherence is: {}'.format(TC)) 79 | 80 | 81 | def nearest_neighbors(word, embeddings, vocab): 82 | vectors = embeddings.data.cpu().numpy() 83 | index = vocab.index(word) 84 | print('vectors: ', vectors.shape) 85 | query = vectors[index] 86 | print('query: ', query.shape) 87 | ranks = vectors.dot(query).squeeze() 88 | denom = query.T.dot(query).squeeze() 89 | denom = denom * np.sum(vectors**2, 1) 90 | denom = np.sqrt(denom) 91 | ranks = ranks / denom 92 | mostSimilar = [] 93 | [mostSimilar.append(idx) for idx in ranks.argsort()[::-1]] 94 | nearest_neighbors = mostSimilar[:20] 95 | nearest_neighbors = [vocab[comp] for comp in nearest_neighbors] 96 | return nearest_neighbors 97 | -------------------------------------------------------------------------------- /octis/models/LDA_tomopy.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tomotopy as tp 3 | 4 | from octis.models.model import AbstractModel 5 | 6 | """ 7 | Experimental integration of tomotopy's 
implementation of LDA 8 | """ 9 | 10 | 11 | class LDA_tomopy(AbstractModel): 12 | 13 | def __init__( 14 | self, num_topics=100, alpha=0.1, eta=0.01, max_iters=50, 15 | use_partitions=True): 16 | super().__init__() 17 | self.hyperparameters = dict() 18 | self.hyperparameters['num_topics'] = num_topics 19 | self.hyperparameters['alpha'] = alpha 20 | self.hyperparameters['eta'] = eta 21 | self.hyperparameters['max_iters'] = max_iters 22 | self.use_partitions = use_partitions 23 | 24 | def train_model(self, dataset, hyperparameters=None, top_words=10): 25 | if hyperparameters is None: 26 | hyperparameters = dict() 27 | self.set_default_hyperparameters(hyperparameters) 28 | if self.use_partitions: 29 | x_train, x_test = dataset.get_partitioned_corpus( 30 | use_validation=False) 31 | else: 32 | x_train = dataset.get_corpus() 33 | x_test = None 34 | 35 | lda = tp.LDAModel( 36 | k=self.hyperparameters['num_topics'], 37 | alpha=self.hyperparameters['alpha'], 38 | eta=self.hyperparameters['eta']) 39 | 40 | for i in x_train: 41 | lda.add_doc(i) 42 | 43 | lda.train(self.hyperparameters['max_iters']) 44 | 45 | topic_word_matrix = np.stack( 46 | [lda.get_topic_word_dist(k, normalize=True) 47 | for k in range(lda.k)]) # topic word distribution matrix 48 | topic_document_matrix = np.stack( 49 | [doc.get_topic_dist() 50 | for doc in lda.docs]) # topic document distribution matrix 51 | 52 | additional_words = [ 53 | item for item in dataset.get_vocabulary() 54 | if item not in list(lda.used_vocabs)] 55 | num_additional_words = len(additional_words) 56 | if num_additional_words > 0: 57 | topic_word_matrix = np.concatenate( 58 | (topic_word_matrix, 59 | np.zeros((topic_word_matrix.shape[0], num_additional_words), 60 | dtype=float)), axis=1) 61 | final_vocab = list(lda.used_vocabs) + additional_words 62 | vocab2id = {w: i for i, w in enumerate(final_vocab)} 63 | 64 | sorted_indexes = [ 65 | vocab2id[w] for i, w in enumerate(dataset.get_vocabulary())] 66 | topic_word_matrix = topic_word_matrix[:, sorted_indexes] 67 | 68 | # topics extraction 69 | topic_w = [] 70 | for k in range(lda.k): 71 | topics = [] 72 | for word in lda.get_topic_words(k): 73 | topics.append(word[0]) 74 | topic_w.append(topics) 75 | 76 | # Output model on the Train Set 77 | info = {} 78 | info['topics'] = topic_w 79 | info['topic-word-matrix'] = topic_word_matrix 80 | info['topic-document-matrix'] = topic_document_matrix.T 81 | 82 | # Inference on the test set 83 | if x_test is not None: 84 | doc_inst = [lda.make_doc(i) for i in x_test] 85 | topic_dist, _ = lda.infer(doc_inst) # topic document distribution 86 | info['test-topic-document-matrix'] = np.asarray(topic_dist).T 87 | 88 | return info 89 | 90 | def partitioning(self, use_partitions=False): 91 | self.use_partitions = use_partitions 92 | 93 | def set_default_hyperparameters(self, hyperparameters): 94 | for k in hyperparameters.keys(): 95 | if k in self.hyperparameters.keys(): 96 | self.hyperparameters[k] = hyperparameters.get( 97 | k, self.hyperparameters[k]) 98 | -------------------------------------------------------------------------------- /octis/models/NeuralLDA.py: -------------------------------------------------------------------------------- 1 | from octis.models.pytorchavitm.AVITM import AVITM 2 | 3 | 4 | class NeuralLDA(AVITM): 5 | def __init__( 6 | self, num_topics=10, activation='softplus', dropout=0.2, 7 | learn_priors=True, batch_size=64, lr=2e-3, momentum=0.99, 8 | solver='adam', num_epochs=100, reduce_on_plateau=False, prior_mean=0.0, 9 | prior_variance=None, 
num_layers=2, num_neurons=100, num_samples=10, 10 | use_partitions=True): 11 | super().__init__( 12 | num_topics=num_topics, model_type='LDA', activation=activation, 13 | dropout=dropout, learn_priors=learn_priors, batch_size=batch_size, 14 | lr=lr, momentum=momentum, solver=solver, num_epochs=num_epochs, 15 | reduce_on_plateau=reduce_on_plateau, prior_mean=prior_mean, 16 | prior_variance=prior_variance, num_layers=num_layers, 17 | num_neurons=num_neurons, num_samples=num_samples, 18 | use_partitions=use_partitions) 19 | 20 | def train_model(self, dataset, hyperparameters=None, top_words=10): 21 | return super().train_model( 22 | dataset=dataset, hyperparameters=hyperparameters, 23 | top_words=top_words) 24 | -------------------------------------------------------------------------------- /octis/models/ProdLDA.py: -------------------------------------------------------------------------------- 1 | from octis.models.pytorchavitm.AVITM import AVITM 2 | 3 | 4 | class ProdLDA(AVITM): 5 | def __init__( 6 | self, num_topics=10, activation='softplus', dropout=0.2, 7 | learn_priors=True, batch_size=64, lr=2e-3, momentum=0.99, 8 | solver='adam', num_epochs=100, reduce_on_plateau=False, prior_mean=0.0, 9 | prior_variance=None, num_layers=2, num_neurons=100, num_samples=10, 10 | use_partitions=True): 11 | super().__init__( 12 | num_topics=num_topics, model_type='prodLDA', activation=activation, 13 | dropout=dropout, learn_priors=learn_priors, batch_size=batch_size, 14 | lr=lr, momentum=momentum, solver=solver, num_epochs=num_epochs, 15 | reduce_on_plateau=reduce_on_plateau, prior_mean=prior_mean, 16 | prior_variance=prior_variance, num_layers=num_layers, 17 | num_neurons=num_neurons, num_samples=num_samples, 18 | use_partitions=use_partitions) 19 | 20 | def train_model(self, dataset, hyperparameters=None, top_words=10): 21 | return super().train_model(dataset, hyperparameters, top_words) 22 | -------------------------------------------------------------------------------- /octis/models/__init__.py: -------------------------------------------------------------------------------- 1 | from . 
import * -------------------------------------------------------------------------------- /octis/models/base_etm.py: -------------------------------------------------------------------------------- 1 | from octis.models.model import AbstractModel 2 | import pickle as pkl 3 | import numpy as np 4 | from torch import optim 5 | from gensim.models import KeyedVectors 6 | import torch 7 | 8 | 9 | class BaseETM(AbstractModel): 10 | """ 11 | this is the base model both the embedde 12 | and the dynamic embedded topic model will inherit from 13 | it since it contains the methods that are share among the both models 14 | """ 15 | def set_optimizer(self): 16 | self.hyperparameters['lr'] = float(self.hyperparameters['lr']) 17 | self.hyperparameters['wdecay'] = float(self.hyperparameters['wdecay']) 18 | if self.hyperparameters['optimizer'] == 'adam': 19 | optimizer = optim.Adam( 20 | self.model.parameters(), lr=self.hyperparameters['lr'], 21 | weight_decay=self.hyperparameters['wdecay']) 22 | elif self.hyperparameters['optimizer'] == 'adagrad': 23 | optimizer = optim.Adagrad( 24 | self.model.parameters(), lr=self.hyperparameters['lr'], 25 | weight_decay=self.hyperparameters['wdecay']) 26 | elif self.hyperparameters['optimizer'] == 'adadelta': 27 | optimizer = optim.Adadelta( 28 | self.model.parameters(), lr=self.hyperparameters['lr'], 29 | weight_decay=self.hyperparameters['wdecay']) 30 | elif self.hyperparameters['optimizer'] == 'rmsprop': 31 | optimizer = optim.RMSprop( 32 | self.model.parameters(), lr=self.hyperparameters['lr'], 33 | weight_decay=self.hyperparameters['wdecay']) 34 | elif self.hyperparameters['optimizer'] == 'asgd': 35 | optimizer = optim.ASGD( 36 | self.model.parameters(), lr=self.hyperparameters['lr'], 37 | t0=0, lambd=0., weight_decay=self.hyperparameters['wdecay']) 38 | elif self.hyperparameters['optimizer'] == 'sgd': 39 | optimizer = optim.SGD( 40 | self.model.parameters(), lr=self.hyperparameters['lr'], 41 | weight_decay=self.hyperparameters['wdecay']) 42 | else: 43 | print('Defaulting to vanilla SGD') 44 | optimizer = optim.SGD( 45 | self.model.parameters(), lr=self.hyperparameters['lr']) 46 | 47 | return optimizer 48 | 49 | @staticmethod 50 | def preprocess( 51 | vocab2id, train_corpus, test_corpus=None, validation_corpus=None): 52 | 53 | raise NotImplementedError("Subclasses should implement this!") 54 | 55 | def load_embeddings(self): 56 | if self.hyperparameters['train_embeddings']: 57 | return 58 | 59 | vectors = self._load_word_vectors( 60 | self.hyperparameters['embeddings_path'], 61 | self.hyperparameters['embeddings_type'], 62 | self.hyperparameters['binary_embeddings'], 63 | self.hyperparameters['headerless_embeddings']) 64 | embeddings = np.zeros( 65 | (len(self.vocab.keys()), self.hyperparameters['embedding_size'])) 66 | for i, word in enumerate(self.vocab.values()): 67 | try: 68 | embeddings[i] = vectors[word] 69 | except KeyError: 70 | embeddings[i] = np.random.normal( 71 | scale=0.6, size=(self.hyperparameters['embedding_size'],)) 72 | self.embeddings = torch.from_numpy(embeddings).to(self.device) 73 | 74 | def _load_word_vectors( 75 | self, embeddings_path, embeddings_type, binary_embeddings=True, 76 | headerless_embeddings=False): 77 | """ 78 | Reads word embeddings from a specified file and format. 79 | 80 | :param embeddings_path: string, path to embeddings file. 
81 | Can be a binary file for the 'pickle', 'keyedvectors' and 82 | 'word2vec' types or a text file for 'word2vec' 83 | :param embeddings_type: string, defines the format of the embeddings 84 | file. Possible values are 'pickle', 'keyedvectors' or 'word2vec'. 85 | If set to 'pickle', you must provide a file created with 'pickle' 86 | containing an array of word embeddings, composed by words and 87 | their respective vectors. If set to 'keyedvectors', you must 88 | provide a file containing a saved gensim.models.KeyedVectors 89 | instance. If set to 'word2vec', you must provide a file with the 90 | original word2vec format 91 | :param binary_embeddings: bool, indicates if the original word2vec 92 | embeddings file is binary or textual (default True) 93 | :param headerless_embeddings: bool, indicates if the original word2vec 94 | embeddings textual file has a header line in the format 95 | " " (default False) 96 | :returns: gensim.models.KeyedVectors or dict 97 | """ 98 | if embeddings_type == 'keyedvectors': 99 | return KeyedVectors.load(embeddings_path, mmap='r') 100 | elif embeddings_type == 'word2vec': 101 | return KeyedVectors.load_word2vec_format( 102 | embeddings_path, binary=binary_embeddings, 103 | no_header=headerless_embeddings) 104 | 105 | vectors = {} 106 | embs = pkl.load(open(embeddings_path, 'rb')) 107 | for emb in embs: 108 | line = emb.split() 109 | word = line[0] 110 | if word in self.vocab.values(): 111 | vect = np.array(line[1:]).astype(float) 112 | vectors[word] = vect 113 | return vectors 114 | 115 | def filter_pretrained_embeddings( 116 | self, pretrained_embeddings_path, save_embedding_path, 117 | vocab_path, binary=True): 118 | """ 119 | Filter the embeddings from a set of word2vec-format pretrained 120 | embeddings based on the vocabulary. This should allow you to avoid to 121 | load the whole embedding space every time you do Bayesian Optimization 122 | but just the embeddings that are in the vocabulary. 123 | :param pretrained_embeddings_path: 124 | :return: 125 | """ 126 | vocab = [] 127 | with open(vocab_path, 'r') as fr: 128 | for line in fr.readlines(): 129 | vocab.append(line.strip().split(" ")[0]) 130 | 131 | w2v_model = KeyedVectors.load_word2vec_format( 132 | pretrained_embeddings_path, binary=binary) 133 | embeddings = [] 134 | for word in vocab: 135 | if word in w2v_model.vocab: 136 | line = word 137 | for w in w2v_model[word].tolist(): 138 | line = line + " " + str(w) 139 | embeddings.append(line) 140 | pkl.dump(embeddings, open(save_embedding_path, 'wb')) 141 | -------------------------------------------------------------------------------- /octis/models/contextualized_topic_models/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020, Federico Bianchi 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /octis/models/contextualized_topic_models/__init__.py: -------------------------------------------------------------------------------- 1 | """Top-level package for Contextualized Topic Models.""" 2 | 3 | __author__ = """Federico Bianchi""" 4 | __email__ = 'f.bianchi@unibocconi.it' 5 | __version__ = '1.7.0' 6 | -------------------------------------------------------------------------------- /octis/models/contextualized_topic_models/contextualized_topic_models.py: -------------------------------------------------------------------------------- 1 | """Main module.""" 2 | -------------------------------------------------------------------------------- /octis/models/contextualized_topic_models/datasets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MIND-Lab/OCTIS/8f9a0ac5a12e3b72ecd77dcba4f181c77e4595f1/octis/models/contextualized_topic_models/datasets/__init__.py -------------------------------------------------------------------------------- /octis/models/contextualized_topic_models/datasets/dataset.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.utils.data import Dataset 3 | import scipy.sparse 4 | 5 | 6 | class CTMDataset(Dataset): 7 | 8 | """Class to load BOW dataset.""" 9 | 10 | def __init__(self, X, X_bert, idx2token): 11 | """ 12 | Args 13 | X : array-like, shape=(n_samples, n_features) 14 | Document word matrix. 15 | """ 16 | if X.shape[0] != len(X_bert): 17 | raise Exception("Wait! BoW and Contextual Embeddings have different sizes! " 18 | "You might want to check if the BoW preparation method has removed some documents. 
") 19 | 20 | self.X = X 21 | self.X_bert = X_bert 22 | self.idx2token = idx2token 23 | 24 | def __len__(self): 25 | """Return length of dataset.""" 26 | return self.X.shape[0] 27 | 28 | def __getitem__(self, i): 29 | """Return sample from dataset at index i.""" 30 | if type(self.X[i]) == scipy.sparse.csr.csr_matrix: 31 | X = torch.FloatTensor(self.X[i].todense()) 32 | X_bert = torch.FloatTensor(self.X_bert[i]) 33 | else: 34 | X = torch.FloatTensor(self.X[i]) 35 | X_bert = torch.FloatTensor(self.X_bert[i]) 36 | 37 | return {'X': X, 'X_bert': X_bert} 38 | 39 | 40 | -------------------------------------------------------------------------------- /octis/models/contextualized_topic_models/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MIND-Lab/OCTIS/8f9a0ac5a12e3b72ecd77dcba4f181c77e4595f1/octis/models/contextualized_topic_models/models/__init__.py -------------------------------------------------------------------------------- /octis/models/contextualized_topic_models/networks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MIND-Lab/OCTIS/8f9a0ac5a12e3b72ecd77dcba4f181c77e4595f1/octis/models/contextualized_topic_models/networks/__init__.py -------------------------------------------------------------------------------- /octis/models/contextualized_topic_models/networks/decoding_network.py: -------------------------------------------------------------------------------- 1 | """PyTorch class for feed foward AVITM network.""" 2 | 3 | import torch 4 | from torch import nn 5 | from torch.nn import functional as F 6 | import numpy as np 7 | 8 | from octis.models.contextualized_topic_models.networks.inference_network import CombinedInferenceNetwork, ContextualInferenceNetwork 9 | 10 | 11 | class DecoderNetwork(nn.Module): 12 | 13 | """AVITM Network.""" 14 | 15 | def __init__(self, input_size, bert_size, infnet, n_components=10, model_type='prodLDA', 16 | hidden_sizes=(100,100), activation='softplus', dropout=0.2, 17 | learn_priors=True, topic_prior_mean=0.0, topic_prior_variance=None): 18 | """ 19 | Initialize InferenceNetwork. 20 | 21 | Args 22 | input_size : int, dimension of input 23 | n_components : int, number of topic components, (default 10) 24 | model_type : string, 'prodLDA' or 'LDA' (default 'prodLDA') 25 | hidden_sizes : tuple, length = n_layers, (default (100, 100)) 26 | activation : string, 'softplus', 'relu', (default 'softplus') 27 | learn_priors : bool, make priors learnable parameter 28 | topic_prior_mean: double, mean parameter of the prior 29 | topic_prior_variance: double, variance parameter of the prior 30 | """ 31 | super(DecoderNetwork, self).__init__() 32 | assert isinstance(input_size, int), "input_size must by type int." 33 | assert (isinstance(n_components, int) or isinstance(n_components, np.int64)) and n_components > 0, \ 34 | "n_components must be type int > 0." 35 | assert model_type in ['prodLDA', 'LDA'], \ 36 | "model type must be 'prodLDA' or 'LDA'" 37 | assert isinstance(hidden_sizes, tuple), \ 38 | "hidden_sizes must be type tuple." 39 | assert activation in ['softplus', 'relu', 'sigmoid', 'tanh', 'leakyrelu', 40 | 'rrelu', 'elu', 'selu'], \ 41 | "activation must be 'softplus', 'relu', 'sigmoid', 'leakyrelu'," \ 42 | " 'rrelu', 'elu', 'selu' or 'tanh'." 43 | assert dropout >= 0, "dropout must be >= 0." 
44 | assert isinstance(topic_prior_mean, float), \ 45 | "topic_prior_mean must be type float" 46 | # and topic_prior_variance >= 0, \ 47 | #assert isinstance(topic_prior_variance, float), \ 48 | # "topic prior_variance must be type float" 49 | 50 | self.input_size = input_size 51 | self.n_components = n_components 52 | self.model_type = model_type 53 | self.hidden_sizes = hidden_sizes 54 | self.activation = activation 55 | self.dropout = dropout 56 | self.learn_priors = learn_priors 57 | 58 | if infnet == "zeroshot": 59 | self.inf_net = ContextualInferenceNetwork( 60 | input_size, bert_size, n_components, hidden_sizes, activation) 61 | elif infnet == "combined": 62 | self.inf_net = CombinedInferenceNetwork( 63 | input_size, bert_size, n_components, hidden_sizes, activation) 64 | else: 65 | raise Exception('Missing infnet parameter, options are zeroshot and combined') 66 | if torch.cuda.is_available(): 67 | self.inf_net = self.inf_net.cuda() 68 | # init prior parameters 69 | # \mu_1k = log \alpha_k + 1/K \sum_i log \alpha_i; 70 | # \alpha = 1 \forall \alpha 71 | #self.topic_prior_mean = topic_prior_mean 72 | self.prior_mean = torch.tensor( 73 | [topic_prior_mean] * n_components) 74 | if torch.cuda.is_available(): 75 | self.prior_mean = self.prior_mean.cuda() 76 | if self.learn_priors: 77 | self.prior_mean = nn.Parameter(self.prior_mean) 78 | 79 | 80 | # \Sigma_1kk = 1 / \alpha_k (1 - 2/K) + 1/K^2 \sum_i 1 / \alpha_k; 81 | # \alpha = 1 \forall \alpha 82 | if topic_prior_variance is None: 83 | topic_prior_variance = 1. - (1. / self.n_components) 84 | self.prior_variance = torch.tensor( 85 | [topic_prior_variance] * n_components) 86 | if torch.cuda.is_available(): 87 | self.prior_variance = self.prior_variance.cuda() 88 | if self.learn_priors: 89 | self.prior_variance = nn.Parameter(self.prior_variance) 90 | 91 | self.beta = torch.Tensor(n_components, input_size) 92 | if torch.cuda.is_available(): 93 | self.beta = self.beta.cuda() 94 | self.beta = nn.Parameter(self.beta) 95 | nn.init.xavier_uniform_(self.beta) 96 | 97 | self.beta_batchnorm = nn.BatchNorm1d(input_size, affine=False) 98 | 99 | # dropout on theta 100 | self.drop_theta = nn.Dropout(p=self.dropout) 101 | 102 | @staticmethod 103 | def reparameterize(mu, logvar): 104 | """Reparameterize the theta distribution.""" 105 | std = torch.exp(0.5*logvar) 106 | eps = torch.randn_like(std) 107 | return eps.mul(std).add_(mu) 108 | 109 | def forward(self, x, x_bert): 110 | """Forward pass.""" 111 | # batch_size x n_components 112 | posterior_mu, posterior_log_sigma = self.inf_net(x, x_bert) 113 | posterior_sigma = torch.exp(posterior_log_sigma) 114 | 115 | # generate samples from theta 116 | theta = F.softmax(self.reparameterize(posterior_mu, posterior_log_sigma), dim=1) 117 | 118 | topic_doc = theta 119 | theta = self.drop_theta(theta) 120 | 121 | # prodLDA vs LDA 122 | if self.model_type == 'prodLDA': 123 | # in: batch_size x input_size x n_components 124 | word_dist = F.softmax( 125 | self.beta_batchnorm(torch.matmul(theta, self.beta)), dim=1) 126 | topic_word = self.beta 127 | # word_dist: batch_size x input_size 128 | #self.topic_word_matrix = self.beta 129 | elif self.model_type == 'LDA': 130 | # simplex constrain on Beta 131 | beta = F.softmax(self.beta_batchnorm(self.beta), dim=1) 132 | topic_word = beta 133 | word_dist = torch.matmul(theta, beta) 134 | # word_dist: batch_size x input_size 135 | 136 | return self.prior_mean, self.prior_variance, \ 137 | posterior_mu, posterior_sigma, posterior_log_sigma, word_dist, topic_word, topic_doc 
138 | 139 | def get_theta(self, x, x_bert): 140 | with torch.no_grad(): 141 | # batch_size x n_components 142 | posterior_mu, posterior_log_sigma = self.inf_net(x, x_bert) 143 | posterior_sigma = torch.exp(posterior_log_sigma) 144 | 145 | # generate samples from theta 146 | theta = F.softmax( 147 | self.reparameterize(posterior_mu, posterior_log_sigma), dim=1) 148 | 149 | return theta 150 | -------------------------------------------------------------------------------- /octis/models/contextualized_topic_models/networks/inference_network.py: -------------------------------------------------------------------------------- 1 | """PyTorch class for feed forward inference network.""" 2 | 3 | from collections import OrderedDict 4 | from torch import nn 5 | import torch 6 | import numpy as np 7 | 8 | class ContextualInferenceNetwork(nn.Module): 9 | 10 | """Inference Network.""" 11 | 12 | def __init__(self, input_size, bert_size, output_size, hidden_sizes, 13 | activation='softplus', dropout=0.2): 14 | """ 15 | Initialize ContextualInferenceNetwork. 16 | 17 | Args 18 | input_size : int, dimension of input 19 | output_size : int, dimension of output 20 | hidden_sizes : tuple, length = n_layers 21 | activation : string, 'softplus' or 'relu', default 'softplus' 22 | dropout : float, default 0.2 23 | """ 24 | super(ContextualInferenceNetwork, self).__init__() 25 | assert isinstance(input_size, int), "input_size must be type int." 26 | assert isinstance(output_size, int) or isinstance(output_size, np.int64), "output_size must be type int." 27 | assert isinstance(hidden_sizes, tuple), \ 28 | "hidden_sizes must be type tuple." 29 | assert activation in ['softplus', 'relu', 'sigmoid', 'tanh', 'leakyrelu', 30 | 'rrelu', 'elu', 'selu'], \ 31 | "activation must be 'softplus', 'relu', 'sigmoid', 'leakyrelu'," \ 32 | " 'rrelu', 'elu', 'selu' or 'tanh'." 33 | assert dropout >= 0, "dropout must be >= 0."
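# Editorial note (not part of the original file): in this "zeroshot"
# inference network the forward pass below encodes only the contextualized
# embedding x_bert; the bag-of-words input x is accepted for interface
# compatibility with CombinedInferenceNetwork but is not used.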
34 | 35 | self.input_size = input_size 36 | self.output_size = output_size 37 | self.hidden_sizes = hidden_sizes 38 | self.dropout = dropout 39 | 40 | if activation == 'softplus': 41 | self.activation = nn.Softplus() 42 | elif activation == 'relu': 43 | self.activation = nn.ReLU() 44 | elif activation == 'sigmoid': 45 | self.activation = nn.Sigmoid() 46 | elif activation == 'tanh': 47 | self.activation = nn.Tanh() 48 | elif activation == 'leakyrelu': 49 | self.activation = nn.LeakyReLU() 50 | elif activation == 'rrelu': 51 | self.activation = nn.RReLU() 52 | elif activation == 'elu': 53 | self.activation = nn.ELU() 54 | elif activation == 'selu': 55 | self.activation = nn.SELU() 56 | 57 | self.input_layer = nn.Linear(input_size+input_size, hidden_sizes[0]) 58 | self.adapt_bert = nn.Linear(bert_size, hidden_sizes[0]) 59 | 60 | self.hiddens = nn.Sequential(OrderedDict([ 61 | ('l_{}'.format(i), nn.Sequential(nn.Linear(h_in, h_out), self.activation)) 62 | for i, (h_in, h_out) in enumerate(zip(hidden_sizes[:-1], hidden_sizes[1:]))])) 63 | 64 | self.f_mu = nn.Linear(hidden_sizes[-1], output_size) 65 | self.f_mu_batchnorm = nn.BatchNorm1d(output_size, affine=False) 66 | 67 | self.f_sigma = nn.Linear(hidden_sizes[-1], output_size) 68 | self.f_sigma_batchnorm = nn.BatchNorm1d(output_size, affine=False) 69 | 70 | self.dropout_enc = nn.Dropout(p=self.dropout) 71 | 72 | def forward(self, x, x_bert): 73 | """Forward pass.""" 74 | x_bert = self.adapt_bert(x_bert) 75 | 76 | x = self.activation(x_bert) 77 | x = self.hiddens(x) 78 | x = self.dropout_enc(x) 79 | mu = self.f_mu_batchnorm(self.f_mu(x)) 80 | log_sigma = self.f_sigma_batchnorm(self.f_sigma(x)) 81 | 82 | return mu, log_sigma 83 | 84 | 85 | class CombinedInferenceNetwork(nn.Module): 86 | 87 | """Inference Network.""" 88 | 89 | def __init__(self, input_size, bert_size, output_size, hidden_sizes, 90 | activation='softplus', dropout=0.2): 91 | """ 92 | Initialize InferenceNetwork. 93 | 94 | Args 95 | input_size : int, dimension of input 96 | output_size : int, dimension of output 97 | hidden_sizes : tuple, length = n_layers 98 | activation : string, 'softplus' or 'relu', default 'softplus' 99 | dropout : float, default 0.2, default 0.2 100 | """ 101 | super(CombinedInferenceNetwork, self).__init__() 102 | assert isinstance(input_size, int), "input_size must by type int." 103 | assert (isinstance(output_size, int) or isinstance(output_size, np.int64)), "output_size must be type int." 104 | assert isinstance(hidden_sizes, tuple), \ 105 | "hidden_sizes must be type tuple." 106 | assert activation in ['softplus', 'relu', 'sigmoid', 'tanh', 'leakyrelu', 107 | 'rrelu', 'elu', 'selu'], \ 108 | "activation must be 'softplus', 'relu', 'sigmoid', 'leakyrelu'," \ 109 | " 'rrelu', 'elu', 'selu' or 'tanh'." 110 | 111 | assert dropout >= 0, "dropout must be >= 0." 
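# Editorial note (not part of the original file): unlike the zeroshot
# network above, the forward pass of this combined network projects the
# contextualized embedding to the vocabulary size via adapt_bert and
# concatenates it with the bag-of-words input, which is why input_layer
# takes input_size + input_size features.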
112 | 113 | self.input_size = input_size 114 | self.output_size = output_size 115 | self.hidden_sizes = hidden_sizes 116 | self.dropout = dropout 117 | 118 | if activation == 'softplus': 119 | self.activation = nn.Softplus() 120 | elif activation == 'relu': 121 | self.activation = nn.ReLU() 122 | elif activation == 'sigmoid': 123 | self.activation = nn.Sigmoid() 124 | elif activation == 'tanh': 125 | self.activation = nn.Tanh() 126 | elif activation == 'leakyrelu': 127 | self.activation = nn.LeakyReLU() 128 | elif activation == 'rrelu': 129 | self.activation = nn.RReLU() 130 | elif activation == 'elu': 131 | self.activation = nn.ELU() 132 | elif activation == 'selu': 133 | self.activation = nn.SELU() 134 | 135 | self.input_layer = nn.Linear(input_size+input_size, hidden_sizes[0]) 136 | self.adapt_bert = nn.Linear(bert_size, input_size) 137 | self.bert_layer = nn.Linear(hidden_sizes[0], hidden_sizes[0]) 138 | 139 | self.hiddens = nn.Sequential(OrderedDict([ 140 | ('l_{}'.format(i), nn.Sequential(nn.Linear(h_in, h_out), self.activation)) 141 | for i, (h_in, h_out) in enumerate(zip(hidden_sizes[:-1], hidden_sizes[1:]))])) 142 | 143 | self.f_mu = nn.Linear(hidden_sizes[-1], output_size) 144 | self.f_mu_batchnorm = nn.BatchNorm1d(output_size, affine=False) 145 | 146 | self.f_sigma = nn.Linear(hidden_sizes[-1], output_size) 147 | self.f_sigma_batchnorm = nn.BatchNorm1d(output_size, affine=False) 148 | 149 | self.dropout_enc = nn.Dropout(p=self.dropout) 150 | 151 | def forward(self, x, x_bert): 152 | """Forward pass.""" 153 | x_bert = self.adapt_bert(x_bert) 154 | x = torch.cat((x, x_bert), 1) 155 | x = self.input_layer(x) 156 | 157 | x = self.activation(x) 158 | x = self.hiddens(x) 159 | x = self.dropout_enc(x) 160 | mu = self.f_mu_batchnorm(self.f_mu(x)) 161 | log_sigma = self.f_sigma_batchnorm(self.f_sigma(x)) 162 | 163 | return mu, log_sigma 164 | -------------------------------------------------------------------------------- /octis/models/contextualized_topic_models/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MIND-Lab/OCTIS/8f9a0ac5a12e3b72ecd77dcba4f181c77e4595f1/octis/models/contextualized_topic_models/utils/__init__.py -------------------------------------------------------------------------------- /octis/models/contextualized_topic_models/utils/preprocessing.py: -------------------------------------------------------------------------------- 1 | from sklearn.feature_extraction.text import CountVectorizer 2 | import string 3 | from nltk.corpus import stopwords as stop_words 4 | import warnings 5 | 6 | class WhiteSpacePreprocessing(): 7 | """ 8 | Provides a very simple preprocessing script that filters infrequent tokens from text 9 | """ 10 | def __init__(self, documents, stopwords_language="english", vocabulary_size=2000): 11 | """ 12 | 13 | :param documents: list of strings 14 | :param stopwords_language: string of the language of the stopwords (see nltk stopwords) 15 | :param vocabulary_size: the number of most frequent words to include in the documents. Infrequent words will be discarded from the list of preprocessed documents 16 | """ 17 | self.documents = documents 18 | self.stopwords = set(stop_words.words(stopwords_language)) 19 | self.vocabulary_size = vocabulary_size 20 | 21 | def preprocess(self): 22 | """ 23 | Note that if after filtering some documents do not contain words we remove them. That is why we return also the 24 | list of unpreprocessed documents. 
25 | 26 | :return: preprocessed documents, unpreprocessed documents and the vocabulary list 27 | """ 28 | preprocessed_docs_tmp = self.documents 29 | preprocessed_docs_tmp = [doc.lower() for doc in preprocessed_docs_tmp] 30 | preprocessed_docs_tmp = [doc.translate( 31 | str.maketrans(string.punctuation, ' ' * len(string.punctuation))) for doc in preprocessed_docs_tmp] 32 | preprocessed_docs_tmp = [' '.join([w for w in doc.split() if len(w) > 0 and w not in self.stopwords]) 33 | for doc in preprocessed_docs_tmp] 34 | 35 | vectorizer = CountVectorizer(max_features=self.vocabulary_size, token_pattern=r'\b[a-zA-Z]{2,}\b') 36 | vectorizer.fit_transform(preprocessed_docs_tmp) 37 | vocabulary = set(vectorizer.get_feature_names_out()) 38 | preprocessed_docs_tmp = [' '.join([w for w in doc.split() if w in vocabulary]) 39 | for doc in preprocessed_docs_tmp] 40 | 41 | preprocessed_docs, unpreprocessed_docs = [], [] 42 | for i, doc in enumerate(preprocessed_docs_tmp): 43 | if len(doc) > 0: 44 | preprocessed_docs.append(doc) 45 | unpreprocessed_docs.append(self.documents[i]) 46 | 47 | return preprocessed_docs, unpreprocessed_docs, list(vocabulary) 48 | 49 | 50 | class SimplePreprocessing(WhiteSpacePreprocessing): 51 | def __init__(self, documents, stopwords_language="english"): 52 | super().__init__(documents, stopwords_language) 53 | warnings.simplefilter('always', DeprecationWarning) 54 | 55 | if self.__class__.__name__ == "SimplePreprocessing": 56 | 57 | warnings.warn("SimplePreprocessing is deprecated and will be removed in version 2.0, " 58 | "use WhiteSpacePreprocessing", DeprecationWarning) 59 | 60 | 61 | -------------------------------------------------------------------------------- /octis/models/early_stopping/.gitignore: -------------------------------------------------------------------------------- 1 | # Directories 2 | data/* 3 | 4 | # Byte-compiled / optimized / DLL files 5 | __pycache__/ 6 | *.py[cod] 7 | *$py.class 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Distribution / packaging 13 | .Python 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | wheels/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | MANIFEST 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | 53 | # Translations 54 | *.mo 55 | *.pot 56 | 57 | # Django stuff: 58 | *.log 59 | local_settings.py 60 | db.sqlite3 61 | 62 | # Flask stuff: 63 | instance/ 64 | .webassets-cache 65 | 66 | # Scrapy stuff: 67 | .scrapy 68 | 69 | # Sphinx documentation 70 | docs/_build/ 71 | 72 | # PyBuilder 73 | target/ 74 | 75 | # Jupyter Notebook 76 | .ipynb_checkpoints 77 | 78 | # pyenv 79 | .python-version 80 | 81 | # celery beat schedule file 82 | celerybeat-schedule 83 | 84 | # SageMath parsed files 85 | *.sage.py 86 | 87 | # Environments 88 | .env 89 | .venv 90 | env/ 91 | venv/ 92 | ENV/ 93 | env.bak/ 94 | venv.bak/ 95 | 96 | # Spyder project settings 97 | .spyderproject 98 | .spyproject 99 | 100 | # Rope project settings 101 | .ropeproject 102 | 103 | # mkdocs documentation 104 | /site 105 | 106 | # mypy 107 | .mypy_cache/ 108 | -------------------------------------------------------------------------------- /octis/models/early_stopping/CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to making participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, sex characteristics, gender identity and expression, 9 | level of experience, education, socio-economic status, nationality, personal 10 | appearance, race, religion, or sexual identity and orientation. 11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 
45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies both within project spaces and in public spaces 49 | when an individual is representing the project or its community. Examples of 50 | representing a project or community include using an official project e-mail 51 | address, posting via an official social media account, or acting as an appointed 52 | representative at an online or offline event. Representation of a project may be 53 | further defined and clarified by project maintainers. 54 | 55 | ## Enforcement 56 | 57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 58 | reported by contacting the project team at BjarteSunde@outlook.com. All 59 | complaints will be reviewed and investigated and will result in a response that 60 | is deemed necessary and appropriate to the circumstances. The project team is 61 | obligated to maintain confidentiality with regard to the reporter of an incident. 62 | Further details of specific enforcement policies may be posted separately. 63 | 64 | Project maintainers who do not follow or enforce the Code of Conduct in good 65 | faith may face temporary or permanent repercussions as determined by other 66 | members of the project's leadership. 67 | 68 | ## Attribution 69 | 70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 71 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html 72 | 73 | [homepage]: https://www.contributor-covenant.org 74 | 75 | For answers to common questions about this code of conduct, see 76 | https://www.contributor-covenant.org/faq 77 | -------------------------------------------------------------------------------- /octis/models/early_stopping/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Bjarte Mehus Sunde 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /octis/models/early_stopping/README.md: -------------------------------------------------------------------------------- 1 | # Early Stopping for PyTorch 2 | Early stopping is a form of regularization used to avoid overfitting on the training dataset. Early stopping keeps track of the validation loss, if the loss stops decreasing for several epochs in a row the training stops. 
The ```EarlyStopping``` class in ```pytorchtools.py``` is used to create an object to keep track of the validation loss while training a [PyTorch](https://pytorch.org/) model. It will save a checkpoint of the model each time the validation loss decreases. We set the ```patience``` argument in the ```EarlyStopping``` class to how many epochs we want to wait after the last time the validation loss improved before breaking the training loop. There is a simple example of how to use the ```EarlyStopping``` class in the [MNIST_Early_Stopping_example](MNIST_Early_Stopping_example.ipynb) notebook. 3 | 4 | Underneath is a plot from the example notebook, which shows the last checkpoint made by the EarlyStopping object, right before the model started to overfit. It had patience set to 20. 5 | 6 | ![Loss plot](loss_plot.png?raw=true) 7 | 8 | ## Usage 9 | 10 | You can run this project directly in the browser by clicking this button: [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/Bjarten/early-stopping-pytorch/master), or you can clone the project to your computer and install the required pip packages specified in the requirements text file. 11 | ``` 12 | pip install -r requirements.txt 13 | ``` 14 | 15 | ## References 16 | The ```EarlyStopping``` class in ```pytorchtools.py``` is inspired by the [ignite EarlyStopping class](https://github.com/pytorch/ignite/blob/master/ignite/handlers/early_stopping.py). 17 | -------------------------------------------------------------------------------- /octis/models/early_stopping/__init__.py: -------------------------------------------------------------------------------- 1 | from octis.models.early_stopping.pytorchtools import EarlyStopping 2 | -------------------------------------------------------------------------------- /octis/models/early_stopping/checkpoint.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MIND-Lab/OCTIS/8f9a0ac5a12e3b72ecd77dcba4f181c77e4595f1/octis/models/early_stopping/checkpoint.pt -------------------------------------------------------------------------------- /octis/models/early_stopping/loss_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MIND-Lab/OCTIS/8f9a0ac5a12e3b72ecd77dcba4f181c77e4595f1/octis/models/early_stopping/loss_plot.png -------------------------------------------------------------------------------- /octis/models/early_stopping/pytorchtools.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | class EarlyStopping: 5 | """Stops the training early if the validation loss doesn't improve after a given patience.""" 6 | def __init__(self, patience=7, verbose=False, delta=0, path='checkpoint.pt', trace_func=print): 7 | """ 8 | Args: 9 | patience (int): How long to wait after last time validation loss improved. 10 | Default: 7 11 | verbose (bool): If True, prints a message for each validation loss improvement. 12 | Default: False 13 | delta (float): Minimum change in the monitored quantity to qualify as an improvement. 14 | Default: 0 15 | path (str): Path for the checkpoint to be saved to. 16 | Default: 'checkpoint.pt' 17 | trace_func (function): trace print function.
18 | Default: print 19 | """ 20 | self.patience = patience 21 | self.verbose = verbose 22 | self.counter = 0 23 | self.best_score = None 24 | self.early_stop = False 25 | self.val_loss_min = np.Inf 26 | self.delta = delta 27 | self.path = path 28 | self.trace_func = trace_func 29 | 30 | def __call__(self, val_loss, model): 31 | 32 | score = -val_loss 33 | 34 | if self.best_score is None: 35 | self.best_score = score 36 | self.save_checkpoint(val_loss, model) 37 | elif score < self.best_score + self.delta: 38 | self.counter += 1 39 | if self.verbose: 40 | self.trace_func(f'EarlyStopping counter: {self.counter} out of {self.patience}') 41 | if self.counter >= self.patience: 42 | self.early_stop = True 43 | else: 44 | self.best_score = score 45 | self.save_checkpoint(val_loss, model) 46 | self.counter = 0 47 | 48 | def save_checkpoint(self, val_loss, model): 49 | '''Saves model when validation loss decrease.''' 50 | if self.verbose: 51 | self.trace_func(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}). Saving model ...') 52 | torch.save(model.state_dict(), self.path) 53 | self.val_loss_min = val_loss 54 | -------------------------------------------------------------------------------- /octis/models/early_stopping/requirements.txt: -------------------------------------------------------------------------------- 1 | matplotlib 2 | numpy 3 | torchvision 4 | -------------------------------------------------------------------------------- /octis/models/model.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | import os 3 | import numpy as np 4 | import json 5 | 6 | 7 | class AbstractModel(ABC): 8 | """ 9 | Class structure of a generic Topic Modeling implementation 10 | """ 11 | 12 | def __init__(self): 13 | """ 14 | Create a blank model to initialize 15 | """ 16 | self.hyperparameters = dict() 17 | 18 | def set_hyperparameters(self, **kwargs): 19 | """ 20 | Set model hyperparameters 21 | 22 | :param **kwargs: a dictionary of in the form {hyperparameter name: value} 23 | """ 24 | for key, value in kwargs.items(): 25 | self.hyperparameters[key] = value 26 | 27 | @abstractmethod 28 | def train_model(self, dataset, hyperparameters, top_words=10): 29 | """ 30 | Train the model. 31 | :param dataset: Dataset 32 | :param hyperparameters: dictionary in the form {hyperparameter name: value} 33 | :param top_words: number of top significant words for each topic (default: 10) 34 | 35 | :return model_output: a dictionary containing up to 4 keys: *topics*, *topic-word-matrix*, 36 | *topic-document-matrix*, *test-topic-document-matrix*. *topics* is the list of the most significant words for 37 | each topic (list of lists of strings). *topic-word-matrix* is the matrix (num topics x ||vocabulary||) 38 | containing the probabilities of a word in a given topic. *topic-document-matrix* is the matrix (||topics|| x 39 | ||training documents||) containing the probabilities of the topics in a given training document. 40 | *test-topic-document-matrix* is the matrix (||topics|| x ||testing documents||) containing the probabilities 41 | of the topics in a given testing document. 
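Editorial example (a sketch, not part of the original docstring): with a concrete subclass such as the package's LDA wrapper, `output = model.train_model(dataset)` returns a dictionary where `output["topics"][0]` is the list of the `top_words` most significant words of the first topic.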
42 | """ 43 | pass 44 | 45 | 46 | def save_model_output(model_output, path=os.curdir, appr_order=7): 47 | """ 48 | Saves the model output in the chosen directory 49 | 50 | :param model_output: output of the model 51 | :param path: path in which the file will be saved and name of the file 52 | :param appr_order: approximation order (used to round model_output values) 53 | """ 54 | 55 | to_save = {} 56 | try: 57 | for single_output in model_output.keys(): 58 | if single_output != "topics" and single_output != "test-topics": 59 | to_save[single_output] = ( 60 | model_output[single_output].round(appr_order)) 61 | else: 62 | to_save[single_output] = (model_output[single_output]) 63 | np.savez_compressed(path, **to_save) 64 | except: 65 | raise Exception("error in saving the output model file") 66 | 67 | 68 | def load_model_output(output_path, vocabulary_path=None, top_words=10): 69 | """ 70 | Loads a model output from the choosen directory 71 | 72 | Parameters 73 | ---------- 74 | :param output_path: path in which th model output is saved 75 | :param vocabulary_path: path in which the vocabulary is saved (optional, 76 | used to retrieve the top k words of each topic) 77 | :param top_words: top k words to retrieve for each topic (in case a 78 | vocabulary path is given) 79 | """ 80 | output = dict(np.load(output_path, allow_pickle=True)) 81 | if vocabulary_path is not None: 82 | vocabulary_file = open(vocabulary_path, 'r') 83 | vocabulary = json.load(vocabulary_file) 84 | index2vocab = vocabulary 85 | 86 | topics_output = [] 87 | for topic in output["topic-word-matrix"]: 88 | top_k = np.argsort(topic)[-top_words:] 89 | top_k_words = list( 90 | reversed([[ 91 | index2vocab[str(i)], float(topic[i])] for i in top_k])) 92 | topics_output.append(top_k_words) 93 | 94 | output["topic-word-matrix"] = output["topic-word-matrix"].tolist() 95 | output["topic-document-matrix"] = output[ 96 | "topic-document-matrix"].tolist() 97 | if "test-topic-word-matrix" in output: 98 | output["test-topic-word-matrix"] = output[ 99 | "test-topic-word-matrix"].tolist() 100 | if "test-topic-document-matrix" in output: 101 | output["test-topic-document-matrix"] = output[ 102 | "test-topic-document-matrix"].tolist() 103 | 104 | output["topics"] = topics_output 105 | return output 106 | -------------------------------------------------------------------------------- /octis/models/pytorchavitm/__init__.py: -------------------------------------------------------------------------------- 1 | """Init package""" 2 | 3 | from octis.models.pytorchavitm.avitm.avitm_model import AVITM_model 4 | -------------------------------------------------------------------------------- /octis/models/pytorchavitm/avitm/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MIND-Lab/OCTIS/8f9a0ac5a12e3b72ecd77dcba4f181c77e4595f1/octis/models/pytorchavitm/avitm/__init__.py -------------------------------------------------------------------------------- /octis/models/pytorchavitm/avitm/decoder_network.py: -------------------------------------------------------------------------------- 1 | """PyTorch class for feed foward AVITM network.""" 2 | 3 | import torch 4 | from torch import nn 5 | from torch.nn import functional as F 6 | from octis.models.pytorchavitm.avitm.inference_network import InferenceNetwork 7 | 8 | 9 | class DecoderNetwork(nn.Module): 10 | 11 | """AVITM Network.""" 12 | 13 | def __init__(self, input_size, n_components=10, model_type='prodLDA', 14 | 
hidden_sizes=(100,100), activation='softplus', dropout=0.2, 15 | learn_priors=True, topic_prior_mean=0.0, topic_prior_variance=None): 16 | """ 17 | Initialize InferenceNetwork. 18 | 19 | Args 20 | input_size : int, dimension of input 21 | n_components : int, number of topic components, (default 10) 22 | model_type : string, 'prodLDA' or 'LDA' (default 'prodLDA') 23 | hidden_sizes : tuple, length = n_layers, (default (100, 100)) 24 | activation : string, 'softplus', 'relu', (default 'softplus') 25 | learn_priors : bool, make priors learnable parameter 26 | topic_prior_mean: double, mean parameter of the prior 27 | topic_prior_variance: double, variance parameter of the prior 28 | """ 29 | super(DecoderNetwork, self).__init__() 30 | assert isinstance(input_size, int), "input_size must by type int." 31 | assert isinstance(n_components, int) and n_components > 0, \ 32 | "n_components must be type int > 0." 33 | assert model_type in ['prodLDA', 'LDA'], \ 34 | "model type must be 'prodLDA' or 'LDA'" 35 | assert isinstance(hidden_sizes, tuple), \ 36 | "hidden_sizes must be type tuple." 37 | assert activation in ['softplus', 'relu', 'sigmoid', 'tanh', 'leakyrelu', 38 | 'rrelu', 'elu', 'selu'], \ 39 | "activation must be 'softplus', 'relu', 'sigmoid', 'leakyrelu'," \ 40 | " 'rrelu', 'elu', 'selu' or 'tanh'." 41 | assert dropout >= 0, "dropout must be >= 0." 42 | assert isinstance(topic_prior_mean, float), \ 43 | "topic_prior_mean must be type float" 44 | # and topic_prior_variance >= 0, \ 45 | #assert isinstance(topic_prior_variance, float), \ 46 | # "topic prior_variance must be type float" 47 | 48 | self.input_size = input_size 49 | self.n_components = n_components 50 | self.model_type = model_type 51 | self.hidden_sizes = hidden_sizes 52 | self.activation = activation 53 | self.dropout = dropout 54 | self.learn_priors = learn_priors 55 | 56 | self.inf_net = InferenceNetwork( 57 | input_size, n_components, hidden_sizes, activation) 58 | if torch.cuda.is_available(): 59 | self.inf_net = self.inf_net.cuda() 60 | 61 | # init prior parameters 62 | # \mu_1k = log \alpha_k + 1/K \sum_i log \alpha_i; 63 | # \alpha = 1 \forall \alpha 64 | 65 | #self.topic_prior_mean = topic_prior_mean 66 | self.prior_mean = torch.tensor( 67 | [topic_prior_mean] * n_components) 68 | if torch.cuda.is_available(): 69 | self.prior_mean = self.prior_mean.cuda() 70 | if self.learn_priors: 71 | self.prior_mean = nn.Parameter(self.prior_mean) 72 | # \Sigma_1kk = 1 / \alpha_k (1 - 2/K) + 1/K^2 \sum_i 1 / \alpha_k; 73 | # \alpha = 1 \forall \alpha 74 | 75 | if topic_prior_variance is None: 76 | topic_prior_variance = 1. - (1. 
/ self.n_components) 77 | self.prior_variance = torch.tensor( 78 | [topic_prior_variance] * n_components) 79 | if torch.cuda.is_available(): 80 | self.prior_variance = self.prior_variance.cuda() 81 | if self.learn_priors: 82 | self.prior_variance = nn.Parameter(self.prior_variance) 83 | 84 | self.beta = torch.Tensor(n_components, input_size) 85 | if torch.cuda.is_available(): 86 | self.beta = self.beta.cuda() 87 | self.beta = nn.Parameter(self.beta) 88 | nn.init.xavier_uniform_(self.beta) 89 | 90 | self.beta_batchnorm = nn.BatchNorm1d(input_size, affine=False) 91 | 92 | # dropout on theta 93 | self.drop_theta = nn.Dropout(p=self.dropout) 94 | 95 | 96 | @staticmethod 97 | def reparameterize(mu, logvar): 98 | """Reparameterize the theta distribution.""" 99 | std = torch.exp(0.5*logvar) 100 | eps = torch.randn_like(std) 101 | return eps.mul(std).add_(mu) 102 | 103 | def forward(self, x): 104 | """Forward pass.""" 105 | # batch_size x n_components 106 | posterior_mu, posterior_log_sigma = self.inf_net(x) 107 | posterior_sigma = torch.exp(posterior_log_sigma) 108 | 109 | # generate samples from theta 110 | theta = F.softmax( 111 | self.reparameterize(posterior_mu, posterior_log_sigma), dim=1) 112 | topic_doc = theta 113 | theta = self.drop_theta(theta) 114 | 115 | # prodLDA vs LDA 116 | if self.model_type == 'prodLDA': 117 | # in: batch_size x input_size x n_components 118 | word_dist = F.softmax( 119 | self.beta_batchnorm(torch.matmul(theta, self.beta)), dim=1) 120 | topic_word = self.beta 121 | # word_dist: batch_size x input_size 122 | self.topic_word_matrix = self.beta 123 | elif self.model_type == 'LDA': 124 | # simplex constrain on Beta 125 | beta = F.softmax(self.beta_batchnorm(self.beta), dim=1) 126 | topic_word = beta 127 | word_dist = torch.matmul(theta, beta) 128 | # word_dist: batch_size x input_size 129 | 130 | return self.prior_mean, self.prior_variance, \ 131 | posterior_mu, posterior_sigma, posterior_log_sigma, word_dist, topic_word,topic_doc 132 | 133 | def get_theta(self, x): 134 | with torch.no_grad(): 135 | # batch_size x n_components 136 | posterior_mu, posterior_log_sigma = self.inf_net(x) 137 | posterior_sigma = torch.exp(posterior_log_sigma) 138 | 139 | # generate samples from theta 140 | theta = F.softmax( 141 | self.reparameterize(posterior_mu, posterior_log_sigma), dim=1) 142 | 143 | return theta 144 | -------------------------------------------------------------------------------- /octis/models/pytorchavitm/avitm/inference_network.py: -------------------------------------------------------------------------------- 1 | """PyTorch class for feed foward inference network.""" 2 | 3 | from collections import OrderedDict 4 | from torch import nn 5 | import torch 6 | 7 | 8 | class InferenceNetwork(nn.Module): 9 | 10 | """Inference Network.""" 11 | 12 | def __init__(self, input_size, output_size, hidden_sizes, 13 | activation='softplus', dropout=0.2): 14 | """ 15 | Initialize InferenceNetwork. 16 | 17 | Args 18 | input_size : int, dimension of input 19 | output_size : int, dimension of output 20 | hidden_sizes : tuple, length = n_layers 21 | activation : string, 'softplus' or 'relu', default 'softplus' 22 | dropout : float, default 0.2, default 0.2 23 | """ 24 | super(InferenceNetwork, self).__init__() 25 | assert isinstance(input_size, int), "input_size must by type int." 26 | assert isinstance(output_size, int), "output_size must be type int." 27 | assert isinstance(hidden_sizes, tuple), \ 28 | "hidden_sizes must be type tuple." 
29 | assert activation in ['softplus', 'relu', 'sigmoid', 'tanh', 'leakyrelu', 30 | 'rrelu', 'elu', 'selu'], \ 31 | "activation must be 'softplus', 'relu', 'sigmoid', 'leakyrelu'," \ 32 | " 'rrelu', 'elu', 'selu' or 'tanh'." 33 | assert dropout >= 0, "dropout must be >= 0." 34 | 35 | self.input_size = input_size 36 | self.output_size = output_size 37 | self.hidden_sizes = hidden_sizes 38 | self.dropout = dropout 39 | 40 | if activation == 'softplus': 41 | self.activation = nn.Softplus() 42 | elif activation == 'relu': 43 | self.activation = nn.ReLU() 44 | elif activation == 'sigmoid': 45 | self.activation = nn.Sigmoid() 46 | elif activation == 'tanh': 47 | self.activation = nn.Tanh() 48 | elif activation == 'leakyrelu': 49 | self.activation = nn.LeakyReLU() 50 | elif activation == 'rrelu': 51 | self.activation = nn.RReLU() 52 | elif activation == 'elu': 53 | self.activation = nn.ELU() 54 | elif activation == 'selu': 55 | self.activation = nn.SELU() 56 | 57 | self.input_layer = nn.Linear(input_size, hidden_sizes[0]) 58 | 59 | 60 | self.hiddens = nn.Sequential(OrderedDict([ 61 | ('l_{}'.format(i), nn.Sequential(nn.Linear(h_in, h_out), self.activation)) 62 | for i, (h_in, h_out) in enumerate(zip(hidden_sizes[:-1], hidden_sizes[1:]))])) 63 | 64 | self.f_mu = nn.Linear(hidden_sizes[-1], output_size) 65 | self.f_mu_batchnorm = nn.BatchNorm1d(output_size, affine=False) 66 | 67 | self.f_sigma = nn.Linear(hidden_sizes[-1], output_size) 68 | self.f_sigma_batchnorm = nn.BatchNorm1d(output_size, affine=False) 69 | 70 | self.dropout_enc = nn.Dropout(p=self.dropout) 71 | 72 | def forward(self, x): 73 | """Forward pass.""" 74 | x = self.input_layer(x) 75 | x = self.activation(x) 76 | x = self.hiddens(x) 77 | x = self.dropout_enc(x) 78 | mu = self.f_mu_batchnorm(self.f_mu(x)) 79 | log_sigma = self.f_sigma_batchnorm(self.f_sigma(x)) 80 | 81 | return mu, log_sigma 82 | 83 | 84 | class Swish(nn.Module): 85 | def __init__(self, slope = 1): 86 | super().__init__() 87 | #self.slope = slope * torch.nn.Parameter(torch.ones(1)) 88 | 89 | def forward(self, x): 90 | return x * torch.sigmoid(x) #self.slope * 91 | -------------------------------------------------------------------------------- /octis/models/pytorchavitm/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | """Init datasets.""" 2 | 3 | from octis.models.pytorchavitm.datasets.bow import BOWDataset 4 | -------------------------------------------------------------------------------- /octis/models/pytorchavitm/datasets/bow.py: -------------------------------------------------------------------------------- 1 | """Class for loading BOW dataset.""" 2 | 3 | import torch 4 | from torch.utils.data import Dataset 5 | 6 | 7 | class BOWDataset(Dataset): 8 | 9 | """Class to load BOW dataset.""" 10 | 11 | def __init__(self, X, idx2token): 12 | """ 13 | Initialize NewsGroupDataset. 14 | 15 | Args 16 | X : array-like, shape=(n_samples, n_features) 17 | Document word matrix. 
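idx2token : mapping from feature indices to vocabulary tokens (editorial addition; this parameter was undocumented in the original docstring).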
18 | """ 19 | self.X = X 20 | self.idx2token = idx2token 21 | 22 | def __len__(self): 23 | """Return length of dataset.""" 24 | return len(self.X) 25 | 26 | def __getitem__(self, i): 27 | """Return sample from dataset at index i.""" 28 | X = torch.FloatTensor(self.X[i]) 29 | 30 | return {'X': X} 31 | -------------------------------------------------------------------------------- /octis/octis.py: -------------------------------------------------------------------------------- 1 | """Main module.""" 2 | -------------------------------------------------------------------------------- /octis/optimization/README.md: -------------------------------------------------------------------------------- 1 | Optimize a model 2 | ---------------- 3 | 4 | To optimize a model, you need to select a dataset, a metric to optimize, and the search space of the hyperparameters. 5 | 6 | ```python 7 | from octis.optimization.optimizer import Optimizer 8 | from skopt.space.space import Real 9 | 10 | search_space = { 11 | "alpha": Real(low=0.001, high=5.0), 12 | "eta": Real(low=0.001, high=5.0) 13 | } 14 | # Initialize an optimizer object and start the optimization. 15 | optimizer = Optimizer() 16 | result = optimizer.optimize(model, dataset, metric, search_space) 17 | ``` 18 | 19 | Plotting functions can be used to visualize the optimization process. To visualize the results, set `plot_best_seen` and `plot_model` to True to save, respectively, the convergence plot of the best seen value and the box plot of the different model runs at each iteration. 20 | 21 | Bayesian Optimization 22 | --------------------- 23 | The `optimize` method is the core function of the Bayesian optimization. 24 | 25 | ```python 26 | 27 | optimize(self, model, dataset, metric, search_space, extra_metrics=None, 28 | number_of_call=5, n_random_starts=1, 29 | initial_point_generator="lhs", 30 | optimization_type='Maximize', model_runs=5, surrogate_model="RF", 31 | kernel=1.0 * Matern(length_scale=1.0, length_scale_bounds=(1e-1, 10.0), nu=1.5), 32 | acq_func="LCB", random_state=False, x0=None, y0=None, 33 | save_models=True, save_step=1, save_name="result", save_path="results/", early_stop=False, 34 | early_step=5, 35 | plot_best_seen=False, plot_model=False, plot_name="B0_plot", log_scale_plot=False, topk=10) 36 | ``` 37 | To learn more, see the [[Code]](https://octis.readthedocs.io/en/latest/modules.html?highlight=optimizer#octis.optimization.optimizer.Optimizer) 38 | 39 | The results of the optimization are saved in a JSON file by default. However, you can also save them in a user-friendly CSV file.
40 | 41 | ```python 42 | 43 | optimization_result.save_to_csv("results.csv") 44 | 45 | ``` 46 | 47 | To learn more, see the [[Code]](https://github.com/MIND-Lab/OCTIS/blob/master/docs/optimization.rst) 48 | -------------------------------------------------------------------------------- /octis/optimization/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MIND-Lab/OCTIS/8f9a0ac5a12e3b72ecd77dcba4f181c77e4595f1/octis/optimization/__init__.py -------------------------------------------------------------------------------- /octis/preprocessing/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MIND-Lab/OCTIS/8f9a0ac5a12e3b72ecd77dcba4f181c77e4595f1/octis/preprocessing/__init__.py -------------------------------------------------------------------------------- /octis/preprocessing/sources/M10.py: -------------------------------------------------------------------------------- 1 | import octis.preprocessing.sources.source_tools as nu 2 | import octis.configuration.citations as citations 3 | 4 | 5 | def retrieve_m10(): 6 | """ 7 | Retrieve the corpus, partition, edges and the labels 8 | 9 | Returns 10 | ------- 11 | result : dictionary with corpus, training and test partitions, 12 | edges and labels of the corpus 13 | """ 14 | path = 'https://raw.githubusercontent.com/shiruipan/TriDNR/master/data/M10/' 15 | 16 | result = nu._retrieve( 17 | path+'docs.txt', 18 | path+'labels.txt', 19 | path+"adjedges.txt") 20 | 21 | result["info"] = { 22 | "name": "CiteSeer-M10", 23 | "link": "https://github.com/shiruipan/TriDNR", 24 | "source": "https://github.com/shiruipan/TriDNR", 25 | "paper": "https://www.ijcai.org/Proceedings/16/Papers/271.pdf", 26 | "citation": citations.sources_dblp_M10 27 | } 28 | 29 | return result 30 | -------------------------------------------------------------------------------- /octis/preprocessing/sources/custom_dataset.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | 4 | def retrieve(corpus_path, labels_path="", edges_path=""): 5 | """ 6 | Retrieve corpus and labels of a custom dataset 7 | given their paths 8 | 9 | Parameters 10 | ---------- 11 | corpus_path : path of the corpus 12 | labels_path : path of the labels document 13 | edges_path : path of the document with adjacent edges (optional) 14 | Returns 15 | ------- 16 | result : dictionary with corpus and 17 | optionally labels and edges of the corpus 18 | """ 19 | result = {} 20 | corpus = [] 21 | labels = [] 22 | with open(corpus_path) as file_input: 23 | for line in file_input: 24 | corpus.append(str(line)) 25 | result["corpus"] = corpus 26 | 27 | if len(labels_path) > 1: 28 | with open(labels_path) as file_input: 29 | for label in file_input: 30 | labels.append(json.loads(label)) 31 | result["doc_labels"] = labels 32 | 33 | if len(edges_path) > 1: 34 | doc_ids = {} 35 | tmp_edges = [] 36 | with open(edges_path) as file_input: 37 | for line in file_input: 38 | neighbours = str(line) 39 | neighbours = neighbours[2:len(neighbours)-3] 40 | doc_ids[neighbours.split()[0]] = True 41 | tmp_edges.append(neighbours) 42 | 43 | edges_list = [] 44 | 45 | for edges in tmp_edges: 46 | tmp_element = "" 47 | for edge in edges.split(): 48 | if edge in doc_ids: 49 | tmp_element = tmp_element + edge + " " 50 | edges_list.append(tmp_element[0:len(tmp_element)-1]) 51 | result["edges"] = edges_list 52 | 53 | return result 54 |
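# --- Editorial usage sketch (not part of the original module) ---
# The file names below are hypothetical placeholders; labels are expected
# one JSON value per line, matching the json.loads call above.
#
#   data = retrieve("my_corpus.txt", labels_path="my_labels.txt")
#   print(len(data["corpus"]), "documents")
#   print(data["doc_labels"][0])   # e.g. ["label_of_first_doc"]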
-------------------------------------------------------------------------------- /octis/preprocessing/sources/dblp.py: -------------------------------------------------------------------------------- 1 | import octis.preprocessing.sources.source_tools as nu 2 | import octis.configuration.citations as citations 3 | 4 | 5 | def retrieve_dblp(): 6 | """ 7 | Retrieve the corpus, partition, edges and the labels 8 | 9 | Returns 10 | ------- 11 | result : dictionary with corpus, training and test partitions, 12 | eges and labels of the corpus 13 | """ 14 | path = 'https://raw.githubusercontent.com/shiruipan/TriDNR/master/data/dblp/' 15 | 16 | result = nu._retrieve( 17 | path+'docs.txt', 18 | path+'labels.txt', 19 | path+"adjedges.txt") 20 | 21 | result["info"] = { 22 | "name": "DBLP-Citation-network V4", 23 | "link": "https://github.com/shiruipan/TriDNR", 24 | "source": "https://github.com/shiruipan/TriDNR", 25 | "paper": "https://www.ijcai.org/Proceedings/16/Papers/271.pdf", 26 | "citation": citations.sources_dblp_M10 27 | } 28 | return result 29 | -------------------------------------------------------------------------------- /octis/preprocessing/sources/newsgroup.py: -------------------------------------------------------------------------------- 1 | import gensim.downloader as gd 2 | from sklearn.datasets import fetch_20newsgroups 3 | 4 | 5 | def retrieve_20newsgroup_gensim(): 6 | """ 7 | Retrieve the corpus and the labels 8 | 9 | Returns 10 | ------- 11 | result : dictionary with corpus, training and test partition 12 | and labels of the corpus 13 | """ 14 | dataset = gd.load("20-newsgroups") 15 | corpus = [] 16 | labels = [] 17 | partition = None 18 | 19 | for data in dataset: 20 | corpus.append(data["data"]) 21 | if data["set"] == "test" and partition is None: 22 | partition = len(corpus) - 1 23 | labels.append([data["topic"]]) 24 | result = dict() 25 | result["partition"] = partition 26 | result["corpus"] = corpus 27 | result["doc_labels"] = labels 28 | result["info"] = { 29 | "name": "20-newsgroups", 30 | "link": "http://qwone.com/~jason/20Newsgroups/20news-18828.tar.gz", 31 | "source": "https://radimrehurek.com/gensim/", 32 | "num_records": 18846, 33 | "description": "The notorious collection of approximately 20,000 newsgroup posts, partitioned (nearly) evenly across 20 different newsgroups.", 34 | "file_name": "20-newsgroups.gz", 35 | "info": "http://qwone.com/~jason/20Newsgroups/" 36 | } 37 | return result 38 | 39 | 40 | def retrieve_20newsgroup_scikit(): 41 | """ 42 | Retrieve the corpus and the labels 43 | 44 | Returns 45 | ------- 46 | result : dictionary with corpus, training and test partition 47 | and labels of the corpus 48 | """ 49 | corpus = [] 50 | labels = [] 51 | 52 | newsgroup = fetch_20newsgroups(remove=('headers', 'footers', 'quotes')) 53 | 54 | cate = newsgroup.target_names 55 | 56 | corpus = corpus + newsgroup.data 57 | 58 | for doc in newsgroup.target: 59 | labels.append([cate[doc]]) 60 | 61 | partition = len(corpus) - 1 62 | 63 | result = dict() 64 | result["partition"] = partition 65 | result["corpus"] = corpus 66 | result["doc_labels"] = labels 67 | result["info"] = { 68 | "name": "20-newsgroups", 69 | "link": "http://qwone.com/~jason/20Newsgroups/20news-18828.tar.gz", 70 | "source": "https://scikit-learn.org/0.19/datasets/twenty_newsgroups.html", 71 | "num_records": 18846, 72 | "description": "The notorious collection of approximately 20,000 newsgroup posts, partitioned (nearly) evenly across 20 different newsgroups.", 73 | "file_name": "20-newsgroups.gz", 74 | 
"info": "http://qwone.com/~jason/20Newsgroups/" 75 | } 76 | 77 | return result 78 | -------------------------------------------------------------------------------- /octis/preprocessing/sources/reuters.py: -------------------------------------------------------------------------------- 1 | from nltk.corpus import reuters 2 | 3 | 4 | def retrieve_reuters(): 5 | """ 6 | Retrieve the corpus and the labels 7 | 8 | Returns 9 | ------- 10 | result : dictionary with corpus, training and test partition 11 | and labels of the corpus 12 | """ 13 | documents = reuters.fileids() 14 | train_corpus = [] 15 | test_corpus = [] 16 | train_labels = [] 17 | test_labels = [] 18 | for document in documents: 19 | doc_partition = document.split("/")[0] 20 | doc = reuters.words(document) 21 | doc_with_spaces = " ".join(doc) 22 | if doc_partition == "training": 23 | train_labels.append(reuters.categories(document)) 24 | train_corpus.append(doc_with_spaces) 25 | else: 26 | test_labels.append(reuters.categories(document)) 27 | test_corpus.append(doc_with_spaces) 28 | result = {} 29 | result["corpus"] = train_corpus + test_corpus 30 | result["partition"] = len(train_corpus) 31 | result["doc_labels"] = train_labels + test_labels 32 | result["info"] = { 33 | "name": "Reuters-21578, Distribution 1.0", 34 | "link": "https://kdd.ics.uci.edu/databases/reuters21578/reuters21578.tar.gz", 35 | "source": "https://www.nltk.org/", 36 | "info": "https://kdd.ics.uci.edu/databases/reuters21578/README.txt" 37 | } 38 | return result 39 | -------------------------------------------------------------------------------- /octis/preprocessing/sources/source_tools.py: -------------------------------------------------------------------------------- 1 | from urllib.request import urlopen 2 | from sklearn.model_selection import train_test_split 3 | import re 4 | 5 | def _retrieve(corpus_path, labels_path, edges_path): 6 | """ 7 | Retrieve M10 or DBLP corpus and labels 8 | given their path 9 | 10 | Parameters 11 | ---------- 12 | corpus_path : path of the corpus 13 | labels_path : path of the labels document 14 | edges_path : path of the adjacent neighbours document 15 | 16 | Returns 17 | ------- 18 | result : dictionary with corpus, training and test partition, 19 | adjacent neighbours and labels of the corpus 20 | """ 21 | corpus = [] 22 | url = urlopen(corpus_path) 23 | for line in url: 24 | corpus.append(str(line)) 25 | 26 | labels = [] 27 | url = urlopen(labels_path) 28 | for line in url: 29 | line = re.sub("[^.0-9\\s]", '', str(line)) 30 | label_list = line.split() 31 | labels.append(label_list[1:len(label_list)]) 32 | 33 | doc_ids = {} 34 | tmp_edges = [] 35 | url = urlopen(edges_path) 36 | for line in url: 37 | neighbours = str(line) 38 | neighbours = neighbours[2:len(neighbours)-3] 39 | doc_ids[neighbours.split()[0]] = True 40 | tmp_edges.append(neighbours) 41 | 42 | edges_list = [] 43 | 44 | for edges in tmp_edges: 45 | tmp_element = "" 46 | for edge in edges.split(): 47 | if edge in doc_ids: 48 | tmp_element = tmp_element + edge + " " 49 | edges_list.append(tmp_element[0:len(tmp_element)-1]) 50 | ''' 51 | train, test = train_test_split(range(len(corpus)), 52 | test_size=0.3, 53 | train_size=0.7, 54 | stratify=labels) 55 | 56 | partitioned_corpus = [] 57 | partitioned_labels = [] 58 | partitioned_edges = [] 59 | 60 | for doc in train: 61 | partitioned_corpus.append(corpus[doc]) 62 | partitioned_labels.append(labels[doc]) 63 | partitioned_edges.append(edges_list[doc]) 64 | 65 | for doc in test: 66 | 
partitioned_corpus.append(corpus[doc]) 67 | partitioned_labels.append(labels[doc]) 68 | partitioned_edges.append(edges_list[doc]) 69 | 70 | result = {} 71 | result["corpus"] = partitioned_corpus 72 | result["edges"] = partitioned_edges 73 | result["partition"] = len(train) 74 | result["doc_labels"] = partitioned_labels 75 | ''' 76 | result = {} 77 | result["corpus"] = corpus 78 | result["edges"] = edges 79 | result["partition"] = len(corpus) 80 | result["doc_labels"] = labels 81 | return result 82 | -------------------------------------------------------------------------------- /octis/preprocessing/sources/wikipedia.py: -------------------------------------------------------------------------------- 1 | import json 2 | from sklearn.model_selection import train_test_split 3 | 4 | 5 | def retrieve_wikipedia(path): 6 | """ 7 | Retrieve the corpus and the labels 8 | 9 | Parameters 10 | ---------- 11 | path : path of the wikipedia dataset 12 | to retrieve 13 | 14 | Returns 15 | ------- 16 | result : dictionary with corpus and 17 | labels of the corpus 18 | """ 19 | corpus = [] 20 | labels = [] 21 | with open(path) as file_input: 22 | for line in file_input: 23 | article = json.loads(line) 24 | corpus.append(article["text"]) 25 | labels.append(article["labels"]) 26 | 27 | train, test = train_test_split(range(len(corpus)), 28 | test_size=0.3, 29 | train_size=0.7, 30 | stratify=labels) 31 | 32 | partitioned_corpus = [] 33 | partitioned_labels = [] 34 | 35 | for doc in train: 36 | partitioned_corpus.append(corpus[doc]) 37 | partitioned_labels.append(labels[doc]) 38 | 39 | for doc in test: 40 | partitioned_corpus.append(corpus[doc]) 41 | partitioned_labels.append(labels[doc]) 42 | 43 | result = {} 44 | result["corpus"] = partitioned_corpus 45 | result["partition"] = len(train) 46 | result["doc_labels"] = partitioned_labels 47 | 48 | return result 49 | -------------------------------------------------------------------------------- /octis/preprocessing/stopwords/english.txt: -------------------------------------------------------------------------------- 1 | a 2 | able 3 | about 4 | above 5 | according 6 | accordingly 7 | across 8 | actually 9 | after 10 | afterwards 11 | again 12 | against 13 | all 14 | allow 15 | allows 16 | almost 17 | alone 18 | along 19 | already 20 | also 21 | although 22 | always 23 | am 24 | among 25 | amongst 26 | an 27 | and 28 | another 29 | any 30 | anybody 31 | anyhow 32 | anyone 33 | anything 34 | anyway 35 | anyways 36 | anywhere 37 | apart 38 | appear 39 | appreciate 40 | appropriate 41 | are 42 | around 43 | as 44 | aside 45 | ask 46 | asking 47 | associated 48 | at 49 | available 50 | away 51 | awfully 52 | b 53 | be 54 | became 55 | because 56 | become 57 | becomes 58 | becoming 59 | been 60 | before 61 | beforehand 62 | behind 63 | being 64 | believe 65 | below 66 | beside 67 | besides 68 | best 69 | better 70 | between 71 | beyond 72 | both 73 | brief 74 | but 75 | by 76 | c 77 | came 78 | can 79 | cannot 80 | cant 81 | cause 82 | causes 83 | certain 84 | certainly 85 | changes 86 | clearly 87 | co 88 | com 89 | come 90 | comes 91 | concerning 92 | consequently 93 | consider 94 | considering 95 | contain 96 | containing 97 | contains 98 | corresponding 99 | could 100 | course 101 | currently 102 | d 103 | definitely 104 | described 105 | despite 106 | did 107 | didn 108 | different 109 | do 110 | does 111 | doesn 112 | doing 113 | don 114 | done 115 | down 116 | downwards 117 | during 118 | e 119 | each 120 | edu 121 | eg 122 | eight 123 | either 124 | else 125 
| elsewhere 126 | enough 127 | entirely 128 | especially 129 | et 130 | etc 131 | even 132 | ever 133 | every 134 | everybody 135 | everyone 136 | everything 137 | everywhere 138 | ex 139 | exactly 140 | example 141 | except 142 | f 143 | far 144 | few 145 | fifth 146 | first 147 | five 148 | followed 149 | following 150 | follows 151 | for 152 | former 153 | formerly 154 | forth 155 | four 156 | from 157 | further 158 | furthermore 159 | g 160 | get 161 | gets 162 | getting 163 | given 164 | gives 165 | go 166 | goes 167 | going 168 | gone 169 | got 170 | gotten 171 | greetings 172 | h 173 | had 174 | happens 175 | hardly 176 | has 177 | have 178 | having 179 | he 180 | hello 181 | help 182 | hence 183 | her 184 | here 185 | hereafter 186 | hereby 187 | herein 188 | hereupon 189 | hers 190 | herself 191 | hi 192 | him 193 | himself 194 | his 195 | hither 196 | hopefully 197 | how 198 | howbeit 199 | however 200 | i 201 | ie 202 | if 203 | ignored 204 | immediate 205 | in 206 | inasmuch 207 | inc 208 | indeed 209 | indicate 210 | indicated 211 | indicates 212 | inner 213 | insofar 214 | instead 215 | into 216 | inward 217 | is 218 | isn 219 | it 220 | its 221 | itself 222 | j 223 | just 224 | k 225 | keep 226 | keeps 227 | kept 228 | know 229 | knows 230 | known 231 | l 232 | last 233 | lately 234 | later 235 | latter 236 | latterly 237 | least 238 | less 239 | lest 240 | let 241 | like 242 | liked 243 | likely 244 | little 245 | look 246 | looking 247 | looks 248 | ltd 249 | m 250 | mainly 251 | many 252 | may 253 | maybe 254 | me 255 | mean 256 | meanwhile 257 | merely 258 | might 259 | more 260 | moreover 261 | most 262 | mostly 263 | much 264 | must 265 | my 266 | myself 267 | n 268 | name 269 | namely 270 | nd 271 | near 272 | nearly 273 | necessary 274 | need 275 | needs 276 | neither 277 | never 278 | nevertheless 279 | new 280 | next 281 | nine 282 | no 283 | nobody 284 | non 285 | none 286 | noone 287 | nor 288 | normally 289 | not 290 | nothing 291 | novel 292 | now 293 | nowhere 294 | o 295 | obviously 296 | of 297 | off 298 | often 299 | oh 300 | ok 301 | okay 302 | old 303 | on 304 | once 305 | one 306 | ones 307 | only 308 | onto 309 | or 310 | other 311 | others 312 | otherwise 313 | ought 314 | our 315 | ours 316 | ourselves 317 | out 318 | outside 319 | over 320 | overall 321 | own 322 | p 323 | particular 324 | particularly 325 | per 326 | perhaps 327 | placed 328 | please 329 | plus 330 | possible 331 | presumably 332 | probably 333 | provides 334 | q 335 | que 336 | quite 337 | qv 338 | r 339 | rather 340 | rd 341 | re 342 | really 343 | reasonably 344 | regarding 345 | regardless 346 | regards 347 | relatively 348 | respectively 349 | right 350 | s 351 | said 352 | same 353 | saw 354 | say 355 | saying 356 | says 357 | second 358 | secondly 359 | see 360 | seeing 361 | seem 362 | seemed 363 | seeming 364 | seems 365 | seen 366 | self 367 | selves 368 | sensible 369 | sent 370 | serious 371 | seriously 372 | seven 373 | several 374 | shall 375 | she 376 | should 377 | since 378 | six 379 | so 380 | some 381 | somebody 382 | somehow 383 | someone 384 | something 385 | sometime 386 | sometimes 387 | somewhat 388 | somewhere 389 | soon 390 | sorry 391 | specified 392 | specify 393 | specifying 394 | still 395 | sub 396 | such 397 | sup 398 | sure 399 | t 400 | take 401 | taken 402 | tell 403 | tends 404 | th 405 | than 406 | thank 407 | thanks 408 | thanx 409 | that 410 | thats 411 | the 412 | their 413 | theirs 414 | them 415 | themselves 416 | then 417 | thence 418 | 
there 419 | thereafter 420 | thereby 421 | therefore 422 | therein 423 | theres 424 | thereupon 425 | these 426 | they 427 | think 428 | third 429 | this 430 | thorough 431 | thoroughly 432 | those 433 | though 434 | three 435 | through 436 | throughout 437 | thru 438 | thus 439 | to 440 | together 441 | too 442 | took 443 | toward 444 | towards 445 | tried 446 | tries 447 | truly 448 | try 449 | trying 450 | twice 451 | two 452 | u 453 | un 454 | under 455 | unfortunately 456 | unless 457 | unlikely 458 | until 459 | unto 460 | up 461 | upon 462 | us 463 | use 464 | used 465 | useful 466 | uses 467 | using 468 | usually 469 | uucp 470 | v 471 | value 472 | various 473 | very 474 | via 475 | viz 476 | vs 477 | w 478 | want 479 | wants 480 | was 481 | way 482 | we 483 | welcome 484 | well 485 | went 486 | were 487 | what 488 | whatever 489 | when 490 | whence 491 | whenever 492 | where 493 | whereafter 494 | whereas 495 | whereby 496 | wherein 497 | whereupon 498 | wherever 499 | whether 500 | which 501 | while 502 | whither 503 | who 504 | whoever 505 | whole 506 | whom 507 | whose 508 | why 509 | will 510 | willing 511 | wish 512 | with 513 | within 514 | without 515 | wonder 516 | would 517 | wouldn 518 | x 519 | y 520 | yes 521 | yet 522 | you 523 | your 524 | yours 525 | yourself 526 | yourselves 527 | z 528 | zero 529 | -------------------------------------------------------------------------------- /preprocessed_datasets/20NewsGroup/metadata.json: -------------------------------------------------------------------------------- 1 | {"total_documents": 16309, "words_document_mean": 48.02, "vocabulary_length": 1612, "last-training-doc": 11415, "last-validation-doc": 13862, "preprocessing-info": "Steps:\n remove_punctuation\n lemmatization\n remove_stopwords\n filter_words\n remove_docs\nParameters:\n removed words with less than 0.005 or more than 1 documents with an occurrence of the word in corpus\n removed documents with less than 5 words", "info": {"name": "20-newsgroups", "link": "http://qwone.com/~jason/20Newsgroups/20news-18828.tar.gz", "source": "https://scikit-learn.org/0.19/datasets/twenty_newsgroups.html", "num_records": 18846, "description": "The notorious collection of approximately 20,000 newsgroup posts, partitioned (nearly) evenly across 20 different newsgroups.", "file_name": "20-newsgroups.gz", "info": "http://qwone.com/~jason/20Newsgroups/"}, "labels": ["rec.autos", "comp.sys.mac.hardware", "comp.graphics", "sci.space", "talk.politics.guns", "sci.med", "comp.sys.ibm.pc.hardware", "rec.motorcycles", "talk.religion.misc", "misc.forsale", "alt.atheism", "sci.electronics", "comp.windows.x", "rec.sport.hockey", "comp.os.ms-windows.misc", "soc.religion.christian", "talk.politics.mideast", "sci.crypt", "rec.sport.baseball", "talk.politics.misc"], "total_labels": 20} -------------------------------------------------------------------------------- /preprocessed_datasets/BBC_News/metadata.json: -------------------------------------------------------------------------------- 1 | {"total_documents": 2225, "words_document_mean": 120.12, "vocabulary_length": 2949, "last-training-doc": 1557, "last-validation-doc": 1891, "preprocessing-info": "Steps:\n remove_punctuation\n lemmatization\n remove_stopwords\n filter_words\n remove_docs\nParameters:\n removed words with less than 0.005 or more than 0.35 documents with an occurrence of the word in corpus\n removed documents with less than 5 words", "info": {"name": "BBC NEWS"}, "labels": ["business", "entertainment", "politics", "sport", 
"tech"], "total_labels": 5} 2 | -------------------------------------------------------------------------------- /preprocessed_datasets/DBLP/metadata.json: -------------------------------------------------------------------------------- 1 | {"total_documents": 54595, "words_document_mean": 5.4, "vocabulary_length": 1513, "last-training-doc": 38215, "last-validation-doc": 46405, "preprocessing-info": "Steps:\n remove_punctuation\n lemmatization\n remove_stopwords\n filter_words\n remove_docs\nParameters:\n removed words with less than 0.0005 or more than 1 documents with an occurrence of the word in corpus\n removed documents with less than 3 words", "info": {"name": "DBLP-Citation-network V4", "link": "https://github.com/shiruipan/TriDNR", "source": "https://github.com/shiruipan/TriDNR", "paper": "https://www.ijcai.org/Proceedings/16/Papers/271.pdf", "citation": "@inproceedings{DBLP:conf/ijcai/PanWZZW16,\n author = {Shirui Pan and\n Jia Wu and\n Xingquan Zhu and\n Chengqi Zhang and\n Yang Wang},\n editor = {Subbarao Kambhampati},\n title = {Tri-Party Deep Network Representation},\n booktitle = {Proceedings of the Twenty-Fifth International Joint Conference on\n Artificial Intelligence, {IJCAI} 2016, New York, NY, USA, 9-15 July\n 2016},\n pages = {1895--1901},\n publisher = {{IJCAI/AAAI} Press},\n year = {2016},\n url = {http://www.ijcai.org/Abstract/16/271},\n timestamp = {Tue, 20 Aug 2019 16:19:21 +0200},\n biburl = {https://dblp.org/rec/conf/ijcai/PanWZZW16.bib},\n bibsource = {dblp computer science bibliography, https://dblp.org}\n}"}, "labels": ["0", "1", "2", "3"], "total_labels": 4} -------------------------------------------------------------------------------- /preprocessed_datasets/DBPedia_IT/metadata.json: -------------------------------------------------------------------------------- 1 | {"total_documents": 5000, "vocabulary_length": 2047, "preprocessing-info": ["lowercase", "remove_punctuation", "lemmatize", "filter words with document frequency lower than 0.002 and higher than 0.5", "filter words with less than 2 character", "filter documents with less than 5 words"], "last-training-doc": 2975, "last-validation-doc": 3613, "info": {"name": "DBPedia IT"}} 2 | -------------------------------------------------------------------------------- /preprocessed_datasets/Europarl_IT/metadata.json: -------------------------------------------------------------------------------- 1 | {"total_documents": 5000, "vocabulary_length": 2000, "preprocessing-info": ["lowercase", "remove_punctuation", "lemmatize", "filter words with document frequency lower than 0.001 and higher than 0.5", "filter words with less than 2 character", "filter documents with less than 5 words"], "last-training-doc": 2530, "last-validation-doc": 3073, "info": {"name": "Europarl IT"}} 2 | -------------------------------------------------------------------------------- /preprocessed_datasets/M10/metadata.json: -------------------------------------------------------------------------------- 1 | {"total_documents": 8355, "words_document_mean": 5.91, "vocabulary_length": 1696, "last-training-doc": 5847, "last-validation-doc": 7101, "preprocessing-info": "Steps:\n remove_punctuation\n lemmatization\n remove_stopwords\n filter_words\n remove_docs\nParameters:\n removed words with less than 0.0005 or more than 1 documents with an occurrence of the word in corpus\n removed documents with less than 3 words", "info": {"name": "CiteSeer-M10", "link": "https://github.com/shiruipan/TriDNR", "source": 
"https://github.com/shiruipan/TriDNR", "paper": "https://www.ijcai.org/Proceedings/16/Papers/271.pdf", "citation": "@inproceedings{DBLP:conf/ijcai/PanWZZW16,\n author = {Shirui Pan and\n Jia Wu and\n Xingquan Zhu and\n Chengqi Zhang and\n Yang Wang},\n editor = {Subbarao Kambhampati},\n title = {Tri-Party Deep Network Representation},\n booktitle = {Proceedings of the Twenty-Fifth International Joint Conference on\n Artificial Intelligence, {IJCAI} 2016, New York, NY, USA, 9-15 July\n 2016},\n pages = {1895--1901},\n publisher = {{IJCAI/AAAI} Press},\n year = {2016},\n url = {http://www.ijcai.org/Abstract/16/271},\n timestamp = {Tue, 20 Aug 2019 16:19:21 +0200},\n biburl = {https://dblp.org/rec/conf/ijcai/PanWZZW16.bib},\n bibsource = {dblp computer science bibliography, https://dblp.org}\n}"}, "labels": ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"], "total_labels": 10} -------------------------------------------------------------------------------- /preprocessed_datasets/README.rst: -------------------------------------------------------------------------------- 1 | Available Datasets 2 | ------------------- 3 | 4 | +--------------+--------------+--------+---------+----------+ 5 | | Name | Source | # Docs | # Words | # Labels | 6 | +==============+==============+========+=========+==========+ 7 | | 20NewsGroup | 20Newsgroup_ | 16309 | 1612 | 20 | 8 | +--------------+--------------+--------+---------+----------+ 9 | | BBC_News | BBC-News_ | 2225 | 2949 | 5 | 10 | +--------------+--------------+--------+---------+----------+ 11 | | DBLP | DBLP_ | 54595 | 1513 | 4 | 12 | +--------------+--------------+--------+---------+----------+ 13 | | M10 | M10_ | 8355 | 1696 | 10 | 14 | +--------------+--------------+--------+---------+----------+ 15 | 16 | .. _20Newsgroup: https://scikit-learn.org/0.19/datasets/twenty_newsgroups.html 17 | .. _BBC-News: https://github.com/MIND-Lab/OCTIS 18 | .. _DBLP: https://dblp.org/rec/conf/ijcai/PanWZZW16.html?view=bibtex 19 | .. _M10: https://dblp.org/rec/conf/ijcai/PanWZZW16.html?view=bibtex 20 | 21 | Load a preprocessed dataset 22 | ---------------------------- 23 | 24 | To load one of the already preprocessed datasets as follows: 25 | 26 | .. code-block:: python 27 | 28 | from octis.dataset.dataset import Dataset 29 | dataset = Dataset() 30 | dataset.fetch_dataset("20NewsGroup") 31 | 32 | Just use one of the dataset names listed above. Note: it is case-sensitive! 33 | 34 | 35 | Load a custom preprocessed dataset 36 | ---------------------------- 37 | 38 | Otherwise, you can load a custom preprocessed dataset in the following way: 39 | 40 | .. code-block:: python 41 | 42 | from octis.dataset.dataset import Dataset 43 | dataset = Dataset() 44 | dataset.load_custom_dataset_from_folder("../path/to/the/dataset/folder") 45 | 46 | Make sure that the dataset is in the following format: 47 | * corpus file: a .tsv file (tab-separated) that contains up to three columns, i.e. the document, the partitition, and the label associated to the document (optional). 48 | * vocabulary: a .txt file where each line represents a word of the vocabulary 49 | 50 | The partition can be "training", "test" or "validation". An example of dataset can be found here: `sample_dataset_`. 
51 | -------------------------------------------------------------------------------- /preprocessed_datasets/sample_dataset/corpus.tsv: -------------------------------------------------------------------------------- 1 | fax modem card sell mail train misc.forsale 2 | run server server install run add train comp.windows.x 3 | live part lead wait important remember judge judge guess close situation listen statement sense regard passage remember letter church people body talk work translation lack concern make sick point throw faith faith catch meaning offer explanation fire cold make aware child eternal train soc.religion.christian 4 | doesn pain deserve die lie rape train talk.religion.misc 5 | sale mile good condition good condition player component speaker mount door car maintain clean good car solid body spot surface spot touch make car problem firm car average cost interested call email train rec.autos 6 | post real disease disease question case active culture reduce hear work mechanism common minor common major evidence val sci.med 7 | execute future criminal activity compare rate black white commit crime black commit crime perfectly fair system black represent note black white crime rate thing economic thing poor people commit crime black poor reduce minority increase minority val talk.politics.misc 8 | citizen identify religious tradition religious category argument make people subject simply faith separate feel argument accept valid reject reason convert turkish rule ago present generation clue actual fact feel draw national feel separate argument carry weight long feel separate national group end debate simply relevant case date muslim category time result sequence decision past decade recognize group resolution declare separate nation eventually muslim choice form comparison national category demonstrate feel minority feel choose result show category choose choose group choose muslim term rest world present context talk group separate national reason historical religion play small role part culture general area simply religious political religious group security people fine point fact religious val talk.politics.mideast 9 | miss part thread switch line level speaker level audio line level single chip audio switch switch channel input channel output function speaker level mind test sci.electronics 10 | regularly schedule news reading bring message department statistic pass final individual player stat post day filter average game goal assist point penalty minute average pro play league season feel write real code odd list pass list time player small percentage difference category close game goal assist point leave average player pass list include top fly title kind test rec.sport.hockey 11 | people silly team win world expect put good team baseball series bad team baseball bad team win game time win series odd test rec.sport.baseball 12 | -------------------------------------------------------------------------------- /preprocessed_datasets/sample_dataset/metadata.json: -------------------------------------------------------------------------------- 1 | {"total_documents": 11, "vocabulary_length": 230, "preprocessing-info": [], "labels": ["talk.religion.misc", "talk.politics.misc", "sci.med", "comp.windows.x", "soc.religion.christian", "sci.electronics", "misc.forsale", "rec.autos", "rec.sport.baseball", "talk.politics.mideast", "rec.sport.hockey"], "total_labels": 11, "last-training-doc": 5, "last-validation-doc": 8} 2 | 
-------------------------------------------------------------------------------- /preprocessed_datasets/sample_dataset/vocabulary.txt: -------------------------------------------------------------------------------- 1 | penalty 2 | present 3 | baseball 4 | fact 5 | cost 6 | pass 7 | surface 8 | add 9 | area 10 | audio 11 | goal 12 | offer 13 | convert 14 | pro 15 | write 16 | meaning 17 | input 18 | difference 19 | fire 20 | simply 21 | comparison 22 | bad 23 | doesn 24 | call 25 | thing 26 | mount 27 | choose 28 | silly 29 | reject 30 | letter 31 | pain 32 | accept 33 | kind 34 | crime 35 | game 36 | talk 37 | average 38 | future 39 | card 40 | political 41 | reduce 42 | clue 43 | minor 44 | criminal 45 | identify 46 | declare 47 | valid 48 | schedule 49 | touch 50 | faith 51 | activity 52 | thread 53 | assist 54 | miss 55 | reason 56 | fair 57 | actual 58 | deserve 59 | draw 60 | weight 61 | fly 62 | lack 63 | religion 64 | minute 65 | rate 66 | bring 67 | end 68 | mail 69 | maintain 70 | win 71 | nation 72 | context 73 | cold 74 | increase 75 | date 76 | firm 77 | muslim 78 | stat 79 | guess 80 | output 81 | turkish 82 | culture 83 | translation 84 | result 85 | mechanism 86 | relevant 87 | list 88 | choice 89 | problem 90 | eternal 91 | point 92 | ago 93 | general 94 | religious 95 | evidence 96 | odd 97 | news 98 | child 99 | make 100 | body 101 | perfectly 102 | argument 103 | term 104 | door 105 | historical 106 | security 107 | function 108 | put 109 | throw 110 | mile 111 | generation 112 | represent 113 | group 114 | separate 115 | fax 116 | rape 117 | minority 118 | subject 119 | small 120 | important 121 | people 122 | case 123 | post 124 | condition 125 | common 126 | season 127 | lead 128 | regard 129 | car 130 | poor 131 | statistic 132 | form 133 | note 134 | league 135 | carry 136 | active 137 | demonstrate 138 | judge 139 | close 140 | code 141 | sale 142 | individual 143 | show 144 | email 145 | commit 146 | concern 147 | top 148 | regularly 149 | mind 150 | team 151 | speaker 152 | component 153 | long 154 | disease 155 | tradition 156 | series 157 | sequence 158 | question 159 | recognize 160 | sick 161 | decision 162 | sell 163 | passage 164 | fine 165 | hear 166 | reading 167 | time 168 | situation 169 | past 170 | title 171 | day 172 | player 173 | modem 174 | citizen 175 | channel 176 | part 177 | church 178 | single 179 | expect 180 | aware 181 | level 182 | department 183 | install 184 | live 185 | work 186 | role 187 | execute 188 | catch 189 | leave 190 | listen 191 | eventually 192 | wait 193 | rule 194 | spot 195 | major 196 | black 197 | world 198 | filter 199 | resolution 200 | statement 201 | server 202 | die 203 | real 204 | category 205 | switch 206 | percentage 207 | run 208 | compare 209 | national 210 | clean 211 | rest 212 | sense 213 | solid 214 | white 215 | play 216 | line 217 | system 218 | explanation 219 | feel 220 | message 221 | chip 222 | debate 223 | final 224 | remember 225 | lie 226 | economic 227 | decade 228 | include 229 | interested 230 | good'] 231 | -------------------------------------------------------------------------------- /preprocessed_datasets/sample_texts/unprepr_docs.txt: -------------------------------------------------------------------------------- 1 | Bob Dylan (born Robert Allen Zimmerman; May 24, 1941) is an American singer-songwriter, author and visual artist. Often regarded as one of the greatest songwriters of all time,[3] Dylan has been a major figure in popular culture for more than 50 years. 
2 | Toni Piispanen (born 24 July 1976) is a Paralympic athlete for Finland. He started as an able-bodied karate competitor and became disabled due to an accident that injured his spinal cord at a karate show in 1993. This accident occurred in Lahti in front of hundreds of spectators. 3 | Puerto Cortés Airstrip is the military dirt airstrip of Puerto Cortés Naval Base, located on the East side of Isla Santa Margarita, on the coast of Magdalena Bay, in the Mexican state of Baja California Sur. Isla Margarita is a Pacific Ocean island, located to the West of the Baja California Peninsula, and belongs politically to the municipality of Comondú. Puerto Cortés Naval Base is handled by the Mexican Navy and is the center of a Military Naval Sector that depends from the 2nd Military Naval Region in Ensenada, BC. The airstrip is used solely for military aviation purposes. 4 | Nagao clan 5 | William Addison Adamson (27 May 1858 – 10 June 1924) was an Australian politician. Adamson was born in South Yarra to seedsman William Adamson and Isabella Bruce. He attended Brighton Grammar School and became a grain merchant. On 9 June 1880 he married Lucy Jacks 6 | Born in Madrid, Rubio finished his formation with Rayo Vallecano. He made his senior debut with the reserves on 29 October 2017, playing the last seven minutes in a 1–0 Tercera División home win against CF San Agustín del Guadalix.[1] 7 | East London is a popularly and informally defined part of London, capital of the United Kingdom. By most definitions, it is east of the ancient City of London and north of the River Thames. It broadly comprises the London boroughs of Barking and Dagenham, Hackney, Havering, Newham, Redbridge, Tower Hamlets and Waltham Forest. This understanding accords closely, but not exactly, with the interpretation of the area consisting of the former Tower Division, and London east of the Lea. The East End of London is a subset of East London, consisting of areas close to the ancient City of London. The Eastern (E) Postal District is a different subset of East London; and there is also an "East" sub-region used in the London Plan for planning policy reporting purposes. The most recent (2011) iteration includes seven boroughs north of the Thames, with the addition of three boroughs south of the river. 8 | Kiełków [ˈkʲɛu̯kuf] is a village in the administrative district of Gmina Przecław, within Mielec County, Subcarpathian Voivodeship, in south-eastern Poland.[1] It lies approximately 5 kilometres (3 mi) north of Przecław, 7 km (4 mi) south-east of Mielec, and 45 km (28 mi) north-west of the regional capital Rzeszów. 9 | Bretenoux is a commune in the Lot department in southwestern France. 
10 | Jan Leopold Tyranowski (9 February 1901 – 15 March 1947) was a Polish Roman Catholic.[1] He was an ardent admirer and follower of the Discalced Carmelite charism – but was not of their order – and was a central figure in the spiritual formation of Karol Józef Wojtyła who became Pope John Paul II.[2] He was both the leader and student mentor of his friend's college parish of Saint Stanisław Kostka in the 1940s as well as a small group he ran on the behalf of the Salesians of Don Bosco during the wartime period.[3][4] 11 | Kafka 12 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | gensim>=4.2.0,<5.0 2 | nltk 3 | pandas 4 | spacy 5 | scikit-learn==1.1.0 6 | scikit-optimize>=0.8.1 7 | matplotlib 8 | torch 9 | numpy>=1.23.0,<2.0 10 | libsvm 11 | flask 12 | sentence_transformers 13 | requests 14 | tomotopy 15 | scipy<1.13 -------------------------------------------------------------------------------- /requirements_dev.txt: -------------------------------------------------------------------------------- 1 | pip==21.1 2 | bump2version==0.5.11 3 | wheel==0.38.1 4 | watchdog==0.9.0 5 | flake8==3.7.8 6 | tox==3.14.0 7 | coverage==4.5.4 8 | Sphinx==1.8.5 9 | twine==1.14.0 10 | Click==7.0 11 | pytest==7.2.0 12 | pytest-runner==5.1 -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [bumpversion] 2 | current_version = 1.14.0 3 | commit = True 4 | tag = True 5 | 6 | [bumpversion:file:setup.py] 7 | search = version='{current_version}' 8 | replace = version='{new_version}' 9 | 10 | [bumpversion:file:octis/__init__.py] 11 | search = __version__ = '{current_version}' 12 | replace = __version__ = '{new_version}' 13 | 14 | [bdist_wheel] 15 | universal = 1 16 | 17 | [flake8] 18 | exclude = docs 19 | 20 | [aliases] 21 | test = pytest 22 | 23 | [tool:pytest] 24 | addopts = --ignore=setup.py 25 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """The setup script.""" 4 | 5 | from setuptools import setup, find_packages 6 | 7 | with open('README.rst') as readme_file: 8 | readme = readme_file.read() 9 | 10 | with open('HISTORY.rst') as history_file: 11 | history = history_file.read() 12 | 13 | with open('requirements.txt') as f: 14 | requirements = f.read().splitlines() 15 | 16 | setup_requirements = ['pytest-runner', ] 17 | 18 | test_requirements = ['pytest>=3', ] 19 | 20 | setup( 21 | author="Silvia Terragni", 22 | author_email='s.terragni4@campus.unimib.it', 23 | python_requires='>=3.7', 24 | classifiers=[ 25 | 'Development Status :: 2 - Pre-Alpha', 26 | 'Intended Audience :: Developers', 27 | 'License :: OSI Approved :: MIT License', 28 | 'Natural Language :: English', 29 | 'Programming Language :: Python :: 3', 30 | 'Programming Language :: Python :: 3.7', 31 | 'Programming Language :: Python :: 3.8', 32 | 'Programming Language :: Python :: 3.9', 33 | ], 34 | description="OCTIS: a library for Optimizing and Comparing Topic Models.", 35 | entry_points={ 36 | 'console_scripts': [ 37 | 'octis=octis.cli:main', 38 | ], 39 | }, 40 | install_requires=requirements, 41 | license="MIT license", 42 | long_description=readme, 43 | include_package_data=True, 44 | keywords='octis', 45 | name='octis', 46 | 
packages=find_packages(include=['octis', 'octis.*']), 47 | setup_requires=setup_requirements, 48 | test_suite='tests', 49 | tests_require=test_requirements, 50 | url='https://github.com/MIND-LAB/OCTIS', 51 | version='1.14.0', 52 | zip_safe=False, 53 | ) 54 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | """Unit test package for octis.""" 2 | -------------------------------------------------------------------------------- /tests/test_datasets.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """Tests for `octis` package.""" 4 | 5 | import pytest 6 | 7 | from click.testing import CliRunner 8 | from octis.evaluation_metrics.classification_metrics import F1Score 9 | 10 | from octis.evaluation_metrics.coherence_metrics import * 11 | from octis.dataset.dataset import Dataset 12 | 13 | import os 14 | from octis.preprocessing.preprocessing import Preprocessing 15 | 16 | from octis.dataset.downloader import get_data_home, _pkl_filepath 17 | 18 | @pytest.fixture 19 | def root_dir(): 20 | return os.path.dirname(os.path.abspath(__file__)) 21 | 22 | 23 | @pytest.fixture 24 | def data_dir(root_dir): 25 | return root_dir + "/../preprocessed_datasets/" 26 | 27 | 28 | def test_preprocessing_custom_stops(data_dir): 29 | texts_path = data_dir+"/sample_texts/unprepr_docs.txt" 30 | p = Preprocessing(vocabulary=None, max_features=None, remove_punctuation=True, punctuation=".,?:", 31 | lemmatize=False, stopword_list=['am', 'are', 'this', 'that'], 32 | min_chars=2, min_words_docs=5,min_df=0.0001) 33 | dataset = p.preprocess_dataset( 34 | documents_path=texts_path, 35 | ) 36 | 37 | dataset.save(data_dir+"/sample_texts/") 38 | dataset.load_custom_dataset_from_folder(data_dir + "/sample_texts") 39 | 40 | 41 | def test_preprocessing_english_stops_split(data_dir): 42 | texts_path = data_dir+"/sample_texts/unprepr_docs.txt" 43 | p = Preprocessing(vocabulary=None, max_features=None, remove_punctuation=True, 44 | lemmatize=False, stopword_list='english', split=False, 45 | min_chars=2, min_words_docs=1) 46 | dataset = p.preprocess_dataset( 47 | documents_path=texts_path, 48 | ) 49 | 50 | dataset.save(data_dir+"/sample_texts/") 51 | dataset.load_custom_dataset_from_folder(data_dir + "/sample_texts") 52 | 53 | 54 | def test_preprocessing_multiprocess(data_dir): 55 | texts_path = data_dir+"/sample_texts/unprepr_docs.txt" 56 | p = Preprocessing(vocabulary=None, max_features=None, remove_punctuation=True, 57 | lemmatize=False, num_processes=10, split=False, 58 | min_chars=2, min_words_docs=1) 59 | dataset = p.preprocess_dataset( 60 | documents_path=texts_path, 61 | ) 62 | 63 | dataset.save(data_dir+"/sample_texts/") 64 | dataset.load_custom_dataset_from_folder(data_dir + "/sample_texts") 65 | 66 | 67 | def test_load_20ng(): 68 | data_home = get_data_home(data_home=None) 69 | cache_path = _pkl_filepath(data_home, "20NewsGroup" + ".pkz") 70 | if os.path.exists(cache_path): 71 | os.remove(cache_path) 72 | 73 | dataset = Dataset() 74 | dataset.fetch_dataset("20NewsGroup") 75 | assert len(dataset.get_corpus()) == 16309 76 | assert len(dataset.get_labels()) == 16309 77 | assert os.path.exists(cache_path) 78 | 79 | dataset = Dataset() 80 | dataset.fetch_dataset("20NewsGroup") 81 | assert len(dataset.get_corpus()) == 16309 82 | 83 | 84 | def test_load_M10(): 85 | dataset = Dataset() 86 | dataset.fetch_dataset("M10") 87 | assert 
len(set(dataset.get_labels())) == 10 88 | 89 | 90 | def test_partitions_fetch(): 91 | dataset = Dataset() 92 | dataset.fetch_dataset("M10") 93 | partitions = dataset.get_partitioned_corpus() 94 | assert len(partitions[0]) == 5847 95 | assert len(partitions[1]) == 1254 96 | 97 | 98 | def test_partitions_custom(data_dir): 99 | dataset = Dataset() 100 | dataset.load_custom_dataset_from_folder(data_dir+"M10") 101 | partitions = dataset.get_partitioned_corpus() 102 | assert len(partitions[0]) == 5847 103 | assert len(partitions[1]) == 1254 104 | 105 | 106 | def test_fetch_encoding(): 107 | dataset = Dataset() 108 | dataset.fetch_dataset('DBPedia_IT') 109 | -------------------------------------------------------------------------------- /tests/test_evaluation_metrics.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """Tests for `octis` package.""" 4 | 5 | import pytest 6 | 7 | from click.testing import CliRunner 8 | from octis.evaluation_metrics.topic_significance_metrics import * 9 | from octis.evaluation_metrics.classification_metrics import F1Score, PrecisionScore 10 | from octis.evaluation_metrics.classification_metrics import AccuracyScore, RecallScore 11 | from octis.evaluation_metrics.diversity_metrics import TopicDiversity, InvertedRBO, KLDivergence, LogOddsRatio, \ 12 | WordEmbeddingsInvertedRBO 13 | from octis.evaluation_metrics.similarity_metrics import WordEmbeddingsRBOMatch, PairwiseJaccardSimilarity, RBO, \ 14 | WordEmbeddingsCentroidSimilarity, WordEmbeddingsPairwiseSimilarity 15 | 16 | from octis.evaluation_metrics.coherence_metrics import * 17 | from octis.dataset.dataset import Dataset 18 | from octis.models.LDA import LDA 19 | 20 | import os 21 | 22 | 23 | @pytest.fixture 24 | def root_dir(): 25 | return os.path.dirname(os.path.abspath(__file__)) 26 | 27 | 28 | @pytest.fixture 29 | def dataset(root_dir): 30 | dataset = Dataset() 31 | dataset.load_custom_dataset_from_folder(root_dir + "/../preprocessed_datasets/" + '/M10') 32 | return dataset 33 | 34 | 35 | @pytest.fixture 36 | def model_output(dataset): 37 | model = LDA(num_topics=3, iterations=5) 38 | output = model.train_model(dataset) 39 | return output 40 | 41 | 42 | def test_f1score(dataset, model_output): 43 | metric = F1Score(dataset=dataset) 44 | score = metric.score(model_output) 45 | assert type(score) == np.float64 or type(score) == float 46 | 47 | 48 | def test_accuracyscore(dataset, model_output): 49 | metric = AccuracyScore(dataset=dataset) 50 | score = metric.score(model_output) 51 | assert type(score) == np.float64 or type(score) == float 52 | 53 | 54 | def test_precisionscore(dataset, model_output): 55 | metric = PrecisionScore(dataset=dataset) 56 | score = metric.score(model_output) 57 | assert type(score) == np.float64 or type(score) == float 58 | 59 | 60 | def test_recallscore(dataset, model_output): 61 | metric = RecallScore(dataset=dataset) 62 | score = metric.score(model_output) 63 | assert type(score) == np.float64 or type(score) == float 64 | 65 | 66 | def test_svm_persistency(dataset, model_output): 67 | metric = F1Score(dataset=dataset) 68 | metric.score(model_output) 69 | metric = AccuracyScore(dataset=dataset) 70 | metric.score(model_output) 71 | assert metric.same_svm 72 | metric = F1Score(dataset=dataset, average="macro") 73 | metric.score(model_output) 74 | assert not metric.same_svm 75 | 76 | 77 | def test_npmi_coherence_measures(dataset, model_output): 78 | metric = Coherence(topk=10, texts=dataset.get_corpus()) 79 | score = 
metric.score(model_output) 80 | assert type(score) == np.float64 or type(score) == float 81 | assert -1 <= score <= 1 82 | 83 | 84 | def test_we_coherence_measures(dataset, model_output): 85 | metric = WECoherenceCentroid(topk=5) 86 | score = metric.score(model_output) 87 | assert type(score) == np.float64 or type(score) == np.float32 or type(score) == float 88 | assert -1 <= score <= 1 89 | 90 | metric = WECoherencePairwise(topk=10) 91 | score = metric.score(model_output) 92 | assert type(score) == np.float64 or type(score) == np.float32 or type(score) == float 93 | assert -1 <= score <= 1 94 | 95 | 96 | def test_we_coherence_measures_oov(dataset): 97 | model_output = {'topics': 98 | [['dsa', 'dsadgfd', '11111', '22222', 'bbbbbbbb'], 99 | ['aaaaa', 'bbb', 'cc', 'd', 'EEE']]} 100 | metric = WECoherenceCentroid(topk=5) 101 | score = metric.score(model_output) 102 | assert type(score) == np.float64 or type(score) == np.float32 or type(score) == float 103 | assert -1 <= score <= 1 104 | print(score) 105 | 106 | metric = WECoherencePairwise(topk=10) 107 | score = metric.score(model_output) 108 | assert type(score) == np.float64 or type(score) == np.float32 or type(score) == float 109 | assert -1 <= score <= 1 110 | print(score) 111 | 112 | 113 | def test_diversity_measures(dataset, model_output): 114 | metric = TopicDiversity(topk=10) 115 | score = metric.score(model_output) 116 | assert type(score) == np.float64 or type(score) == float 117 | assert 0 <= score <= 1 118 | 119 | metric = KLDivergence() 120 | score = metric.score(model_output) 121 | assert type(score) == np.float64 or type(score) == float 122 | assert 0 <= score <= 1 123 | 124 | metric = LogOddsRatio() 125 | score = metric.score(model_output) 126 | assert type(score) == np.float64 or type(score) == float 127 | assert 0 <= score <= 1 128 | 129 | metric = WordEmbeddingsInvertedRBO(normalize=True) 130 | score = metric.score(model_output) 131 | assert type(score) == np.float64 or type(score) == float 132 | assert 0 <= score <= 1 133 | 134 | 135 | def test_similarity_measures(dataset, model_output): 136 | metric = RBO(topk=10) 137 | score = metric.score(model_output) 138 | assert type(score) == np.float64 or type(score) == float 139 | assert 0 <= score <= 1 140 | 141 | metric = WordEmbeddingsRBOMatch(topk=10, normalize=True) 142 | score = metric.score(model_output) 143 | assert type(score) == np.float64 or type(score) == float 144 | assert 0 <= score <= 1 145 | 146 | metric = PairwiseJaccardSimilarity(topk=10) 147 | score = metric.score(model_output) 148 | assert type(score) == np.float64 or type(score) == float 149 | assert 0 <= score <= 1 150 | 151 | metric = WordEmbeddingsCentroidSimilarity(topk=10) 152 | score = metric.score(model_output) 153 | assert type(score) == np.float64 or type(score) == float 154 | assert 0 <= score <= 1 155 | 156 | metric = WordEmbeddingsPairwiseSimilarity(topk=10) 157 | score = metric.score(model_output) 158 | assert type(score) == np.float64 or type(score) == float 159 | assert 0 <= score <= 1 160 | 161 | 162 | def test_irbo(dataset, model_output): 163 | metric = InvertedRBO(topk=10) 164 | score = metric.score(model_output) 165 | assert type(score) == np.float64 or type(score) == float 166 | assert 0 <= score <= 1 167 | 168 | 169 | def test_kl_b(dataset, model_output): 170 | metric = KL_background() 171 | score = metric.score(model_output) 172 | assert type(score) == np.float64 or type(score) == float 173 | assert score >= 0 174 | 175 | 176 | def test_kl_v(dataset, model_output): 177 | metric = 
KL_vacuous() 178 | score = metric.score(model_output) 179 | assert type(score) == np.float64 or type(score) == float 180 | assert score >= 0 181 | 182 | 183 | def test_kl_u(dataset, model_output): 184 | metric = KL_uniform() 185 | score = metric.score(model_output) 186 | assert type(score) == np.float64 or type(score) == float 187 | assert score >= 0 188 | -------------------------------------------------------------------------------- /trained_embeddings/test_example/example.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MIND-Lab/OCTIS/8f9a0ac5a12e3b72ecd77dcba4f181c77e4595f1/trained_embeddings/test_example/example.bin -------------------------------------------------------------------------------- /trained_embeddings/test_example/example.keyedvectors: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MIND-Lab/OCTIS/8f9a0ac5a12e3b72ecd77dcba4f181c77e4595f1/trained_embeddings/test_example/example.keyedvectors -------------------------------------------------------------------------------- /trained_embeddings/test_example/example.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MIND-Lab/OCTIS/8f9a0ac5a12e3b72ecd77dcba4f181c77e4595f1/trained_embeddings/test_example/example.pickle --------------------------------------------------------------------------------