├── .github └── workflows │ └── python-publish.yml ├── .gitignore ├── README.md ├── models ├── BTM.py ├── CTM.py ├── LDA.py ├── NMF.py ├── contextualized_topic_models │ ├── LICENSE │ ├── __init__.py │ ├── contextualized_topic_models.py │ ├── datasets │ │ ├── __init__.py │ │ └── dataset.py │ ├── models │ │ ├── __init__.py │ │ └── ctm.py │ ├── networks │ │ ├── __init__.py │ │ ├── decoding_network.py │ │ └── inference_network.py │ └── utils │ │ ├── __init__.py │ │ ├── data_preparation.py │ │ └── preprocessing.py ├── model.py └── pytorchtools.py ├── requirements.txt ├── setup.cfg ├── setup.py ├── stop_words ├── stop_words.txt └── swear_words.txt └── tools ├── Dataset.py ├── create_dataset.py └── scraper.py /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | # This workflow will upload a Python Package using Twine when a release is created 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries 3 | 4 | # This workflow uses actions that are not certified by GitHub. 5 | # They are provided by a third-party and are governed by 6 | # separate terms of service, privacy policy, and support 7 | # documentation. 8 | 9 | name: Upload Python Package 10 | 11 | on: 12 | release: 13 | types: [published] 14 | 15 | permissions: 16 | contents: read 17 | 18 | jobs: 19 | deploy: 20 | 21 | runs-on: ubuntu-latest 22 | 23 | steps: 24 | - uses: actions/checkout@v3 25 | - name: Set up Python 26 | uses: actions/setup-python@v3 27 | with: 28 | python-version: '3.x' 29 | - name: Install dependencies 30 | run: | 31 | python -m pip install --upgrade pip 32 | pip install build 33 | - name: Build package 34 | run: python -m build 35 | - name: Publish package 36 | uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29 37 | with: 38 | user: __token__ 39 | password: ${{ secrets.PYPI_API_TOKEN }} 40 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Topic Modeling Tool for Persian Short Texts 2 | 3 | The tool for topic modeling provided by the **[Data Science Innovation Center](http://dslab.aut.ac.ir/fa/)** extracts topics from digitized **Persian texts** and compares their performance in short texts using a variety of topic modeling techniques. 4 | 5 | Visit the **[website](http://dslab.aut.ac.ir/fa/products/%d9%be%d8%b1%d8%af%d8%a7%d8%b2%d8%b4-%d9%85%d8%aa%d9%86-%d9%88-%d8%b2%d8%a8%d8%a7%d9%86-%d8%b7%d8%a8%db%8c%d8%b9%db%8c/%d8%a7%d8%a8%d8%b2%d8%a7%d8%b1-%d8%af%d8%b3%d8%aa%d9%87-%d8%a8%d9%86%d8%af%db%8c-%d9%85%d9%88%d8%b6%d9%88%d8%b9%db%8c/)** to view the description in Persian. 6 | 7 | ## Installation 8 | We recommend **Python 3.6** or higher, **[gensim 4.2](https://radimrehurek.com/gensim/)** or higher. 
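You can quickly verify that your environment meets these requirements before installing. The snippet below is a minimal sketch and only assumes that `gensim` is already installed:

````python
import sys
import gensim

# The toolkit targets Python 3.6+ and gensim 4.2+
print(sys.version_info)      # expect major=3, minor>=6
print(gensim.__version__)    # expect '4.2' or newer
````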
9 | 10 | **Install from sources** 11 | 12 | You can clone the latest version from the repository and install it directly from the source code: 13 | 14 | ``` 15 | git clone https://github.com/DSInCenter/topicmodel.git 16 | cd topicmodel 17 | pip install -r requirements.txt 18 | ``` 19 | 20 | ## Getting Started 21 | To get started, you can try the demo of the GSDMM algorithm at this link: 22 | - [GSDMM demo](https://dsic-topic-model.herokuapp.com/) 23 | 24 | These examples demonstrate how to clone and execute a model on Google Colab: 25 | - [Run NMF model on Google colab](https://colab.research.google.com/drive/1l7Fs6yYrbIy9fXyTBflMXGaVQjh10RPn?usp=sharing) 26 | - [Run LDA model on Google colab](https://colab.research.google.com/drive/1yhNeh6J177fSQxEZE7OTLJMWtvff7LDA?usp=sharing) 27 | 28 | **LDA demonstration**: 29 | 30 | First, import the Dataset class from Dataset.py and the LDA model from LDA.py: 31 | ````python 32 | from tools.Dataset import Dataset 33 | from LDA import LDA 34 | ```` 35 | 36 | Create objects from the Dataset and LDA classes and train the model: 37 | ````python 38 | lda = LDA(num_topics=11, iterations=5) 39 | dataset = Dataset('Dataset', 'utf-8') 40 | lda_result = lda.train_model(dataset, hyperparams=None, top_words=10) 41 | print(lda_result) 42 | ```` 43 | 44 | ## Citing & Authors 45 | If you find this repository helpful, feel free to cite this work: 46 | 47 | ```bibtex 48 | @article{karimi2023comparative, 49 | title={Comparative Analysis of Topic Modeling Algorithms for Short Texts in Persian Tweets}, 50 | author={Karimi, Amir Hossein and Akbari, Masoud and Akbari, Mohammad}, 51 | year={2023} 52 | } 53 | ``` 54 | 55 | Don't hesitate to send us an e-mail or open an issue if something is broken (and it shouldn't be) or if you have further questions. 56 | 57 | -------------------------------------------------------------------------------- /models/BTM.py: -------------------------------------------------------------------------------- 1 | from sklearn.metrics import normalized_mutual_info_score 2 | from models.model import AbstractModel 3 | from tools.Dataset import Dataset 4 | import numpy as np 5 | import pandas as pd 6 | import bitermplus as btm 7 | import argparse 8 | import pickle 9 | 10 | 11 | 12 | class BTM(AbstractModel): 13 | ''' 14 | Bitermplus implements the Biterm Topic Model for short texts 15 | introduced by Xiaohui Yan, Jiafeng Guo, Yanyan Lan, and Xueqi Cheng. 16 | It is a cythonized implementation of the original BTM. 17 | 18 | Source code: https://github.com/maximtrp/bitermplus 19 | 20 | ''' 21 | 22 | def __init__(self, 23 | num_topics: int = 10, 24 | iterations: int = 20, 25 | num_top_words: int = 20, 26 | alpha: float = 1, 27 | beta: float = 0.01, 28 | seed: int = 123 29 | ): 30 | """ 31 | Initialization of BTM 32 | 33 | :param num_topics : int, Number of topics. 34 | :param iterations : int, Number of iterations for the model fitting process. 35 | :param num_top_words : int, Number of top words for coherence calculation. 36 | :param alpha : float, Dirichlet prior on the topic distribution. 37 | :param beta : float, Dirichlet prior on the topic-word distribution. 38 | :param seed : int, Random state seed.
39 | 40 | see https://bitermplus.readthedocs.io/en/latest/bitermplus.html 41 | """ 42 | super().__init__() 43 | self.hyperparameters['num_topics'] = num_topics 44 | self.hyperparameters['iterations'] = iterations 45 | self.hyperparameters['num_top_words'] = num_top_words 46 | self.hyperparameters['alpha'] = alpha 47 | self.hyperparameters['beta'] = beta 48 | self.hyperparameters['seed'] = seed 49 | self.model = None 50 | self.vocab = None 51 | 52 | 53 | def hyperparameters_info(self): 54 | """ 55 | Returns hyperparameters informations 56 | """ 57 | return self.hyperparameters 58 | 59 | 60 | def set_hyperparameters(self, **kwargs): 61 | """ 62 | Set model hyperparameters 63 | """ 64 | super().set_hyperparameters(**kwargs) 65 | 66 | 67 | def train_model(self, dataset, hyperparameters=None, top_words=10): 68 | ''' 69 | Train the model 70 | 71 | :param dataset: Dataset 72 | :param hyperparameters: dictionary in the form {hyperparameter name: value} 73 | :param top_words: number of top significant words for each topic (default: 10) 74 | ''' 75 | 76 | if hyperparameters is None: 77 | hyperparameters = {} 78 | self.hyperparameters.update(hyperparameters) 79 | 80 | ######################### Need Dataset.texts 81 | 82 | # Obtaining terms frequency in a sparse matrix and corpus vocabulary 83 | X, vocabulary, vocab_dict = btm.get_words_freqs() 84 | 85 | # Vectorizing documents 86 | docs_vec = btm.get_vectorized_docs(Dataset.texts, vocabulary) 87 | # Generating biterms 88 | biterms = btm.get_biterms(docs_vec) 89 | 90 | # Initializing and running model 91 | model = btm.BTM(X, vocabulary, seed=12321, T=11, M=10, alpha=50/8, beta=0.01) 92 | model.fit(biterms, iterations=20) 93 | 94 | #Now, we will calculate documents vs topics probability matrix (make an inference). 95 | p_zd = model.transform(docs_vec) 96 | 97 | self.model = model 98 | 99 | 100 | def _select_words(self, topic_id: int, words_num): 101 | probs = self.model.matrix_topics_words_[topic_id, :] 102 | idx = np.argsort(probs)[:-words_num-1:-1] 103 | result = pd.Series(self.model.vocabulary_[idx]) 104 | result.name = 'topic{}'.format(topic_id) 105 | return result 106 | 107 | 108 | def _get_topics_words(self, words_num=20): 109 | topics_num = self.model.topics_num_ 110 | topics_idx = np.arange(topics_num) 111 | top_words_btm = pd.concat(map(lambda x: self._select_words(x, words_num), topics_idx), axis=1) 112 | return top_words_btm 113 | 114 | 115 | 116 | 117 | # def __save_pickle(file, path): 118 | # with open(path, 'wb') as handle: 119 | # pickle.dump(file, handle, protocol=pickle.HIGHEST_PROTOCOL) 120 | 121 | 122 | # def __get_data(path:str, encoding:str) -> pd.DataFrame : 123 | # return pd.read_csv(path, encoding=encoding) 124 | 125 | # def __run_btm(corpus, labels, seed, num_of_topics, iterations): 126 | # print('preparing data...') 127 | # X, vocabulary, vocab_dict = btm.get_words_freqs(corpus) 128 | # tf = np.array(X.sum(axis=0)).ravel() 129 | 130 | # # Vectorizing documents 131 | # docs_vec = btm.get_vectorized_docs(texts, vocabulary) 132 | # docs_lens = list(map(len, docs_vec)) 133 | # # Generating biterms 134 | # biterms = btm.get_biterms(docs_vec) 135 | 136 | # print('running model...') 137 | # # INITIALIZING AND RUNNING MODEL 138 | # model = btm.BTM(X, vocabulary, seed=12321, T=num_of_topics, M=10, alpha=50/8, beta=0.01) 139 | # model.fit(biterms, iterations=iterations) 140 | # #Now, we will calculate documents vs topics probability matrix (make an inference). 
141 | # p_zd = model.transform(docs_vec) 142 | 143 | # # Get index of max probability for each document 144 | # top_prob = [np.argmax(i) for i in p_zd] 145 | 146 | # print('*****************************') 147 | # print('Evaluating model performance:') 148 | # print('NMI : {}'.format(normalized_mutual_info_score(labels, top_prob))) 149 | # print('*****************************') 150 | # print('savin results...') 151 | # _save_pickle(p_zd, 'btm_result.pickle') 152 | # print('saving model...') 153 | # _save_pickle(model, 'btm_model.pickle') 154 | 155 | 156 | 157 | 158 | # if __name__ == '__main__': 159 | 160 | # parser = argparse.ArgumentParser(description='Run btm model') 161 | # parser.add_argument('--data', help='path to dataset', nargs='?', default='./data/new_dataset.csv', type=str) 162 | # parser.add_argument('--num_of_topics', help='number of topics', nargs='?', default=11, type=int) 163 | # parser.add_argument('--seed', nargs='?', default=12321, type=int) 164 | # parser.add_argument('--M', nargs='?', default=10, type=int) 165 | # parser.add_argument('--alpha', nargs='?', default=50/8, type=float) 166 | # parser.add_argument('--beta', nargs='?', default=0.01, type=float) 167 | # parser.add_argument('--iterations', nargs='?', default=20, type=int) 168 | # parser.add_argument('--encoding', help='encoding to read dataset', nargs='?', default='utf-8', type=str) 169 | 170 | # args = parser.parse_args() 171 | 172 | # data = __get_data(args.data, args.encoding) 173 | # __run_btm( 174 | # corpus=data['processed_text'].str.strip().tolist(), 175 | # labels=data['topic'], 176 | # seed=args.seed, 177 | # num_of_topics=args.num_of_topics, 178 | # iterations=args.iterations) 179 | 180 | 181 | 182 | 183 | -------------------------------------------------------------------------------- /models/CTM.py: -------------------------------------------------------------------------------- 1 | from sklearn.feature_extraction.text import CountVectorizer 2 | 3 | from models.model import AbstractModel 4 | from models.contextualized_topic_models.datasets import dataset 5 | from models.contextualized_topic_models.models import ctm 6 | from models.contextualized_topic_models.utils.data_preparation import bert_embeddings_from_list 7 | 8 | import os 9 | import pickle as pkl 10 | 11 | 12 | class CTM(AbstractModel): 13 | 14 | def __init__(self, num_topics=10, model_type='prodLDA', activation='softplus', 15 | dropout=0.2, learn_priors=True, batch_size=64, lr=2e-3, momentum=0.99, 16 | solver='adam', num_epochs=100, reduce_on_plateau=False, prior_mean=0.0, 17 | prior_variance=None, num_layers=2, num_neurons=100, use_partitions=True, num_samples=10, 18 | inference_type="zeroshot", bert_path="", bert_model="m3hrdadfi/bert-zwnj-wnli-mean-tokens"): 19 | """ 20 | initialization of CTM 21 | :param num_topics : int, number of topic components, (default 10) 22 | :param model_type : string, 'prodLDA' or 'LDA' (default 'prodLDA') 23 | :param activation : string, 'softplus', 'relu', 'sigmoid', 'swish', 'tanh', 'leakyrelu', 'rrelu', 'elu', 24 | 'selu' (default 'softplus') 25 | :param num_layers : int, number of layers (default 2) 26 | :param dropout : float, dropout to use (default 0.2) 27 | :param learn_priors : bool, make priors a learnable parameter (default True) 28 | :param batch_size : int, size of batch to use for training (default 64) 29 | :param lr : float, learning rate to use for training (default 2e-3) 30 | :param momentum : float, momentum to use for training (default 0.99) 31 | :param solver : string, optimizer 'adam' or 
'sgd' (default 'adam') 32 | :param num_epochs : int, number of epochs to train for, (default 100) 33 | :param num_samples: int, number of times theta needs to be sampled (default: 10) 34 | :param use_partitions: bool, if true the model will be trained on the training set and evaluated on the test 35 | set (default: true) 36 | :param reduce_on_plateau : bool, reduce learning rate by 10x on plateau of 10 epochs (default False) 37 | :param inference_type: the type of the CTM model. It can be "zeroshot" or "combined" (default zeroshot) 38 | :param bert_path: path to store the document contextualized representations 39 | :param bert_model: name of the contextualized model (default: m3hrdadfi/bert-zwnj-wnli-mean-tokens). 40 | see https://www.sbert.net/docs/pretrained_models.html 41 | """ 42 | 43 | super().__init__() 44 | 45 | self.hyperparameters['num_topics'] = num_topics 46 | self.hyperparameters['model_type'] = model_type 47 | self.hyperparameters['activation'] = activation 48 | self.hyperparameters['dropout'] = dropout 49 | self.hyperparameters['inference_type'] = inference_type 50 | self.hyperparameters['learn_priors'] = learn_priors 51 | self.hyperparameters['batch_size'] = batch_size 52 | self.hyperparameters['lr'] = lr 53 | self.hyperparameters['num_samples'] = num_samples 54 | self.hyperparameters['momentum'] = momentum 55 | self.hyperparameters['solver'] = solver 56 | self.hyperparameters['num_epochs'] = num_epochs 57 | self.hyperparameters['reduce_on_plateau'] = reduce_on_plateau 58 | self.hyperparameters["prior_mean"] = prior_mean 59 | self.hyperparameters["prior_variance"] = prior_variance 60 | self.hyperparameters["num_neurons"] = num_neurons 61 | self.hyperparameters["bert_path"] = bert_path 62 | self.hyperparameters["num_layers"] = num_layers 63 | self.hyperparameters["bert_model"] = bert_model 64 | self.use_partitions = use_partitions 65 | 66 | hidden_sizes = tuple([num_neurons for _ in range(num_layers)]) 67 | self.hyperparameters['hidden_sizes'] = tuple(hidden_sizes) 68 | 69 | self.model = None 70 | self.vocab = None 71 | 72 | def train_model(self, dataset, hyperparameters=None, top_words=10): 73 | """ 74 | trains CTM model 75 | :param dataset: octis Dataset for training the model 76 | :param hyperparameters: dict, with optionally) the following information: 77 | :param top_words: number of top-n words of the topics (default 10) 78 | """ 79 | if hyperparameters is None: 80 | hyperparameters = {} 81 | 82 | self.set_params(hyperparameters) 83 | self.vocab = dataset.vocab 84 | 85 | if self.use_partitions: 86 | train, validation, test = dataset.train_corpus, dataset.dev_corpus, dataset.test_corpus 87 | 88 | data_corpus_train = [' '.join(i) for i in train] 89 | data_corpus_test = [' '.join(i) for i in test] 90 | data_corpus_validation = [' '.join(i) for i in validation] 91 | 92 | x_train, x_test, x_valid, input_size = self.preprocess( 93 | self.vocab, data_corpus_train, test=data_corpus_test, validation=data_corpus_validation, 94 | bert_train_path=self.hyperparameters['bert_path'] + "_train.pkl", 95 | bert_test_path=self.hyperparameters['bert_path'] + "_test.pkl", 96 | bert_val_path=self.hyperparameters['bert_path'] + "_val.pkl", 97 | bert_model=self.hyperparameters["bert_model"]) 98 | self.model = ctm.CTM(input_size=input_size, bert_input_size=x_train.X_bert.shape[1], model_type='prodLDA', 99 | num_topics=self.hyperparameters['num_topics'], dropout=self.hyperparameters['dropout'], 100 | activation=self.hyperparameters['activation'], lr=self.hyperparameters['lr'], 101 | 
inference_type=self.hyperparameters['inference_type'], 102 | hidden_sizes=self.hyperparameters['hidden_sizes'], 103 | solver=self.hyperparameters['solver'], 104 | momentum=self.hyperparameters['momentum'], 105 | num_epochs=self.hyperparameters['num_epochs'], 106 | learn_priors=self.hyperparameters['learn_priors'], 107 | batch_size=self.hyperparameters['batch_size'], 108 | num_samples=self.hyperparameters['num_samples'], 109 | topic_prior_mean=self.hyperparameters["prior_mean"], 110 | reduce_on_plateau=self.hyperparameters['reduce_on_plateau'], 111 | topic_prior_variance=self.hyperparameters["prior_variance"]) 112 | self.model.fit(x_train, x_valid, verbose=False) 113 | result = self.inference(x_test) 114 | return result 115 | 116 | else: 117 | data_corpus = [' '.join(i) for i in dataset.train_corpus()] 118 | x_train, input_size = self.preprocess( 119 | self.vocab, train=data_corpus, bert_train_path=self.hyperparameters['bert_path'] + "_train.pkl", 120 | bert_model=self.hyperparameters["bert_model"]) 121 | 122 | self.model = ctm.CTM(input_size=input_size, bert_input_size=x_train.X_bert.shape[1], model_type='prodLDA', 123 | num_topics=self.hyperparameters['num_topics'], dropout=self.hyperparameters['dropout'], 124 | activation=self.hyperparameters['activation'], lr=self.hyperparameters['lr'], 125 | inference_type=self.hyperparameters['inference_type'], 126 | hidden_sizes=self.hyperparameters['hidden_sizes'], solver=self.hyperparameters['solver'], 127 | momentum=self.hyperparameters['momentum'], num_epochs=self.hyperparameters['num_epochs'], 128 | learn_priors=self.hyperparameters['learn_priors'], 129 | batch_size=self.hyperparameters['batch_size'], 130 | num_samples=self.hyperparameters['num_samples'], 131 | topic_prior_mean=self.hyperparameters["prior_mean"], 132 | reduce_on_plateau=self.hyperparameters['reduce_on_plateau'], 133 | topic_prior_variance=self.hyperparameters["prior_variance"]) 134 | 135 | 136 | self.model.fit(x_train, None, verbose=False) 137 | result = self.model.get_info() 138 | return result 139 | 140 | def set_params(self, hyperparameters): 141 | for k in hyperparameters.keys(): 142 | if k in self.hyperparameters.keys() and k != 'hidden_sizes': 143 | self.hyperparameters[k] = hyperparameters.get(k, self.hyperparameters[k]) 144 | 145 | self.hyperparameters['hidden_sizes'] = tuple( 146 | [self.hyperparameters["num_neurons"] for _ in range(self.hyperparameters["num_layers"])]) 147 | 148 | def inference(self, x_test): 149 | assert isinstance(self.use_partitions, bool) and self.use_partitions 150 | results = self.model.predict(x_test) 151 | return results 152 | 153 | def partitioning(self, use_partitions=False): 154 | self.use_partitions = use_partitions 155 | 156 | @staticmethod 157 | def preprocess(vocab, train, bert_model, test=None, validation=None, 158 | bert_train_path=None, bert_test_path=None, bert_val_path=None): 159 | vocab2id = {w: i for i, w in enumerate(vocab)} 160 | vec = CountVectorizer( 161 | vocabulary=vocab2id, token_pattern=r'(?u)\b[\w+|\-]+\b') 162 | entire_dataset = train.copy() 163 | if test is not None: 164 | entire_dataset.extend(test) 165 | if validation is not None: 166 | entire_dataset.extend(validation) 167 | 168 | vec.fit(entire_dataset) 169 | idx2token = {v: k for (k, v) in vec.vocabulary_.items()} 170 | 171 | x_train = vec.transform(train) 172 | b_train = CTM.load_bert_data(bert_train_path, train, bert_model) 173 | 174 | train_data = dataset.CTMDataset(x_train.toarray(), b_train, idx2token) 175 | input_size = len(idx2token.keys()) 176 | 177 | if 
test is not None and validation is not None: 178 | x_test = vec.transform(test) 179 | b_test = CTM.load_bert_data(bert_test_path, test, bert_model) 180 | test_data = dataset.CTMDataset(x_test.toarray(), b_test, idx2token) 181 | 182 | x_valid = vec.transform(validation) 183 | b_val = CTM.load_bert_data(bert_val_path, validation, bert_model) 184 | valid_data = dataset.CTMDataset(x_valid.toarray(), b_val, idx2token) 185 | return train_data, test_data, valid_data, input_size 186 | if test is None and validation is not None: 187 | x_valid = vec.transform(validation) 188 | b_val = CTM.load_bert_data(bert_val_path, validation, bert_model) 189 | valid_data = dataset.CTMDataset(x_valid.toarray(), b_val, idx2token) 190 | return train_data, valid_data, input_size 191 | if test is not None and validation is None: 192 | x_test = vec.transform(test) 193 | b_test = CTM.load_bert_data(bert_test_path, test, bert_model) 194 | test_data = dataset.CTMDataset(x_test.toarray(), b_test, idx2token) 195 | return train_data, test_data, input_size 196 | if test is None and validation is None: 197 | return train_data, input_size 198 | 199 | @staticmethod 200 | def load_bert_data(bert_path, texts, bert_model): 201 | if bert_path is not None: 202 | if os.path.exists(bert_path): 203 | bert_ouput = pkl.load(open(bert_path, 'rb')) 204 | else: 205 | bert_ouput = bert_embeddings_from_list(texts, bert_model) 206 | pkl.dump(bert_ouput, open(bert_path, 'wb')) 207 | else: 208 | bert_ouput = bert_embeddings_from_list(texts, bert_model) 209 | return bert_ouput 210 | -------------------------------------------------------------------------------- /models/LDA.py: -------------------------------------------------------------------------------- 1 | from models.model import AbstractModel 2 | import numpy as np 3 | from gensim.models import ldamodel 4 | import gensim.corpora as corpora 5 | #import octis.configuration.citations as citations 6 | #import octis.configuration.defaults as defaults 7 | 8 | 9 | class LDA(AbstractModel): 10 | 11 | id2word = None 12 | id_corpus = None 13 | use_partitions = True 14 | update_with_test = False 15 | 16 | def __init__(self, num_topics=100, distributed=False, chunksize=2000, passes=1, update_every=1, alpha="symmetric", 17 | eta=None, decay=0.5, offset=1.0, eval_every=10, iterations=50, gamma_threshold=0.001, 18 | random_state=None): 19 | """ 20 | Initialize LDA model 21 | 22 | Parameters 23 | ---------- 24 | num_topics (int, optional) – The number of requested latent topics to be 25 | extracted from the training corpus. 26 | 27 | distributed (bool, optional) – Whether distributed computing should be 28 | used to accelerate training. 29 | 30 | chunksize (int, optional) – Number of documents to be used in each 31 | training chunk. 32 | 33 | passes (int, optional) – Number of passes through the corpus during 34 | training. 35 | 36 | update_every (int, optional) – Number of documents to be iterated 37 | through for each update. Set to 0 for batch learning, > 1 for 38 | online iterative learning. 39 | 40 | alpha ({numpy.ndarray, str}, optional) – Can be set to an 1D array of 41 | length equal to the number of expected topics that expresses our 42 | a-priori belief for the each topics’ probability. Alternatively 43 | default prior selecting strategies can be employed by supplying 44 | a string: 45 | 46 | ’asymmetric’: Uses a fixed normalized asymmetric prior of 47 | 1.0 / topicno. 48 | 49 | ’auto’: Learns an asymmetric prior from the corpus 50 | (not available if distributed==True). 
51 | 52 | eta ({float, np.array, str}, optional) – A-priori belief on word 53 | probability, this can be: 54 | 55 | scalar for a symmetric prior over topic/word probability, 56 | 57 | vector of length num_words to denote an asymmetric user defined 58 | probability for each word, 59 | 60 | matrix of shape (num_topics, num_words) to assign a probability 61 | for each word-topic combination, 62 | 63 | the string ‘auto’ to learn the asymmetric prior from the data. 64 | 65 | decay (float, optional) – A number between (0.5, 1] to weight what 66 | percentage of the previous lambda value is forgotten when each new 67 | document is examined. 68 | 69 | offset (float, optional) – Hyper-parameter that controls how much 70 | we will slow down the first steps the first few iterations. 71 | 72 | eval_every (int, optional) – Log perplexity is estimated every 73 | that many updates. Setting this to one slows down training by ~2x. 74 | 75 | iterations (int, optional) – Maximum number of iterations through the 76 | corpus when inferring the topic distribution of a corpus. 77 | 78 | gamma_threshold (float, optional) – Minimum change in the value of the 79 | gamma parameters to continue iterating. 80 | 81 | random_state ({np.random.RandomState, int}, optional) – Either a 82 | randomState object or a seed to generate one. Useful for reproducibility. 83 | 84 | 85 | """ 86 | super().__init__() 87 | self.hyperparameters = dict() 88 | self.hyperparameters["num_topics"] = num_topics 89 | self.hyperparameters["distributed"] = distributed 90 | self.hyperparameters["chunksize"] = chunksize 91 | self.hyperparameters["passes"] = passes 92 | self.hyperparameters["update_every"] = update_every 93 | self.hyperparameters["alpha"] = alpha 94 | self.hyperparameters["eta"] = eta 95 | self.hyperparameters["decay"] = decay 96 | self.hyperparameters["offset"] = offset 97 | self.hyperparameters["eval_every"] = eval_every 98 | self.hyperparameters["iterations"] = iterations 99 | self.hyperparameters["gamma_threshold"] = gamma_threshold 100 | self.hyperparameters["random_state"] = random_state 101 | 102 | # def info(self): 103 | # """ 104 | # Returns model informations 105 | # """ 106 | # return { 107 | # "citation": citations.models_LDA, 108 | # "name": "LDA, Latent Dirichlet Allocation" 109 | # } 110 | 111 | # def hyperparameters_info(self): 112 | # """ 113 | # Returns hyperparameters informations 114 | # """ 115 | # return defaults.LDA_hyperparameters_info 116 | 117 | def set_hyperparameters(self, **kwargs): 118 | """ 119 | Set model hyperparameters 120 | """ 121 | super().set_hyperparameters(**kwargs) 122 | # Allow alpha to be a float in case of symmetric alpha 123 | if "alpha" in kwargs: 124 | if isinstance(kwargs["alpha"], float): 125 | self.hyperparameters["alpha"] = [ 126 | kwargs["alpha"] 127 | ] * self.hyperparameters["num_topics"] 128 | 129 | def partitioning(self, use_partitions, update_with_test=False): 130 | """ 131 | Handle the partitioning system to use and reset the model to perform 132 | new evaluations 133 | 134 | Parameters 135 | ---------- 136 | use_partitions: True if train/set partitioning is needed, False 137 | otherwise 138 | update_with_test: True if the model should be updated with the test set, 139 | False otherwise 140 | """ 141 | self.use_partitions = use_partitions 142 | self.update_with_test = update_with_test 143 | self.id2word = None 144 | self.id_corpus = None 145 | 146 | def train_model(self, dataset, hyperparams=None, top_words=10): 147 | """ 148 | Train the model and return output 149 | 150 | 
Parameters 151 | ---------- 152 | dataset : dataset to use to build the model 153 | hyperparams : hyperparameters to build the model 154 | top_words : if greater than 0 returns the most significant words for each topic in the output 155 | (Default True) 156 | Returns 157 | ------- 158 | result : dictionary with up to 3 entries, 159 | 'topics', 'topic-word-matrix' and 160 | 'topic-document-matrix' 161 | """ 162 | if hyperparams is None: 163 | hyperparams = {} 164 | 165 | if self.use_partitions: 166 | train_corpus = dataset.train_corpus 167 | test_corpus = dataset.test_corpus 168 | else: 169 | train_corpus = dataset.train_corpus + dataset.test_corpus 170 | 171 | if self.id2word is None: 172 | _corpus = dataset.train_corpus + dataset.test_corpus 173 | self.id2word = corpora.Dictionary([doc.split() for doc in _corpus]) 174 | 175 | if self.id_corpus is None: 176 | self.id_corpus = [self.id2word.doc2bow(document.split()) 177 | for document in train_corpus] 178 | 179 | if "num_topics" not in hyperparams: 180 | hyperparams["num_topics"] = self.hyperparameters["num_topics"] 181 | 182 | # Allow alpha to be a float in case of symmetric alpha 183 | if "alpha" in hyperparams: 184 | if isinstance(hyperparams["alpha"], float): 185 | hyperparams["alpha"] = [ 186 | hyperparams["alpha"] 187 | ] * hyperparams["num_topics"] 188 | 189 | hyperparams["corpus"] = self.id_corpus 190 | hyperparams["id2word"] = self.id2word 191 | self.hyperparameters.update(hyperparams) 192 | 193 | self.trained_model = ldamodel.LdaModel(**self.hyperparameters) 194 | 195 | result = {} 196 | 197 | result["topic-word-matrix"] = self.trained_model.get_topics() 198 | 199 | if top_words > 0: 200 | topics_output = [] 201 | for topic in result["topic-word-matrix"]: 202 | top_k = np.argsort(topic)[-top_words:] 203 | top_k_words = list(reversed([self.id2word[i] for i in top_k])) 204 | topics_output.append(top_k_words) 205 | result["topics"] = topics_output 206 | 207 | result["topic-document-matrix"] = self._get_topic_document_matrix() 208 | 209 | if self.use_partitions: 210 | new_corpus = [self.id2word.doc2bow(document.split()) for document in test_corpus] 211 | if self.update_with_test: 212 | self.trained_model.update(new_corpus) 213 | self.id_corpus.extend(new_corpus) 214 | 215 | result["test-topic-word-matrix"] = self.trained_model.get_topics() 216 | 217 | if top_words > 0: 218 | topics_output = [] 219 | for topic in result["test-topic-word-matrix"]: 220 | top_k = np.argsort(topic)[-top_words:] 221 | top_k_words = list( 222 | reversed([self.id2word[i] for i in top_k])) 223 | topics_output.append(top_k_words) 224 | result["test-topics"] = topics_output 225 | 226 | result["test-topic-document-matrix"] = self._get_topic_document_matrix() 227 | 228 | else: 229 | test_document_topic_matrix = [] 230 | for document in new_corpus: 231 | document_topics_tuples = self.trained_model[document] 232 | document_topics = np.zeros( 233 | self.hyperparameters["num_topics"]) 234 | for single_tuple in document_topics_tuples: 235 | document_topics[single_tuple[0]] = single_tuple[1] 236 | 237 | test_document_topic_matrix.append(document_topics) 238 | result["test-topic-document-matrix"] = np.array( 239 | test_document_topic_matrix).transpose() 240 | return result 241 | 242 | def _get_topics_words(self, topk): 243 | """ 244 | Return the most significative words for each topic. 
245 | """ 246 | topic_terms = [] 247 | for i in range(self.hyperparameters["num_topics"]): 248 | topic_words_list = [] 249 | for word_tuple in self.trained_model.get_topic_terms(i, topk): 250 | topic_words_list.append(self.id2word[word_tuple[0]]) 251 | topic_terms.append(topic_words_list) 252 | return topic_terms 253 | 254 | def _get_topic_document_matrix(self): 255 | """ 256 | Return the topic representation of the 257 | corpus 258 | """ 259 | doc_topic_tuples = [] 260 | for document in self.id_corpus: 261 | doc_topic_tuples.append( 262 | self.trained_model.get_document_topics(document, 263 | minimum_probability=0)) 264 | 265 | topic_document = np.zeros(( 266 | self.hyperparameters["num_topics"], 267 | len(doc_topic_tuples))) 268 | 269 | for ndoc in range(len(doc_topic_tuples)): 270 | document = doc_topic_tuples[ndoc] 271 | for topic_tuple in document: 272 | topic_document[topic_tuple[0]][ndoc] = topic_tuple[1] 273 | return topic_document -------------------------------------------------------------------------------- /models/NMF.py: -------------------------------------------------------------------------------- 1 | from models.model import AbstractModel 2 | import numpy as np 3 | from gensim.models import nmf 4 | import gensim.corpora as corpora 5 | 6 | 7 | class NMF(AbstractModel): 8 | 9 | def __init__(self, num_topics=100, chunksize=2000, passes=1, kappa=1.0, minimum_probability=0.01, w_max_iter=200, 10 | w_stop_condition=0.0001, h_max_iter=50, h_stop_condition=0.001, eval_every=10, normalize=True, 11 | random_state=None, use_partitions=True): 12 | """ 13 | Initialize NMF model 14 | Parameters 15 | ---------- 16 | num_topics (int, optional) – Number of topics to extract. 17 | chunksize (int, optional) – Number of documents to be used in each 18 | training chunk. 19 | passes (int, optional) – Number of full passes over the 20 | training corpus. Leave at default passes=1 if your input 21 | is an iterator. 22 | kappa (float, optional) – Gradient descent step size. 23 | Larger value makes the model train faster, but could 24 | lead to non-convergence if set too large. 25 | minimum_probability – If normalize is True, topics with 26 | smaller probabilities are filtered out. If normalize is False, 27 | topics with smaller factors are filtered out. If set to None, 28 | a value of 1e-8 is used to prevent 0s. 29 | w_max_iter (int, optional) – Maximum number of iterations to 30 | train W per each batch. 31 | w_stop_condition (float, optional) – If error difference gets less 32 | than that, training of W stops for the current batch. 33 | h_max_iter (int, optional) – Maximum number of iterations to train 34 | h per each batch. 35 | h_stop_condition (float) – If error difference gets less than that, 36 | training of h stops for the current batch. 37 | eval_every (int, optional) – Number of batches after which l2 norm 38 | of (v - Wh) is computed. Decreases performance if set too low. 39 | normalize (bool or None, optional) – Whether to normalize the result. 40 | random_state ({np.random.RandomState, int}, optional) – Seed for 41 | random generator. Needed for reproducibility. 
42 | """ 43 | super().__init__() 44 | self.hyperparameters["num_topics"] = num_topics 45 | self.hyperparameters["chunksize"] = chunksize 46 | self.hyperparameters["passes"] = passes 47 | self.hyperparameters["kappa"] = kappa 48 | self.hyperparameters["minimum_probability"] = minimum_probability 49 | self.hyperparameters["w_max_iter"] = w_max_iter 50 | self.hyperparameters["w_stop_condition"] = w_stop_condition 51 | self.hyperparameters["h_max_iter"] = h_max_iter 52 | self.hyperparameters["h_stop_condition"] = h_stop_condition 53 | self.hyperparameters["eval_every"] = eval_every 54 | self.hyperparameters["normalize"] = normalize 55 | self.hyperparameters["random_state"] = random_state 56 | self.use_partitions = use_partitions 57 | 58 | self.id2word = None 59 | self.id_corpus = None 60 | self.update_with_test = False 61 | 62 | def info(self): 63 | """ 64 | Returns model informations 65 | """ 66 | return { 67 | "citation": citations.models_NMF, 68 | "name": "NMF, Non-negative Matrix Factorization" 69 | } 70 | 71 | def hyperparameters_info(self): 72 | """ 73 | Returns hyperparameters informations 74 | """ 75 | return defaults.NMF_gensim_hyperparameters_info 76 | 77 | def partitioning(self, use_partitions, update_with_test=False): 78 | """ 79 | Handle the partitioning system to use and reset the model to perform 80 | new evaluations 81 | Parameters 82 | ---------- 83 | use_partitions: True if train/set partitioning is needed, False 84 | otherwise 85 | update_with_test: True if the model should be updated with the test set, 86 | False otherwise 87 | """ 88 | self.use_partitions = use_partitions 89 | self.update_with_test = update_with_test 90 | self.id2word = None 91 | self.id_corpus = None 92 | 93 | def train_model(self, dataset, hyperparameters=None, top_words=10): 94 | """ 95 | Train the model and return output 96 | Parameters 97 | ---------- 98 | dataset : dataset to use to build the model 99 | hyperparameters : hyperparameters to build the model 100 | top_words : if greather than 0 returns the most significant words 101 | for each topic in the output 102 | Default True 103 | Returns 104 | ------- 105 | result : dictionary with up to 3 entries, 106 | 'topics', 'topic-word-matrix' and 107 | 'topic-document-matrix' 108 | """ 109 | if hyperparameters is None: 110 | hyperparameters = {} 111 | if self.use_partitions: 112 | partition = [dataset.train_corpus, dataset.test_corpus] 113 | else: 114 | partition = [dataset.train, []] 115 | 116 | if self.id2word is None: 117 | _corpus = dataset.train_corpus + dataset.test_corpus 118 | self.id2word = corpora.Dictionary([doc.split() for doc in _corpus]) 119 | if self.id_corpus is None: 120 | self.id_corpus = [self.id2word.doc2bow( 121 | document.split()) for document in partition[0]] 122 | 123 | hyperparameters["corpus"] = self.id_corpus 124 | hyperparameters["id2word"] = self.id2word 125 | self.hyperparameters.update(hyperparameters) 126 | 127 | self.trained_model = nmf.Nmf(**self.hyperparameters) 128 | 129 | result = {} 130 | 131 | result["topic-word-matrix"] = self.trained_model.get_topics() 132 | 133 | if top_words > 0: 134 | topics_output = [] 135 | for topic in result["topic-word-matrix"]: 136 | top_k = np.argsort(topic)[-top_words:] 137 | top_k_words = list(reversed([self.id2word[i] for i in top_k])) 138 | topics_output.append(top_k_words) 139 | result["topics"] = topics_output 140 | 141 | result["topic-document-matrix"] = self._get_topic_document_matrix() 142 | 143 | if self.use_partitions: 144 | new_corpus = [self.id2word.doc2bow( 145 | 
document.split()) for document in partition[1]] 146 | if self.update_with_test: 147 | self.trained_model.update(new_corpus) 148 | self.id_corpus.extend(new_corpus) 149 | 150 | result["test-topic-word-matrix"] = self.trained_model.get_topics() 151 | 152 | if top_words > 0: 153 | topics_output = [] 154 | for topic in result["test-topic-word-matrix"]: 155 | top_k = np.argsort(topic)[-top_words:] 156 | top_k_words = list( 157 | reversed([self.id2word[i] for i in top_k])) 158 | topics_output.append(top_k_words) 159 | result["test-topics"] = topics_output 160 | 161 | result["test-topic-document-matrix"] = self._get_topic_document_matrix() 162 | else: 163 | result["test-topic-document-matrix"] = self._get_topic_document_matrix(new_corpus) 164 | return result 165 | 166 | def _get_topics_words(self, topk): 167 | """ 168 | Return the most significative words for each topic. 169 | """ 170 | topic_terms = [] 171 | for i in range(self.hyperparameters["num_topics"]): 172 | topic_words_list = [] 173 | for word_tuple in self.trained_model.get_topic_terms(i, topk): 174 | topic_words_list.append(self.id2word[word_tuple[0]]) 175 | topic_terms.append(topic_words_list) 176 | return topic_terms 177 | 178 | def _get_topic_document_matrix(self, test_corpus=None): 179 | """ 180 | Return the topic representation of the 181 | corpus 182 | """ 183 | doc_topic_tuples = [] 184 | 185 | if test_corpus is None: 186 | for document in self.id_corpus: 187 | doc_topic_tuples.append( 188 | self.trained_model.get_document_topics(document, minimum_probability=0)) 189 | else: 190 | for document in test_corpus: 191 | doc_topic_tuples.append( 192 | self.trained_model.get_document_topics(document, minimum_probability=0)) 193 | topic_document = np.zeros(( 194 | self.hyperparameters["num_topics"], 195 | len(doc_topic_tuples))) 196 | 197 | for ndoc in range(len(doc_topic_tuples)): 198 | document = doc_topic_tuples[ndoc] 199 | for topic_tuple in document: 200 | topic_document[topic_tuple[0]][ndoc] = topic_tuple[1] 201 | return topic_document 202 | 203 | -------------------------------------------------------------------------------- /models/contextualized_topic_models/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020, Federico Bianchi 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | 23 | -------------------------------------------------------------------------------- /models/contextualized_topic_models/__init__.py: -------------------------------------------------------------------------------- 1 | """Top-level package for Contextualized Topic Models.""" 2 | 3 | __author__ = """Federico Bianchi""" 4 | __email__ = 'f.bianchi@unibocconi.it' 5 | __version__ = '1.7.0' 6 | -------------------------------------------------------------------------------- /models/contextualized_topic_models/contextualized_topic_models.py: -------------------------------------------------------------------------------- 1 | """Main module.""" 2 | -------------------------------------------------------------------------------- /models/contextualized_topic_models/datasets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DSInCenter/pySTTM/9771af0dfa85a2996fbb90122ae22649cd076a51/models/contextualized_topic_models/datasets/__init__.py -------------------------------------------------------------------------------- /models/contextualized_topic_models/datasets/dataset.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.utils.data import Dataset 3 | import scipy.sparse 4 | 5 | 6 | class CTMDataset(Dataset): 7 | 8 | """Class to load BOW dataset.""" 9 | 10 | def __init__(self, X, X_bert, idx2token): 11 | """ 12 | Args 13 | X : array-like, shape=(n_samples, n_features) 14 | Document word matrix. 15 | """ 16 | if X.shape[0] != len(X_bert): 17 | raise Exception("Wait! BoW and Contextual Embeddings have different sizes! " 18 | "You might want to check if the BoW preparation method has removed some documents. ") 19 | 20 | self.X = X 21 | self.X_bert = X_bert 22 | self.idx2token = idx2token 23 | 24 | def __len__(self): 25 | """Return length of dataset.""" 26 | return self.X.shape[0] 27 | 28 | def __getitem__(self, i): 29 | """Return sample from dataset at index i.""" 30 | if type(self.X[i]) == scipy.sparse.csr.csr_matrix: 31 | X = torch.FloatTensor(self.X[i].todense()) 32 | X_bert = torch.FloatTensor(self.X_bert[i]) 33 | else: 34 | X = torch.FloatTensor(self.X[i]) 35 | X_bert = torch.FloatTensor(self.X_bert[i]) 36 | 37 | return {'X': X, 'X_bert': X_bert} 38 | 39 | 40 | -------------------------------------------------------------------------------- /models/contextualized_topic_models/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DSInCenter/pySTTM/9771af0dfa85a2996fbb90122ae22649cd076a51/models/contextualized_topic_models/models/__init__.py -------------------------------------------------------------------------------- /models/contextualized_topic_models/models/ctm.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import os 3 | from collections import defaultdict 4 | 5 | import numpy as np 6 | import torch 7 | from torch import optim 8 | from torch.optim.lr_scheduler import ReduceLROnPlateau 9 | from torch.utils.data import DataLoader 10 | 11 | from models.contextualized_topic_models.networks.decoding_network import DecoderNetwork 12 | from models.pytorchtools import EarlyStopping 13 | 14 | 15 | class CTM(object): 16 | """Class to train the contextualized topic model 17 | """ 18 | 19 | def __init__(self, input_size, bert_input_size, inference_type="zeroshot", num_topics=10, model_type='prodLDA', 20 | 
hidden_sizes=(100, 100), activation='softplus', dropout=0.2, learn_priors=True, batch_size=64, 21 | lr=2e-3, momentum=0.99, solver='adam', num_epochs=100, num_samples=10, 22 | reduce_on_plateau=False, topic_prior_mean=0.0, topic_prior_variance=None, num_data_loader_workers=0): 23 | """ 24 | :param input_size: int, dimension of input 25 | :param bert_input_size: int, dimension of input that comes from BERT embeddings 26 | :param inference_type: string, you can choose between the contextual model and the combined model 27 | :param num_topics: int, number of topic components, (default 10) 28 | :param model_type: string, 'prodLDA' or 'LDA' (default 'prodLDA') 29 | :param hidden_sizes: tuple, length = n_layers, (default (100, 100)) 30 | :param activation: string, 'softplus', 'relu', 'sigmoid', 'swish', 'tanh', 'leakyrelu', 'rrelu', 'elu', 31 | 'selu' (default 'softplus') 32 | :param dropout: float, dropout to use (default 0.2) 33 | :param learn_priors: bool, make priors a learnable parameter (default True) 34 | :param batch_size: int, size of batch to use for training (default 64) 35 | :param lr: float, learning rate to use for training (default 2e-3) 36 | :param momentum: float, momentum to use for training (default 0.99) 37 | :param solver: string, optimizer 'adam' or 'sgd' (default 'adam') 38 | :param num_samples: int, number of times theta needs to be sampled 39 | :param num_epochs: int, number of epochs to train for, (default 100) 40 | :param reduce_on_plateau: bool, reduce learning rate by 10x on plateau of 10 epochs (default False) 41 | :param num_data_loader_workers: int, number of data loader workers (default cpu_count). set it to 0 if you are using Windows 42 | """ 43 | 44 | assert isinstance(input_size, int) and input_size > 0, \ 45 | "input_size must by type int > 0." 46 | assert (isinstance(num_topics, int) or isinstance(num_topics, np.int64)) and num_topics > 0, \ 47 | "num_topics must by type int > 0." 48 | assert model_type in ['LDA', 'prodLDA'], \ 49 | "model must be 'LDA' or 'prodLDA'." 50 | assert isinstance(hidden_sizes, tuple), \ 51 | "hidden_sizes must be type tuple." 52 | assert activation in ['softplus', 'relu', 'sigmoid', 'swish', 'tanh', 'leakyrelu', 53 | 'rrelu', 'elu', 'selu'], \ 54 | "activation must be 'softplus', 'relu', 'sigmoid', 'swish', 'leakyrelu'," \ 55 | " 'rrelu', 'elu', 'selu' or 'tanh'." 56 | assert dropout >= 0, "dropout must be >= 0." 57 | # assert isinstance(learn_priors, bool), "learn_priors must be boolean." 58 | assert isinstance(batch_size, int) and batch_size > 0, \ 59 | "batch_size must be int > 0." 60 | assert lr > 0, "lr must be > 0." 61 | assert isinstance(momentum, float) and momentum > 0 and momentum <= 1, \ 62 | "momentum must be 0 < float <= 1." 63 | assert solver in ['adagrad', 'adam', 'sgd', 'adadelta', 'rmsprop'], \ 64 | "solver must be 'adam', 'adadelta', 'sgd', 'rmsprop' or 'adagrad'" 65 | assert isinstance(reduce_on_plateau, bool), \ 66 | "reduce_on_plateau must be type bool." 
67 | assert isinstance(topic_prior_mean, float), \ 68 | "topic_prior_mean must be type float" 69 | # and topic_prior_variance >= 0, \ 70 | # assert isinstance(topic_prior_variance, float), \ 71 | # "topic prior_variance must be type float" 72 | 73 | self.input_size = input_size 74 | self.num_topics = num_topics 75 | self.model_type = model_type 76 | self.hidden_sizes = hidden_sizes 77 | self.activation = activation 78 | self.dropout = dropout 79 | self.learn_priors = learn_priors 80 | self.batch_size = batch_size 81 | self.lr = lr 82 | self.num_samples = num_samples 83 | self.bert_size = bert_input_size 84 | self.momentum = momentum 85 | self.solver = solver 86 | self.num_epochs = num_epochs 87 | self.reduce_on_plateau = reduce_on_plateau 88 | self.num_data_loader_workers = num_data_loader_workers 89 | self.topic_prior_mean = topic_prior_mean 90 | self.topic_prior_variance = topic_prior_variance 91 | # init inference avitm network 92 | self.model = DecoderNetwork( 93 | input_size, self.bert_size, inference_type, num_topics, model_type, hidden_sizes, activation, 94 | dropout, self.learn_priors, self.topic_prior_mean, self.topic_prior_variance) 95 | self.early_stopping = EarlyStopping(patience=5, verbose=False) 96 | # init optimizer 97 | if self.solver == 'adam': 98 | self.optimizer = optim.Adam(self.model.parameters(), lr=lr, betas=(self.momentum, 0.99)) 99 | elif self.solver == 'sgd': 100 | self.optimizer = optim.SGD(self.model.parameters(), lr=lr, momentum=self.momentum) 101 | elif self.solver == 'adagrad': 102 | self.optimizer = optim.Adagrad(self.model.parameters(), lr=lr) 103 | elif self.solver == 'adadelta': 104 | self.optimizer = optim.Adadelta(self.model.parameters(), lr=lr) 105 | elif self.solver == 'rmsprop': 106 | self.optimizer = optim.RMSprop(self.model.parameters(), lr=lr, momentum=self.momentum) 107 | # init lr scheduler 108 | if self.reduce_on_plateau: 109 | self.scheduler = ReduceLROnPlateau(self.optimizer, patience=10) 110 | 111 | # performance attributes 112 | self.best_loss_train = float('inf') 113 | 114 | # training attributes 115 | self.model_dir = None 116 | self.train_data = None 117 | self.nn_epoch = None 118 | 119 | # learned topics 120 | self.best_components = None 121 | 122 | # Use cuda if available 123 | if torch.cuda.is_available(): 124 | self.USE_CUDA = True 125 | else: 126 | self.USE_CUDA = False 127 | if self.USE_CUDA: 128 | self.model = self.model.cuda() 129 | 130 | def _loss(self, inputs, word_dists, prior_mean, prior_variance, 131 | posterior_mean, posterior_variance, posterior_log_variance): 132 | # KL term 133 | # var division term 134 | var_division = torch.sum(posterior_variance / prior_variance, dim=1) 135 | # diff means term 136 | diff_means = prior_mean - posterior_mean 137 | diff_term = torch.sum( 138 | (diff_means * diff_means) / prior_variance, dim=1) 139 | # logvar det division term 140 | logvar_det_division = \ 141 | prior_variance.log().sum() - posterior_log_variance.sum(dim=1) 142 | # combine terms 143 | KL = 0.5 * (var_division + diff_term - self.num_topics + logvar_det_division) 144 | # Reconstruction term 145 | RL = -torch.sum(inputs * torch.log(word_dists + 1e-10), dim=1) 146 | loss = KL + RL 147 | 148 | return loss.sum() 149 | 150 | def _train_epoch(self, loader): 151 | """Train epoch.""" 152 | self.model.train() 153 | train_loss = 0 154 | samples_processed = 0 155 | topic_doc_list = [] 156 | for batch_samples in loader: 157 | # batch_size x vocab_size 158 | X = batch_samples['X'] 159 | X = X.reshape(X.shape[0], -1) 160 | X_bert = 
batch_samples['X_bert'] 161 | if self.USE_CUDA: 162 | X = X.cuda() 163 | X_bert = X_bert.cuda() 164 | 165 | # forward pass 166 | self.model.zero_grad() 167 | prior_mean, prior_variance, \ 168 | posterior_mean, posterior_variance, posterior_log_variance, \ 169 | word_dists, topic_word, topic_document = self.model(X, X_bert) 170 | topic_doc_list.extend(topic_document) 171 | 172 | # backward pass 173 | loss = self._loss(X, word_dists, prior_mean, prior_variance, 174 | posterior_mean, posterior_variance, posterior_log_variance) 175 | loss.backward() 176 | self.optimizer.step() 177 | 178 | # compute train loss 179 | samples_processed += X.size()[0] 180 | train_loss += loss.item() 181 | 182 | train_loss /= samples_processed 183 | 184 | return samples_processed, train_loss, topic_word, topic_doc_list 185 | 186 | def _validation(self, loader): 187 | """Train epoch.""" 188 | self.model.eval() 189 | val_loss = 0 190 | samples_processed = 0 191 | for batch_samples in loader: 192 | # batch_size x vocab_size 193 | X = batch_samples['X'] 194 | X = X.reshape(X.shape[0], -1) 195 | X_bert = batch_samples['X_bert'] 196 | 197 | if self.USE_CUDA: 198 | X = X.cuda() 199 | X_bert = X_bert.cuda() 200 | 201 | # forward pass 202 | self.model.zero_grad() 203 | prior_mean, prior_variance, \ 204 | posterior_mean, posterior_variance, posterior_log_variance, \ 205 | word_dists, topic_word, topic_document = self.model(X, X_bert) 206 | 207 | loss = self._loss(X, word_dists, prior_mean, prior_variance, 208 | posterior_mean, posterior_variance, posterior_log_variance) 209 | 210 | # compute train loss 211 | samples_processed += X.size()[0] 212 | val_loss += loss.item() 213 | 214 | val_loss /= samples_processed 215 | 216 | return samples_processed, val_loss 217 | 218 | def fit(self, train_dataset, validation_dataset=None, save_dir=None, verbose=True): 219 | """ 220 | Train the CTM model. 221 | 222 | :param train_dataset: PyTorch Dataset class for training data. 223 | :param validation_dataset: PyTorch Dataset class for validation data 224 | :param save_dir: directory to save checkpoint models to. 
225 | :param verbose: verbose 226 | """ 227 | # Print settings to output file 228 | if verbose: 229 | print("Settings: \n\ 230 | N Components: {}\n\ 231 | Topic Prior Mean: {}\n\ 232 | Topic Prior Variance: {}\n\ 233 | Model Type: {}\n\ 234 | Hidden Sizes: {}\n\ 235 | Activation: {}\n\ 236 | Dropout: {}\n\ 237 | Learn Priors: {}\n\ 238 | Learning Rate: {}\n\ 239 | Momentum: {}\n\ 240 | Reduce On Plateau: {}\n\ 241 | Save Dir: {}".format( 242 | self.num_topics, self.topic_prior_mean, 243 | self.topic_prior_variance, self.model_type, 244 | self.hidden_sizes, self.activation, self.dropout, self.learn_priors, 245 | self.lr, self.momentum, self.reduce_on_plateau, save_dir)) 246 | 247 | self.model_dir = save_dir 248 | self.train_data = train_dataset 249 | self.validation_data = validation_dataset 250 | 251 | train_loader = DataLoader(self.train_data, batch_size=self.batch_size, shuffle=True, 252 | num_workers=self.num_data_loader_workers) 253 | 254 | # init training variables 255 | train_loss = 0 256 | samples_processed = 0 257 | 258 | # train loop 259 | for epoch in range(self.num_epochs): 260 | self.nn_epoch = epoch 261 | # train epoch 262 | s = datetime.datetime.now() 263 | sp, train_loss, topic_word, topic_document = self._train_epoch(train_loader) 264 | samples_processed += sp 265 | e = datetime.datetime.now() 266 | 267 | if verbose: 268 | print("Epoch: [{}/{}]\tSamples: [{}/{}]\tTrain Loss: {}\tTime: {}".format( 269 | epoch + 1, self.num_epochs, samples_processed, 270 | len(self.train_data) * self.num_epochs, train_loss, e - s)) 271 | 272 | self.best_components = self.model.beta 273 | self.final_topic_word = topic_word 274 | self.final_topic_document = topic_document 275 | self.best_loss_train = train_loss 276 | if self.validation_data is not None: 277 | validation_loader = DataLoader( 278 | self.validation_data, batch_size=self.batch_size, shuffle=True, 279 | num_workers=self.num_data_loader_workers) 280 | # train epoch 281 | s = datetime.datetime.now() 282 | val_samples_processed, val_loss = self._validation(validation_loader) 283 | e = datetime.datetime.now() 284 | 285 | if verbose: 286 | print("Epoch: [{}/{}]\tSamples: [{}/{}]\tValidation Loss: {}\tTime: {}".format( 287 | epoch + 1, self.num_epochs, val_samples_processed, 288 | len(self.validation_data) * self.num_epochs, val_loss, e - s)) 289 | 290 | if np.isnan(val_loss) or np.isnan(train_loss): 291 | break 292 | else: 293 | self.early_stopping(val_loss, self.model) 294 | if self.early_stopping.early_stop: 295 | if verbose: 296 | print("Early stopping") 297 | if save_dir is not None: 298 | self.save(save_dir) 299 | break 300 | 301 | def predict(self, dataset): 302 | """Predict input.""" 303 | self.model.eval() 304 | 305 | loader = DataLoader(dataset, batch_size=self.batch_size, shuffle=False, 306 | num_workers=self.num_data_loader_workers) 307 | 308 | topic_document_mat = [] 309 | with torch.no_grad(): 310 | for batch_samples in loader: 311 | # batch_size x vocab_size 312 | X = batch_samples['X'] 313 | X = X.reshape(X.shape[0], -1) 314 | X_bert = batch_samples['X_bert'] 315 | 316 | if self.USE_CUDA: 317 | X = X.cuda() 318 | X_bert = X_bert.cuda() 319 | # forward pass 320 | self.model.zero_grad() 321 | _, _, _, _, _, _, _, topic_document = self.model(X, X_bert) 322 | topic_document_mat.append(topic_document) 323 | 324 | results = self.get_info() 325 | results['test-topic-document-matrix'] = np.asarray(self.get_thetas(dataset)).T 326 | 327 | return results 328 | 329 | def get_topic_word_mat(self): 330 | top_wor = 
self.final_topic_word.cpu().detach().numpy() 331 | return top_wor 332 | 333 | def get_topic_document_mat(self): 334 | top_doc = self.final_topic_document 335 | top_doc_arr = np.array([i.cpu().detach().numpy() for i in top_doc]) 336 | return top_doc_arr 337 | 338 | def get_topics(self, k=10): 339 | """ 340 | Retrieve topic words. 341 | 342 | Args 343 | k : (int) number of words to return per topic, default 10. 344 | """ 345 | assert k <= self.input_size, "k must be <= input size." 346 | component_dists = self.best_components 347 | topics = defaultdict(list) 348 | topics_list = [] 349 | if self.num_topics is not None: 350 | for i in range(self.num_topics): 351 | _, idxs = torch.topk(component_dists[i], k) 352 | component_words = [self.train_data.idx2token[idx] 353 | for idx in idxs.cpu().numpy()] 354 | topics[i] = component_words 355 | topics_list.append(component_words) 356 | 357 | return topics_list 358 | 359 | def get_info(self): 360 | info = {} 361 | topic_word = self.get_topics() 362 | topic_word_dist = self.get_topic_word_mat() 363 | topic_document_dist = self.get_topic_document_mat() 364 | info['topics'] = topic_word 365 | 366 | info['topic-document-matrix'] = np.asarray(self.get_thetas(self.train_data)).T 367 | 368 | info['topic-word-matrix'] = topic_word_dist 369 | return info 370 | 371 | def _format_file(self): 372 | model_dir = "AVITM_nc_{}_tpm_{}_tpv_{}_hs_{}_ac_{}_do_{}_lr_{}_mo_{}_rp_{}". \ 373 | format(self.num_topics, 0.0, 1 - (1. / self.num_topics), 374 | self.model_type, self.hidden_sizes, self.activation, 375 | self.dropout, self.lr, self.momentum, 376 | self.reduce_on_plateau) 377 | return model_dir 378 | 379 | def save(self, models_dir=None): 380 | """ 381 | Save model. 382 | 383 | :param models_dir: path to directory for saving NN models. 384 | """ 385 | if (self.model is not None) and (models_dir is not None): 386 | 387 | model_dir = self._format_file() 388 | if not os.path.isdir(os.path.join(models_dir, model_dir)): 389 | os.makedirs(os.path.join(models_dir, model_dir)) 390 | 391 | filename = "epoch_{}".format(self.nn_epoch) + '.pth' 392 | fileloc = os.path.join(models_dir, model_dir, filename) 393 | with open(fileloc, 'wb') as file: 394 | torch.save({'state_dict': self.model.state_dict(), 395 | 'dcue_dict': self.__dict__}, file) 396 | 397 | def load(self, model_dir, epoch): 398 | """ 399 | Load a previously trained model. 400 | 401 | :param model_dir: directory where models are saved. 402 | :param epoch: epoch of model to load. 403 | """ 404 | epoch_file = "epoch_" + str(epoch) + ".pth" 405 | model_file = os.path.join(model_dir, epoch_file) 406 | with open(model_file, 'rb') as model_dict: 407 | checkpoint = torch.load(model_dict) 408 | 409 | for (k, v) in checkpoint['dcue_dict'].items(): 410 | setattr(self, k, v) 411 | 412 | self.model.load_state_dict(checkpoint['state_dict']) 413 | 414 | def get_thetas(self, dataset): 415 | """ 416 | Get the document-topic distribution for a dataset of topics. Includes multiple sampling to reduce variation via 417 | the parameter num_samples. 
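        The returned matrix is the element-wise average of num_samples stochastic draws of theta:
        for example, with num_samples=20 each document's topic distribution is the mean of 20 encodings.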
418 | :param dataset: a PyTorch Dataset containing the documents 419 | """ 420 | self.model.eval() 421 | 422 | loader = DataLoader( 423 | dataset, batch_size=self.batch_size, shuffle=False, num_workers=self.num_data_loader_workers) 424 | final_thetas = [] 425 | for sample_index in range(self.num_samples): 426 | with torch.no_grad(): 427 | collect_theta = [] 428 | for batch_samples in loader: 429 | # batch_size x vocab_size 430 | x = batch_samples['X'] 431 | x = x.reshape(x.shape[0], -1) 432 | x_bert = batch_samples['X_bert'] 433 | if self.USE_CUDA: 434 | x = x.cuda() 435 | x_bert = x_bert.cuda() 436 | # forward pass 437 | self.model.zero_grad() 438 | collect_theta.extend(self.model.get_theta(x, x_bert).cpu().numpy().tolist()) 439 | 440 | final_thetas.append(np.array(collect_theta)) 441 | return np.sum(final_thetas, axis=0) / self.num_samples 442 | -------------------------------------------------------------------------------- /models/contextualized_topic_models/networks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DSInCenter/pySTTM/9771af0dfa85a2996fbb90122ae22649cd076a51/models/contextualized_topic_models/networks/__init__.py -------------------------------------------------------------------------------- /models/contextualized_topic_models/networks/decoding_network.py: -------------------------------------------------------------------------------- 1 | """PyTorch class for feed foward AVITM network.""" 2 | 3 | import torch 4 | from torch import nn 5 | from torch.nn import functional as F 6 | import numpy as np 7 | 8 | from models.contextualized_topic_models.networks.inference_network import CombinedInferenceNetwork, ContextualInferenceNetwork 9 | 10 | 11 | class DecoderNetwork(nn.Module): 12 | 13 | """AVITM Network.""" 14 | 15 | def __init__(self, input_size, bert_size, infnet, n_components=10, model_type='prodLDA', 16 | hidden_sizes=(100,100), activation='softplus', dropout=0.2, 17 | learn_priors=True, topic_prior_mean=0.0, topic_prior_variance=None): 18 | """ 19 | Initialize InferenceNetwork. 20 | 21 | Args 22 | input_size : int, dimension of input 23 | n_components : int, number of topic components, (default 10) 24 | model_type : string, 'prodLDA' or 'LDA' (default 'prodLDA') 25 | hidden_sizes : tuple, length = n_layers, (default (100, 100)) 26 | activation : string, 'softplus', 'relu', (default 'softplus') 27 | learn_priors : bool, make priors learnable parameter 28 | topic_prior_mean: double, mean parameter of the prior 29 | topic_prior_variance: double, variance parameter of the prior 30 | """ 31 | super(DecoderNetwork, self).__init__() 32 | assert isinstance(input_size, int), "input_size must by type int." 33 | assert (isinstance(n_components, int) or isinstance(n_components, np.int64)) and n_components > 0, \ 34 | "n_components must be type int > 0." 35 | assert model_type in ['prodLDA', 'LDA'], \ 36 | "model type must be 'prodLDA' or 'LDA'" 37 | assert isinstance(hidden_sizes, tuple), \ 38 | "hidden_sizes must be type tuple." 39 | assert activation in ['softplus', 'relu', 'sigmoid', 'tanh', 'leakyrelu', 40 | 'rrelu', 'elu', 'selu'], \ 41 | "activation must be 'softplus', 'relu', 'sigmoid', 'leakyrelu'," \ 42 | " 'rrelu', 'elu', 'selu' or 'tanh'." 43 | assert dropout >= 0, "dropout must be >= 0." 
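        # When topic_prior_variance is left as None it is set further below to
        # 1. - (1. / n_components), e.g. 0.9 for the default n_components=10.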
44 | assert isinstance(topic_prior_mean, float), \ 45 | "topic_prior_mean must be type float" 46 | # and topic_prior_variance >= 0, \ 47 | #assert isinstance(topic_prior_variance, float), \ 48 | # "topic prior_variance must be type float" 49 | 50 | self.input_size = input_size 51 | self.n_components = n_components 52 | self.model_type = model_type 53 | self.hidden_sizes = hidden_sizes 54 | self.activation = activation 55 | self.dropout = dropout 56 | self.learn_priors = learn_priors 57 | 58 | if infnet == "zeroshot": 59 | self.inf_net = ContextualInferenceNetwork( 60 | input_size, bert_size, n_components, hidden_sizes, activation) 61 | elif infnet == "combined": 62 | self.inf_net = CombinedInferenceNetwork( 63 | input_size, bert_size, n_components, hidden_sizes, activation) 64 | else: 65 | raise Exception('Missing infnet parameter, options are zeroshot and combined') 66 | if torch.cuda.is_available(): 67 | self.inf_net = self.inf_net.cuda() 68 | # init prior parameters 69 | # \mu_1k = log \alpha_k + 1/K \sum_i log \alpha_i; 70 | # \alpha = 1 \forall \alpha 71 | #self.topic_prior_mean = topic_prior_mean 72 | self.prior_mean = torch.tensor( 73 | [topic_prior_mean] * n_components) 74 | if torch.cuda.is_available(): 75 | self.prior_mean = self.prior_mean.cuda() 76 | if self.learn_priors: 77 | self.prior_mean = nn.Parameter(self.prior_mean) 78 | 79 | 80 | # \Sigma_1kk = 1 / \alpha_k (1 - 2/K) + 1/K^2 \sum_i 1 / \alpha_k; 81 | # \alpha = 1 \forall \alpha 82 | if topic_prior_variance is None: 83 | topic_prior_variance = 1. - (1. / self.n_components) 84 | self.prior_variance = torch.tensor( 85 | [topic_prior_variance] * n_components) 86 | if torch.cuda.is_available(): 87 | self.prior_variance = self.prior_variance.cuda() 88 | if self.learn_priors: 89 | self.prior_variance = nn.Parameter(self.prior_variance) 90 | 91 | self.beta = torch.Tensor(n_components, input_size) 92 | if torch.cuda.is_available(): 93 | self.beta = self.beta.cuda() 94 | self.beta = nn.Parameter(self.beta) 95 | nn.init.xavier_uniform_(self.beta) 96 | 97 | self.beta_batchnorm = nn.BatchNorm1d(input_size, affine=False) 98 | 99 | # dropout on theta 100 | self.drop_theta = nn.Dropout(p=self.dropout) 101 | 102 | @staticmethod 103 | def reparameterize(mu, logvar): 104 | """Reparameterize the theta distribution.""" 105 | std = torch.exp(0.5*logvar) 106 | eps = torch.randn_like(std) 107 | return eps.mul(std).add_(mu) 108 | 109 | def forward(self, x, x_bert): 110 | """Forward pass.""" 111 | # batch_size x n_components 112 | posterior_mu, posterior_log_sigma = self.inf_net(x, x_bert) 113 | posterior_sigma = torch.exp(posterior_log_sigma) 114 | 115 | # generate samples from theta 116 | theta = F.softmax(self.reparameterize(posterior_mu, posterior_log_sigma), dim=1) 117 | 118 | topic_doc = theta 119 | theta = self.drop_theta(theta) 120 | 121 | # prodLDA vs LDA 122 | if self.model_type == 'prodLDA': 123 | # in: batch_size x input_size x n_components 124 | word_dist = F.softmax( 125 | self.beta_batchnorm(torch.matmul(theta, self.beta)), dim=1) 126 | topic_word = self.beta 127 | # word_dist: batch_size x input_size 128 | #self.topic_word_matrix = self.beta 129 | elif self.model_type == 'LDA': 130 | # simplex constrain on Beta 131 | beta = F.softmax(self.beta_batchnorm(self.beta), dim=1) 132 | topic_word = beta 133 | word_dist = torch.matmul(theta, beta) 134 | # word_dist: batch_size x input_size 135 | 136 | return self.prior_mean, self.prior_variance, \ 137 | posterior_mu, posterior_sigma, posterior_log_sigma, word_dist, topic_word, topic_doc 
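    # Illustrative smoke test (not part of the original module; the sizes below
    # are assumptions chosen only to show how forward() is wired). On a CUDA
    # machine the two inputs must be moved to the GPU first, since __init__
    # places the inference network and the priors on CUDA when it is available.
    #
    #   net = DecoderNetwork(input_size=2000, bert_size=768, infnet="combined",
    #                        n_components=10, model_type="prodLDA")
    #   x = torch.rand(32, 2000)        # batch of bag-of-words vectors
    #   x_bert = torch.rand(32, 768)    # batch of SBERT embeddings
    #   (prior_mean, prior_variance, posterior_mu, posterior_sigma,
    #    posterior_log_sigma, word_dist, topic_word, topic_doc) = net(x, x_bert)
    #   # word_dist: (32, 2000) with rows summing to 1; topic_doc: (32, 10)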
138 | 139 | def get_theta(self, x, x_bert): 140 | with torch.no_grad(): 141 | # batch_size x n_components 142 | posterior_mu, posterior_log_sigma = self.inf_net(x, x_bert) 143 | posterior_sigma = torch.exp(posterior_log_sigma) 144 | 145 | # generate samples from theta 146 | theta = F.softmax( 147 | self.reparameterize(posterior_mu, posterior_log_sigma), dim=1) 148 | 149 | return theta 150 | -------------------------------------------------------------------------------- /models/contextualized_topic_models/networks/inference_network.py: -------------------------------------------------------------------------------- 1 | """PyTorch class for feed foward inference network.""" 2 | 3 | from collections import OrderedDict 4 | from torch import nn 5 | import torch 6 | import numpy as np 7 | 8 | class ContextualInferenceNetwork(nn.Module): 9 | 10 | """Inference Network.""" 11 | 12 | def __init__(self, input_size, bert_size, output_size, hidden_sizes, 13 | activation='softplus', dropout=0.2): 14 | """ 15 | Initialize InferenceNetwork. 16 | 17 | Args 18 | input_size : int, dimension of input 19 | output_size : int, dimension of output 20 | hidden_sizes : tuple, length = n_layers 21 | activation : string, 'softplus' or 'relu', default 'softplus' 22 | dropout : float, default 0.2, default 0.2 23 | """ 24 | super(ContextualInferenceNetwork, self).__init__() 25 | assert isinstance(input_size, int), "input_size must by type int." 26 | assert isinstance(output_size, int), "output_size must be type int." 27 | assert isinstance(hidden_sizes, tuple), \ 28 | "hidden_sizes must be type tuple." 29 | assert activation in ['softplus', 'relu', 'sigmoid', 'tanh', 'leakyrelu', 30 | 'rrelu', 'elu', 'selu'], \ 31 | "activation must be 'softplus', 'relu', 'sigmoid', 'leakyrelu'," \ 32 | " 'rrelu', 'elu', 'selu' or 'tanh'." 33 | assert dropout >= 0, "dropout must be >= 0." 
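        # hidden_sizes is consumed pairwise below: e.g. hidden_sizes=(100, 100)
        # builds one 100 -> 100 hidden block on top of the adapted SBERT input.
        # Note that forward() of this "zeroshot" network only uses x_bert; the
        # bag-of-words input x is accepted but ignored.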
34 | 35 | self.input_size = input_size 36 | self.output_size = output_size 37 | self.hidden_sizes = hidden_sizes 38 | self.dropout = dropout 39 | 40 | if activation == 'softplus': 41 | self.activation = nn.Softplus() 42 | elif activation == 'relu': 43 | self.activation = nn.ReLU() 44 | elif activation == 'sigmoid': 45 | self.activation = nn.Sigmoid() 46 | elif activation == 'tanh': 47 | self.activation = nn.Tanh() 48 | elif activation == 'leakyrelu': 49 | self.activation = nn.LeakyReLU() 50 | elif activation == 'rrelu': 51 | self.activation = nn.RReLU() 52 | elif activation == 'elu': 53 | self.activation = nn.ELU() 54 | elif activation == 'selu': 55 | self.activation = nn.SELU() 56 | 57 | self.input_layer = nn.Linear(input_size+input_size, hidden_sizes[0]) 58 | self.adapt_bert = nn.Linear(bert_size, hidden_sizes[0]) 59 | 60 | self.hiddens = nn.Sequential(OrderedDict([ 61 | ('l_{}'.format(i), nn.Sequential(nn.Linear(h_in, h_out), self.activation)) 62 | for i, (h_in, h_out) in enumerate(zip(hidden_sizes[:-1], hidden_sizes[1:]))])) 63 | 64 | self.f_mu = nn.Linear(hidden_sizes[-1], output_size) 65 | self.f_mu_batchnorm = nn.BatchNorm1d(output_size, affine=False) 66 | 67 | self.f_sigma = nn.Linear(hidden_sizes[-1], output_size) 68 | self.f_sigma_batchnorm = nn.BatchNorm1d(output_size, affine=False) 69 | 70 | self.dropout_enc = nn.Dropout(p=self.dropout) 71 | 72 | def forward(self, x, x_bert): 73 | """Forward pass.""" 74 | x_bert = self.adapt_bert(x_bert) 75 | 76 | x = self.activation(x_bert) 77 | x = self.hiddens(x) 78 | x = self.dropout_enc(x) 79 | mu = self.f_mu_batchnorm(self.f_mu(x)) 80 | log_sigma = self.f_sigma_batchnorm(self.f_sigma(x)) 81 | 82 | return mu, log_sigma 83 | 84 | 85 | class CombinedInferenceNetwork(nn.Module): 86 | 87 | """Inference Network.""" 88 | 89 | def __init__(self, input_size, bert_size, output_size, hidden_sizes, 90 | activation='softplus', dropout=0.2): 91 | """ 92 | Initialize InferenceNetwork. 93 | 94 | Args 95 | input_size : int, dimension of input 96 | output_size : int, dimension of output 97 | hidden_sizes : tuple, length = n_layers 98 | activation : string, 'softplus' or 'relu', default 'softplus' 99 | dropout : float, default 0.2, default 0.2 100 | """ 101 | super(CombinedInferenceNetwork, self).__init__() 102 | assert isinstance(input_size, int), "input_size must by type int." 103 | assert (isinstance(output_size, int) or isinstance(output_size, np.int64)), "output_size must be type int." 104 | assert isinstance(hidden_sizes, tuple), \ 105 | "hidden_sizes must be type tuple." 106 | assert activation in ['softplus', 'relu', 'sigmoid', 'tanh', 'leakyrelu', 107 | 'rrelu', 'elu', 'selu'], \ 108 | "activation must be 'softplus', 'relu', 'sigmoid', 'leakyrelu'," \ 109 | " 'rrelu', 'elu', 'selu' or 'tanh'." 110 | 111 | assert dropout >= 0, "dropout must be >= 0." 
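        # Shape sketch (illustrative numbers, not from the original code): with
        # input_size=2000, bert_size=768, output_size=10 and hidden_sizes=(100, 100),
        # forward() projects x_bert to input_size, concatenates it with x, and maps
        # (batch, 4000) -> (batch, 100) -> (batch, 100) -> mu, log_sigma of shape (batch, 10).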
112 | 113 | self.input_size = input_size 114 | self.output_size = output_size 115 | self.hidden_sizes = hidden_sizes 116 | self.dropout = dropout 117 | 118 | if activation == 'softplus': 119 | self.activation = nn.Softplus() 120 | elif activation == 'relu': 121 | self.activation = nn.ReLU() 122 | elif activation == 'sigmoid': 123 | self.activation = nn.Sigmoid() 124 | elif activation == 'tanh': 125 | self.activation = nn.Tanh() 126 | elif activation == 'leakyrelu': 127 | self.activation = nn.LeakyReLU() 128 | elif activation == 'rrelu': 129 | self.activation = nn.RReLU() 130 | elif activation == 'elu': 131 | self.activation = nn.ELU() 132 | elif activation == 'selu': 133 | self.activation = nn.SELU() 134 | 135 | self.input_layer = nn.Linear(input_size+input_size, hidden_sizes[0]) 136 | self.adapt_bert = nn.Linear(bert_size, input_size) 137 | self.bert_layer = nn.Linear(hidden_sizes[0], hidden_sizes[0]) 138 | 139 | self.hiddens = nn.Sequential(OrderedDict([ 140 | ('l_{}'.format(i), nn.Sequential(nn.Linear(h_in, h_out), self.activation)) 141 | for i, (h_in, h_out) in enumerate(zip(hidden_sizes[:-1], hidden_sizes[1:]))])) 142 | 143 | self.f_mu = nn.Linear(hidden_sizes[-1], output_size) 144 | self.f_mu_batchnorm = nn.BatchNorm1d(output_size, affine=False) 145 | 146 | self.f_sigma = nn.Linear(hidden_sizes[-1], output_size) 147 | self.f_sigma_batchnorm = nn.BatchNorm1d(output_size, affine=False) 148 | 149 | self.dropout_enc = nn.Dropout(p=self.dropout) 150 | 151 | def forward(self, x, x_bert): 152 | """Forward pass.""" 153 | x_bert = self.adapt_bert(x_bert) 154 | x = torch.cat((x, x_bert), 1) 155 | x = self.input_layer(x) 156 | 157 | x = self.activation(x) 158 | x = self.hiddens(x) 159 | x = self.dropout_enc(x) 160 | mu = self.f_mu_batchnorm(self.f_mu(x)) 161 | log_sigma = self.f_sigma_batchnorm(self.f_sigma(x)) 162 | 163 | return mu, log_sigma 164 | -------------------------------------------------------------------------------- /models/contextualized_topic_models/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DSInCenter/pySTTM/9771af0dfa85a2996fbb90122ae22649cd076a51/models/contextualized_topic_models/utils/__init__.py -------------------------------------------------------------------------------- /models/contextualized_topic_models/utils/data_preparation.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sentence_transformers import SentenceTransformer 3 | import scipy.sparse 4 | import warnings 5 | from models.contextualized_topic_models.datasets.dataset import CTMDataset 6 | import os 7 | import pickle as pkl 8 | 9 | def get_bag_of_words(data, min_length): 10 | """ 11 | Creates the bag of words 12 | """ 13 | vect = [np.bincount(x[x != np.array(None)].astype('int'), minlength=min_length) 14 | for x in data if np.sum(x[x != np.array(None)]) != 0] 15 | 16 | vect = scipy.sparse.csr_matrix(vect) 17 | return vect 18 | 19 | def bert_embeddings_from_file(text_file, sbert_model_to_load, batch_size=200): 20 | """ 21 | Creates SBERT Embeddings from an input file 22 | """ 23 | model = SentenceTransformer(sbert_model_to_load) 24 | with open(text_file, encoding="utf-8") as filino: 25 | train_text = list(map(lambda x: x, filino.readlines())) 26 | 27 | return np.array(model.encode(train_text, show_progress_bar=True, batch_size=batch_size)) 28 | 29 | 30 | def bert_embeddings_from_list(texts, sbert_model_to_load="bert-base-nli-mean-tokens", batch_size=100): 
31 | """ 32 | Creates SBERT Embeddings from a list 33 | """ 34 | model = SentenceTransformer(sbert_model_to_load) 35 | return np.array(model.encode(texts, show_progress_bar=True, batch_size=batch_size)) 36 | 37 | 38 | class QuickText: 39 | """ 40 | Integrated class to handle all the text preprocessing needed 41 | """ 42 | def __init__(self, bert_model, text_for_bow, text_for_bert=None, bert_path=None): 43 | """ 44 | :param bert_model: string, bert model to use 45 | :param text_for_bert: list, list of sentences with the unpreprocessed text 46 | :param text_for_bow: list, list of sentences with the preprocessed text 47 | """ 48 | self.vocab_dict = {} 49 | self.vocab = [] 50 | self.index_dd = None 51 | self.idx2token = None 52 | self.bow = None 53 | self.bert_model = bert_model 54 | self.text_handler = "" 55 | self.data_bert = None 56 | self.text_for_bow = text_for_bow 57 | 58 | if text_for_bert is not None: 59 | self.text_for_bert = text_for_bert 60 | else: 61 | self.text_for_bert = None 62 | self.bert_path = bert_path 63 | 64 | def prepare_bow(self): 65 | indptr = [0] 66 | indices = [] 67 | data = [] 68 | vocabulary = {} 69 | 70 | if self.text_for_bow is not None: 71 | docs = self.text_for_bow 72 | else: 73 | docs = self.text_for_bert 74 | 75 | for d in docs: 76 | for term in d.split(): 77 | index = vocabulary.setdefault(term, len(vocabulary)) 78 | indices.append(index) 79 | data.append(1) 80 | indptr.append(len(indices)) 81 | 82 | self.vocab_dict = vocabulary 83 | self.vocab = list(vocabulary.keys()) 84 | 85 | warnings.simplefilter('always', DeprecationWarning) 86 | if len(self.vocab) > 2000: 87 | warnings.warn("The vocab you are using has more than 2000 words, reconstructing high-dimensional vectors requires" 88 | "significantly more training epochs and training samples. " 89 | "Consider reducing the number of vocabulary items. " 90 | "See https://github.com/MilaNLProc/contextualized-topic-models#preprocessing " 91 | "and https://github.com/MilaNLProc/contextualized-topic-models#tldr", Warning) 92 | 93 | self.idx2token = {v: k for (k, v) in self.vocab_dict.items()} 94 | self.bow = scipy.sparse.csr_matrix((data, indices, indptr), dtype=int) 95 | 96 | def load_contextualized_embeddings(self, embeddings): 97 | self.data_bert = embeddings 98 | 99 | def load_dataset(self): 100 | self.prepare_bow() 101 | if self.bert_path is not None: 102 | if os.path.exists(self.bert_path): 103 | self.data_bert = pkl.load(open(self.bert_path, 'r')) 104 | else: 105 | if self.data_bert is None: 106 | if self.text_for_bert is not None: 107 | self.data_bert = bert_embeddings_from_list(self.text_for_bert, self.bert_model) 108 | else: 109 | self.data_bert = bert_embeddings_from_list(self.text_for_bow, self.bert_model) 110 | pkl.dump(self.data_bert, open(self.bert_path, 'w')) 111 | 112 | training_dataset = CTMDataset(self.bow, self.data_bert, self.idx2token) 113 | return training_dataset 114 | 115 | class TextHandler: 116 | """ 117 | Class used to handle the text preparation and the BagOfWord 118 | """ 119 | def __init__(self, file_name=None, sentences=None): 120 | self.file_name = file_name 121 | self.sentences = sentences 122 | self.vocab_dict = {} 123 | self.vocab = [] 124 | self.index_dd = None 125 | self.idx2token = None 126 | self.bow = None 127 | 128 | warnings.simplefilter('always', DeprecationWarning) 129 | if len(self.vocab) > 2000: 130 | warnings.warn("TextHandler class is deprecated and will be removed in version 2.0. 
Use QuickText.", Warning) 131 | 132 | def prepare(self): 133 | indptr = [0] 134 | indices = [] 135 | data = [] 136 | vocabulary = {} 137 | 138 | if self.sentences is None and self.file_name is None: 139 | raise Exception("Sentences and file_names cannot both be none") 140 | 141 | if self.sentences is not None: 142 | docs = self.sentences 143 | elif self.file_name is not None: 144 | with open(self.file_name, encoding="utf-8") as filino: 145 | docs = filino.readlines() 146 | else: 147 | raise Exception("One parameter between sentences and file_name should be selected") 148 | 149 | for d in docs: 150 | for term in d.split(): 151 | index = vocabulary.setdefault(term, len(vocabulary)) 152 | indices.append(index) 153 | data.append(1) 154 | indptr.append(len(indices)) 155 | 156 | self.vocab_dict = vocabulary 157 | self.vocab = list(vocabulary.keys()) 158 | 159 | warnings.simplefilter('always', DeprecationWarning) 160 | if len(self.vocab) > 2000: 161 | warnings.warn("The vocab you are using has more than 2000 words, reconstructing high-dimensional vectors requires" 162 | "significantly more training epochs and training samples. " 163 | "Consider reducing the number of vocabulary items. " 164 | "See https://github.com/MilaNLProc/contextualized-topic-models#preprocessing " 165 | "and https://github.com/MilaNLProc/contextualized-topic-models#tldr", Warning) 166 | 167 | self.idx2token = {v: k for (k, v) in self.vocab_dict.items()} 168 | self.bow = scipy.sparse.csr_matrix((data, indices, indptr), dtype=int) 169 | -------------------------------------------------------------------------------- /models/contextualized_topic_models/utils/preprocessing.py: -------------------------------------------------------------------------------- 1 | from sklearn.feature_extraction.text import CountVectorizer 2 | import string 3 | from nltk.corpus import stopwords as stop_words 4 | import warnings 5 | 6 | class WhiteSpacePreprocessing(): 7 | """ 8 | Provides a very simple preprocessing script that filters infrequent tokens from text 9 | """ 10 | def __init__(self, documents, stopwords_language="english", vocabulary_size=2000): 11 | """ 12 | 13 | :param documents: list of strings 14 | :param stopwords_language: string of the language of the stopwords (see nltk stopwords) 15 | :param vocabulary_size: the number of most frequent words to include in the documents. Infrequent words will be discarded from the list of preprocessed documents 16 | """ 17 | self.documents = documents 18 | self.stopwords = set(stop_words.words(stopwords_language)) 19 | self.vocabulary_size = vocabulary_size 20 | 21 | def preprocess(self): 22 | """ 23 | Note that if after filtering some documents do not contain words we remove them. That is why we return also the 24 | list of unpreprocessed documents. 
25 | 26 | :return: preprocessed documents, unpreprocessed documents and the vocabulary list 27 | """ 28 | preprocessed_docs_tmp = self.documents 29 | preprocessed_docs_tmp = [doc.lower() for doc in preprocessed_docs_tmp] 30 | preprocessed_docs_tmp = [doc.translate( 31 | str.maketrans(string.punctuation, ' ' * len(string.punctuation))) for doc in preprocessed_docs_tmp] 32 | preprocessed_docs_tmp = [' '.join([w for w in doc.split() if len(w) > 0 and w not in self.stopwords]) 33 | for doc in preprocessed_docs_tmp] 34 | 35 | vectorizer = CountVectorizer(max_features=self.vocabulary_size, token_pattern=r'\b[a-zA-Z]{2,}\b') 36 | vectorizer.fit_transform(preprocessed_docs_tmp) 37 | vocabulary = set(vectorizer.get_feature_names()) 38 | preprocessed_docs_tmp = [' '.join([w for w in doc.split() if w in vocabulary]) 39 | for doc in preprocessed_docs_tmp] 40 | 41 | preprocessed_docs, unpreprocessed_docs = [], [] 42 | for i, doc in enumerate(preprocessed_docs_tmp): 43 | if len(doc) > 0: 44 | preprocessed_docs.append(doc) 45 | unpreprocessed_docs.append(self.documents[i]) 46 | 47 | return preprocessed_docs, unpreprocessed_docs, list(vocabulary) 48 | 49 | 50 | class SimplePreprocessing(WhiteSpacePreprocessing): 51 | def __init__(self, documents, stopwords_language="english"): 52 | super().__init__(documents, stopwords_language) 53 | warnings.simplefilter('always', DeprecationWarning) 54 | 55 | if self.__class__.__name__ == "CTM": 56 | 57 | warnings.warn("SimplePrepocessing is deprecated and will be removed in version 2.0, " 58 | "use WhiteSpacePreprocessing", DeprecationWarning) 59 | 60 | 61 | -------------------------------------------------------------------------------- /models/model.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | import os 3 | import numpy as np 4 | import json 5 | 6 | 7 | class AbstractModel(ABC): 8 | """ 9 | Class structure of a generic Topic Modeling implementation 10 | """ 11 | 12 | def __init__(self): 13 | """ 14 | Create a blank model to initialize 15 | """ 16 | self.hyperparameters = dict() 17 | 18 | def set_hyperparameters(self, **kwargs): 19 | """ 20 | Set model hyperparameters 21 | :param **kwargs: a dictionary of in the form {hyperparameter name: value} 22 | """ 23 | for key, value in kwargs.items(): 24 | self.hyperparameters[key] = value 25 | 26 | @abstractmethod 27 | def train_model(self, dataset, hyperparameters, top_words=10): 28 | """ 29 | Train the model. 30 | :param dataset: Dataset 31 | :param hyperparameters: dictionary in the form {hyperparameter name: value} 32 | :param top_words: number of top significant words for each topic (default: 10) 33 | :return model_output: a dictionary containing up to 4 keys: *topics*, *topic-word-matrix*, 34 | *topic-document-matrix*, *test-topic-document-matrix*. *topics* is the list of the most significant words for 35 | each topic (list of lists of strings). *topic-word-matrix* is the matrix (num topics x ||vocabulary||) 36 | containing the probabilities of a word in a given topic. *topic-document-matrix* is the matrix (||topics|| x 37 | ||training documents||) containing the probabilities of the topics in a given training document. 38 | *test-topic-document-matrix* is the matrix (||topics|| x ||testing documents||) containing the probabilities 39 | of the topics in a given testing document. 
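        A minimal illustrative value of model_output for a two-topic model (shapes are
        examples only): {"topics": [["word1", "word2"], ["word3", "word4"]],
        "topic-word-matrix": array of shape (2, ||vocabulary||),
        "topic-document-matrix": array of shape (2, ||training documents||)}.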
40 | """ 41 | pass 42 | 43 | 44 | def save_model_output(model_output, path=os.curdir, appr_order=7): 45 | """ 46 | Saves the model output in the chosen directory 47 | :param model_output: output of the model 48 | :param path: path in which the file will be saved and name of the file 49 | :param appr_order: approximation order (used to round model_output values) 50 | """ 51 | 52 | to_save = {} 53 | try: 54 | for single_output in model_output.keys(): 55 | if single_output != "topics" and single_output != "test-topics": 56 | to_save[single_output] = ( 57 | model_output[single_output].round(appr_order)) 58 | else: 59 | to_save[single_output] = (model_output[single_output]) 60 | np.savez_compressed(path, **to_save) 61 | except: 62 | raise Exception("error in saving the output model file") 63 | 64 | 65 | def load_model_output(output_path, vocabulary_path=None, top_words=10): 66 | """ 67 | Loads a model output from the choosen directory 68 | Parameters 69 | ---------- 70 | :param output_path: path in which th model output is saved 71 | :param vocabulary_path: path in which the vocabulary is saved (optional, used to retrieve the top k words of each 72 | topic) 73 | :param top_words: top k words to retrieve for each topic (in case a vocabulary path is given) 74 | """ 75 | output = dict(np.load(output_path, allow_pickle=True)) 76 | if vocabulary_path is not None: 77 | vocabulary_file = open(vocabulary_path, 'r') 78 | vocabulary = json.load(vocabulary_file) 79 | index2vocab = vocabulary 80 | 81 | topics_output = [] 82 | for topic in output["topic-word-matrix"]: 83 | top_k = np.argsort(topic)[-top_words:] 84 | top_k_words = list( 85 | reversed([[index2vocab[str(i)], float(topic[i])] for i in top_k])) 86 | topics_output.append(top_k_words) 87 | 88 | output["topic-word-matrix"] = output["topic-word-matrix"].tolist() 89 | output["topic-document-matrix"] = output["topic-document-matrix"].tolist() 90 | if "test-topic-word-matrix" in output: 91 | output["test-topic-word-matrix"] = output["test-topic-word-matrix"].tolist() 92 | if "test-topic-document-matrix" in output: 93 | output["test-topic-document-matrix"] = output["test-topic-document-matrix"].tolist() 94 | 95 | output["topics"] = topics_output 96 | return output -------------------------------------------------------------------------------- /models/pytorchtools.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | class EarlyStopping: 5 | """Early stops the training if validation loss doesn't improve after a given patience.""" 6 | def __init__(self, patience=7, verbose=False, delta=0, path='checkpoint.pt', trace_func=print): 7 | """ 8 | Args: 9 | patience (int): How long to wait after last time validation loss improved. 10 | Default: 7 11 | verbose (bool): If True, prints a message for each validation loss improvement. 12 | Default: False 13 | delta (float): Minimum change in the monitored quantity to qualify as an improvement. 14 | Default: 0 15 | path (str): Path for the checkpoint to be saved to. 16 | Default: 'checkpoint.pt' 17 | trace_func (function): trace print function. 
18 | Default: print 19 | """ 20 | self.patience = patience 21 | self.verbose = verbose 22 | self.counter = 0 23 | self.best_score = None 24 | self.early_stop = False 25 | self.val_loss_min = np.Inf 26 | self.delta = delta 27 | self.path = path 28 | self.trace_func = trace_func 29 | 30 | def __call__(self, val_loss, model): 31 | 32 | score = -val_loss 33 | 34 | if self.best_score is None: 35 | self.best_score = score 36 | self.save_checkpoint(val_loss, model) 37 | elif score < self.best_score + self.delta: 38 | self.counter += 1 39 | if self.verbose: 40 | self.trace_func(f'EarlyStopping counter: {self.counter} out of {self.patience}') 41 | if self.counter >= self.patience: 42 | self.early_stop = True 43 | else: 44 | self.best_score = score 45 | self.save_checkpoint(val_loss, model) 46 | self.counter = 0 47 | 48 | def save_checkpoint(self, val_loss, model): 49 | '''Saves model when validation loss decrease.''' 50 | if self.verbose: 51 | self.trace_func(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}). Saving model ...') 52 | torch.save(model.state_dict(), self.path) 53 | self.val_loss_min = val_loss -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | snscrape 2 | hazm 3 | bitermplus==0.6.12 4 | Cython==0.29.30 5 | joblib==1.1.0 6 | numpy==1.22.4 7 | pandas==1.4.2 8 | python-dateutil==2.8.2 9 | pytz==2022.1 10 | scikit-learn==1.1.1 11 | scipy==1.8.1 12 | six==1.16.0 13 | threadpoolctl==3.1.0 14 | tqdm==4.64.0 15 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.md -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from distutils.core import setup 2 | 3 | setup( 4 | name = 'my_python_package', 5 | packages = ['my_python_package'], 6 | version = 'version number', # Ideally should be same as your GitHub release tag varsion 7 | description = 'description', 8 | author = '', 9 | author_email = '', 10 | url = 'github package source url', 11 | download_url = 'download link you saved', 12 | keywords = ['tag1', 'tag2'], 13 | classifiers = [], 14 | ) -------------------------------------------------------------------------------- /stop_words/stop_words.txt: -------------------------------------------------------------------------------- 1 | ! 2 | " 3 | # 4 | ( 5 | ) 6 | * 7 | , 8 | - 9 | . 
10 | / 11 | : 12 | [ 13 | ] 14 | « 15 | » 16 | ، 17 | ؛ 18 | ؟ 19 | آباد 20 | آخ 21 | آخر 22 | آخرها 23 | آخه 24 | آدمهاست 25 | آرام 26 | آرام آرام 27 | آره 28 | آری 29 | آزادانه 30 | آسان 31 | آسیب پذیرند 32 | آشنایند 33 | آشکارا 34 | آقا 35 | آقای 36 | آقایان 37 | آمد 38 | آمدن 39 | آمده 40 | آمرانه 41 | آن 42 | آن گاه 43 | آنان 44 | آنانی 45 | آنجا 46 | آنرا 47 | آنطور 48 | آنقدر 49 | آنها 50 | آنهاست 51 | آنچنان 52 | آنچنان که 53 | اونجور 54 | اونجوری 55 | اونجوری که 56 | آنچه 57 | آنکه 58 | آنگاه 59 | آن‌ها 60 | آهان 61 | آهای 62 | آور 63 | آورد 64 | آوردن 65 | آورده 66 | آوه 67 | آی 68 | آیا 69 | آید 70 | آیند 71 | ا 72 | اتفاقا 73 | اثرِ 74 | اجراست 75 | احتراما 76 | احتمالا 77 | احیاناً 78 | اخیر 79 | اخیراً 80 | اری 81 | از 82 | از آن پس 83 | از بس که 84 | از جمله 85 | ازاین رو 86 | ازجمله 87 | ازش 88 | اساسا 89 | اساساً 90 | است 91 | استفاد 92 | استفاده 93 | اسلامی اند 94 | اش 95 | اشتباها 96 | اشکارا 97 | اصلا 98 | اصلاً 99 | اصولا 100 | اصولاً 101 | اعلام 102 | اغلب 103 | افزود 104 | افسوس 105 | اقل 106 | اقلیت 107 | الا 108 | الان 109 | البته 110 | البتّه 111 | الهی 112 | الی 113 | ام 114 | اما 115 | امروز 116 | امروزه 117 | امسال 118 | امشب 119 | امور 120 | امیدوارم 121 | امیدوارند 122 | امیدواریم 123 | ان 124 | ان شاأالله 125 | انشالا 126 | انتها 127 | انجام 128 | اند 129 | اندکی 130 | انشاالله 131 | انصافا 132 | انطور 133 | انقدر 134 | انها 135 | انچنان 136 | انکه 137 | انگار 138 | او 139 | اوست 140 | اول 141 | اولا 142 | اولاً 143 | اولین 144 | اون 145 | اکثر 146 | اکثرا 147 | اکثراً 148 | اکثریت 149 | اکنون 150 | اگر 151 | اگر چه 152 | اگرچه 153 | اگه 154 | ای 155 | ایا 156 | اید 157 | ایشان 158 | ایم 159 | این 160 | این جوری 161 | این قدر 162 | این گونه 163 | اینان 164 | اینجا 165 | اینجاست 166 | ایند 167 | اینطور 168 | اینقدر 169 | اینها 170 | اینهاست 171 | اینو 172 | اینچنین 173 | اینک 174 | اینکه 175 | اینگونه 176 | ب 177 | با 178 | بااین حال 179 | بااین وجود 180 | باد 181 | بار 182 | بارة 183 | باره 184 | بارها 185 | باز 186 | باز هم 187 | بازهم 188 | بازی کنان 189 | بازیگوشانه 190 | باش 191 | باشد 192 | باشم 193 | باشند 194 | باشی 195 | باشید 196 | باشیم 197 | بالا 198 | بالاخره 199 | بالاخص 200 | بالاست 201 | بالای 202 | بالایِ 203 | بالطبع 204 | بالعکس 205 | باوجودی که 206 | باورند 207 | باید 208 | بتدریج 209 | بتوان 210 | بتواند 211 | بتوانی 212 | بتوانیم 213 | بجز 214 | بخش 215 | بخشه 216 | بخشی 217 | بخصوص 218 | بخواه 219 | بخواهد 220 | بخواهم 221 | بخواهند 222 | بخواهی 223 | بخواهید 224 | بخواهیم 225 | بخوبی 226 | بد 227 | بدان 228 | بدانجا 229 | بدانها 230 | بدهید 231 | بدون 232 | بدین 233 | بدین ترتیب 234 | بدینجا 235 | بر 236 | برآنند 237 | برا 238 | برابر 239 | برابرِ 240 | براحتی 241 | براساس 242 | براستی 243 | برای 244 | برایت 245 | برایش 246 | برایشان 247 | برایم 248 | برایمان 249 | برایِ 250 | برخوردار 251 | برخوردارند 252 | برخی 253 | برداری 254 | برعکس 255 | برنامه سازهاست 256 | بروز 257 | بروشنی 258 | بزرگ 259 | بزودی 260 | بس 261 | بسا 262 | بسادگی 263 | بسختی 264 | بسوی 265 | بسی 266 | بسیار 267 | بسیاری 268 | بشدت 269 | بطور 270 | بطوری که 271 | بعد 272 | بعد از این که 273 | بعدا 274 | بعدازظهر 275 | بعداً 276 | بعدها 277 | بعری 278 | بعضا 279 | بعضی 280 | بعضی شان 281 | بعضیهایشان 282 | بعضی‌ها 283 | بعلاوه 284 | بعید 285 | بفهمی نفهمی 286 | بلافاصله 287 | بله 288 | بلکه 289 | بلی 290 | بماند 291 | بنابراین 292 | بندی 293 | به 294 | به آسانی 295 | به تازگی 296 | به تدریج 297 | به تمامی 298 | به جای 299 | به جز 300 | به خوبی 301 | به درشتی 302 | به دلخواه 303 | به راستی 304 | به رغم 305 | به روشنی 306 | به زودی 307 | به سادگی 308 | به سرعت 309 | 
به شان 310 | به شدت 311 | به طور کلی 312 | به طوری که 313 | به علاوه 314 | به قدری 315 | به مراتب 316 | به ناچار 317 | به هرحال 318 | به هیچ وجه 319 | به وضوح 320 | به ویژه 321 | به کرات 322 | به گرمی 323 | بهت 324 | بهتر 325 | بهترین 326 | بهش 327 | بود 328 | بودم 329 | بودن 330 | بودند 331 | بوده 332 | بودی 333 | بودید 334 | بودیم 335 | بویژه 336 | بپا 337 | بکار 338 | بکن 339 | بکند 340 | بکنم 341 | بکنند 342 | بکنی 343 | بکنید 344 | بکنیم 345 | بگذاریم 346 | بگو 347 | بگوید 348 | بگویم 349 | بگویند 350 | بگویی 351 | بگویید 352 | بگوییم 353 | بگیر 354 | بگیرد 355 | بگیرم 356 | بگیرند 357 | بگیری 358 | بگیرید 359 | بگیریم 360 | بی 361 | بی آنکه 362 | بی اطلاعند 363 | بی تردید 364 | بی تفاوتند 365 | بی نیازمندانه 366 | بی هدف 367 | بیا 368 | بیاب 369 | بیابد 370 | بیابم 371 | بیابند 372 | بیابی 373 | بیابید 374 | بیابیم 375 | بیاور 376 | بیاورد 377 | بیاورم 378 | بیاورند 379 | بیاوری 380 | بیاورید 381 | بیاوریم 382 | بیاید 383 | بیایم 384 | بیایند 385 | بیایی 386 | بیایید 387 | بیاییم 388 | بیرون 389 | بیرونِ 390 | بیست 391 | بیش 392 | بیشتر 393 | بیشتری 394 | بین 395 | بیگمان 396 | ت 397 | تا 398 | تازه 399 | تان 400 | تاکنون 401 | تحت 402 | تحریم هاست 403 | تر 404 | تر براساس 405 | تریلیارد 406 | تریلیون 407 | ترین 408 | تصریحاً 409 | تعدادی 410 | تعمدا 411 | تقریبا 412 | تقریباً 413 | تلویحا 414 | تلویحاً 415 | تمام 416 | تمام قد 417 | تماما 418 | تمامشان 419 | تمامی 420 | تند تند 421 | تنها 422 | تو 423 | توؤماً 424 | توان 425 | تواند 426 | توانست 427 | توانستم 428 | توانستن 429 | توانستند 430 | توانسته 431 | توانستی 432 | توانستیم 433 | توانم 434 | توانند 435 | توانی 436 | توانید 437 | توانیم 438 | توسط 439 | تولِ 440 | توی 441 | تویِ 442 | تک تک 443 | ث 444 | ثالثاً 445 | ثانیا 446 | ثانیاً 447 | ج 448 | جا 449 | جای 450 | جایی 451 | جدا 452 | جداً 453 | جداگانه 454 | جدید 455 | جدیدا 456 | جرمزاست 457 | جریان 458 | جز 459 | جلو 460 | جلوگیری 461 | جلوی 462 | جلویِ 463 | جمع اند 464 | جمعا 465 | جمعی 466 | جنابعالی 467 | جناح 468 | جنس اند 469 | جهت 470 | جور 471 | ح 472 | حاشیه‌ای 473 | حاضر 474 | حاضرم 475 | حال 476 | حالا 477 | حاکیست 478 | حتما 479 | حتماً 480 | حتی 481 | حداقل 482 | حداکثر 483 | حدود 484 | حدودا 485 | حدودِ 486 | حسابگرانه 487 | حضرتعالی 488 | حق 489 | حقیرانه 490 | حقیقتا 491 | حول 492 | حکماً 493 | خ 494 | خارجِ 495 | خالصانه 496 | خب 497 | خداحافظ 498 | خداست 499 | خدمات 500 | خسته‌ای 501 | خصوصا 502 | خصوصاً 503 | خلاصه 504 | خواست 505 | خواستم 506 | خواستن 507 | خواستند 508 | خواسته 509 | خواستی 510 | خواستید 511 | خواستیم 512 | خواه 513 | خواهد 514 | خواهم 515 | خواهند 516 | خواهی 517 | خواهید 518 | خواهیم 519 | خوب 520 | خود 521 | خود به خود 522 | خودبه خودی 523 | خودت 524 | خودتان 525 | خودتو 526 | خودش 527 | خودشان 528 | خودم 529 | خودمان 530 | خودمو 531 | خوش 532 | خوشبختانه 533 | خویش 534 | خویشتن 535 | خویشتنم 536 | خیاه 537 | خیر 538 | خیره 539 | خیلی 540 | د 541 | دا 542 | داام 543 | دااما 544 | داخل 545 | داد 546 | دادم 547 | دادن 548 | دادند 549 | داده 550 | دادی 551 | دادید 552 | دادیم 553 | دار 554 | داراست 555 | دارد 556 | دارم 557 | دارند 558 | داری 559 | دارید 560 | داریم 561 | داشت 562 | داشتم 563 | داشتن 564 | داشتند 565 | داشته 566 | داشتی 567 | داشتید 568 | داشتیم 569 | دامم 570 | دانست 571 | دانند 572 | دایم 573 | دایما 574 | در 575 | در باره 576 | در بارهٌ 577 | در ثانی 578 | در مجموع 579 | در نهایت 580 | در واقع 581 | در کل 582 | در کنار 583 | دراین میان 584 | درباره 585 | درحالی که 586 | درحالیکه 587 | درست 588 | درست و حسابی 589 | درسته 590 | درصورتی که 591 | درعین حال 592 | درمجموع 593 | درواقع 594 | درون 595 | دریغ 596 | دریغا 
597 | درین 598 | دسته دسته 599 | دشمنیم 600 | دقیقا 601 | دم 602 | دنبالِ 603 | ده 604 | دهد 605 | دهم 606 | دهند 607 | دهی 608 | دهید 609 | دهیم 610 | دو 611 | دو روزه 612 | دوباره 613 | دوم 614 | دیده 615 | دیر 616 | دیرت 617 | دیرم 618 | دیروز 619 | دیشب 620 | دیوانه‌ای 621 | دیوی 622 | دیگر 623 | دیگران 624 | دیگری 625 | دیگه 626 | ذ 627 | ذاتاً 628 | ر 629 | را 630 | راجع به 631 | راحت 632 | راسا 633 | راست 634 | راستی 635 | راه 636 | رسما 637 | رسید 638 | رسیده 639 | رشته 640 | رفت 641 | رفتارهاست 642 | رفته 643 | رنجند 644 | رهگشاست 645 | رو 646 | رواست 647 | روب 648 | روبروست 649 | روز 650 | روز به روز 651 | روزانه 652 | روزه ایم 653 | روزه ست 654 | روزه م 655 | روزهای 656 | روزه‌ای 657 | روش 658 | روی 659 | رویش 660 | رویِ 661 | ریزی 662 | ز 663 | زشتکارانند 664 | زمان 665 | زمانی 666 | زمینه 667 | زنند 668 | زهی 669 | زود 670 | زودتر 671 | زیاد 672 | زیاده 673 | زیر 674 | زیرا 675 | زیرِ 676 | زیرچشمی 677 | س 678 | سابق 679 | ساخته 680 | ساده اند 681 | سازی 682 | سالانه 683 | سالته 684 | سالم‌تر 685 | سالهاست 686 | سالیانه 687 | ساکنند 688 | سایر 689 | سخت 690 | سخته 691 | سر 692 | سراسر 693 | سرانجام 694 | سراپا 695 | سری 696 | سریع 697 | سریعا 698 | سریعاً 699 | سریِ 700 | سعی 701 | سمتِ 702 | سه باره 703 | سهواً 704 | سوم 705 | سوی 706 | سویِ 707 | سپس 708 | سیاه چاله هاست 709 | سیخ 710 | ش 711 | شان 712 | شاهدند 713 | شاهدیم 714 | شاید 715 | شبهاست 716 | شخصا 717 | شخصاً 718 | شد 719 | شدم 720 | شدن 721 | شدند 722 | شده 723 | شدی 724 | شدید 725 | شدیدا 726 | شدیداً 727 | شدیم 728 | شش 729 | شش نداشته 730 | شما 731 | شماری 732 | شماست 733 | شمایند 734 | شناسی 735 | شو 736 | شود 737 | شوراست 738 | شوقم 739 | شوم 740 | شوند 741 | شونده 742 | شوی 743 | شوید 744 | شویم 745 | شیرین 746 | شیرینه 747 | شیک 748 | ص 749 | صد 750 | صددرصد 751 | صرفا 752 | صرفاً 753 | صریحاً 754 | صندوق هاست 755 | صورت 756 | ض 757 | ضدِّ 758 | ضدِّ 759 | ضمن 760 | ضمناً 761 | ط 762 | طبعا 763 | طبعاً 764 | طبقِ 765 | طبیعتا 766 | طرف 767 | طریق 768 | طلبکارانه 769 | طور 770 | طی 771 | ظ 772 | ظاهرا 773 | ظاهراً 774 | ع 775 | عاجزانه 776 | عاقبت 777 | عبارتند 778 | عجب 779 | عجولانه 780 | عدم 781 | عرفانی 782 | عقب 783 | عقبِ 784 | علاوه بر 785 | علاوه بر آن 786 | علاوه برآن 787 | علناً 788 | علّتِ 789 | علی الظاهر 790 | علی رغم 791 | علیرغم 792 | علیه 793 | عمدا 794 | عمداً 795 | عمدتا 796 | عمدتاً 797 | عمده 798 | عمل 799 | عملا 800 | عملاً 801 | عملی اند 802 | عموم 803 | عموما 804 | عموماً 805 | عنقریب 806 | عنوان 807 | عنوانِ 808 | عیناً 809 | غ 810 | غالبا 811 | غزالان 812 | غیر 813 | غیرقانونی 814 | ف 815 | فاقد 816 | فبها 817 | فر 818 | فردا 819 | فعلا 820 | فعلاً 821 | فقط 822 | فلان 823 | فلذا 824 | فوق 825 | فکر 826 | ق 827 | قاالند 828 | قابل 829 | قاطبه 830 | قاطعانه 831 | قاعدتاً 832 | قانوناً 833 | قبل 834 | قبلا 835 | قبلاً 836 | قبلند 837 | قدر 838 | قدری 839 | قصدِ 840 | قضایاست 841 | قطعا 842 | قطعاً 843 | ل 844 | لااقل 845 | لاجرم 846 | لب 847 | لذا 848 | لزوماً 849 | لطفا 850 | لطفاً 851 | لیکن 852 | م 853 | ما 854 | مادامی 855 | ماست 856 | مامان مامان گویان 857 | مان 858 | مانند 859 | مانندِ 860 | مبادا 861 | متؤسفانه 862 | متاسفانه 863 | متعاقبا 864 | متفاوتند 865 | مثل 866 | مثلا 867 | مثلِ 868 | مجانی 869 | مجبورند 870 | مجددا 871 | مجدداً 872 | مجموعا 873 | مجموعاً 874 | محتاجند 875 | محکم 876 | محکم‌تر 877 | مخالفند 878 | مختلف 879 | مخصوصاً 880 | مدام 881 | مدت 882 | مدتهاست 883 | مدّتی 884 | مذهبی اند 885 | مرا 886 | مرتب 887 | مردانه 888 | مردم 889 | مردم اند 890 | مرسی 891 | مستحضرید 892 | مستقیما 893 | مستند 894 | مسلما 895 | مشت 896 | مشترکاً 897 | مشغولند 898 | مطمانا 899 
| مطمانم 900 | مطمینا 901 | مع الاسف 902 | مع ذلک 903 | معتقدم 904 | معتقدند 905 | معتقدیم 906 | معدود 907 | معذوریم 908 | معلومه 909 | معمولا 910 | معمولاً 911 | معمولی 912 | مغرضانه 913 | مفیدند 914 | مقابل 915 | مقدار 916 | مقصرند 917 | مقصری 918 | ملیارد 919 | ملیون 920 | ممکن 921 | ممیزیهاست 922 | من 923 | منتهی 924 | منطقی 925 | منی 926 | مواجهند 927 | موارد 928 | موجودند 929 | مورد 930 | موقتا 931 | مکرر 932 | مکرراً 933 | مگر 934 | مگر آن که 935 | مگر این که 936 | مگو 937 | می 938 | میان 939 | میزان 940 | میلیارد 941 | میلیون 942 | میکند 943 | میکنم 944 | میکنند 945 | میکنی 946 | میکنید 947 | میکنیم 948 | می‌تواند 949 | می‌خواهیم 950 | می‌داند 951 | می‌رسد 952 | می‌رود 953 | می‌شود 954 | می‌کنم 955 | می‌کنند 956 | می‌کنیم 957 | ن 958 | ناامید 959 | ناخواسته 960 | ناراضی اند 961 | ناشی 962 | نام 963 | ناگاه 964 | ناگزیر 965 | ناگهان 966 | ناگهانی 967 | نباید 968 | نبش 969 | نبود 970 | نخست 971 | نخستین 972 | نخواهد 973 | نخواهم 974 | نخواهند 975 | نخواهی 976 | نخواهید 977 | نخواهیم 978 | نخودی 979 | ندارد 980 | ندارم 981 | ندارند 982 | نداری 983 | ندارید 984 | نداریم 985 | نداشت 986 | نداشتم 987 | نداشتند 988 | نداشته 989 | نداشتی 990 | نداشتید 991 | نداشتیم 992 | نزد 993 | نزدِ 994 | نزدیک 995 | نزدیکِ 996 | نسبتا 997 | نشان 998 | نشده 999 | نظیر 1000 | نفرند 1001 | نماید 1002 | نموده 1003 | نمی 1004 | نمی‌شود 1005 | نمی‌کند 1006 | نه 1007 | نه تنها 1008 | نهایتا 1009 | نهایتاً 1010 | نوع 1011 | نوعاً 1012 | نوعی 1013 | نکرده 1014 | نکن 1015 | نکند 1016 | نکنم 1017 | نکنند 1018 | نکنی 1019 | نکنید 1020 | نکنیم 1021 | نگاه 1022 | نگو 1023 | نیازمندند 1024 | نیز 1025 | نیست 1026 | نیستم 1027 | نیستند 1028 | نیستیم 1029 | نیمی 1030 | ه 1031 | ها 1032 | های 1033 | هایی 1034 | هبچ 1035 | هر 1036 | هر از گاهی 1037 | هر چند 1038 | هر چند که 1039 | هر چه 1040 | هرچند 1041 | هرچه 1042 | هرکس 1043 | هرگاه 1044 | هرگز 1045 | هزار 1046 | هست 1047 | هستم 1048 | هستند 1049 | هستی 1050 | هستید 1051 | هستیم 1052 | هفت 1053 | هق هق کنان 1054 | هم 1055 | هم اکنون 1056 | هم اینک 1057 | همان 1058 | همان طور که 1059 | همان گونه که 1060 | همانا 1061 | همانند 1062 | همانها 1063 | همدیگر 1064 | همزمان 1065 | همه 1066 | همه روزه 1067 | همه ساله 1068 | همه شان 1069 | همهٌ 1070 | همه‌اش 1071 | همواره 1072 | همچنان 1073 | همچنان که 1074 | همچنین 1075 | همچون 1076 | همچین 1077 | همگان 1078 | همگی 1079 | همیشه 1080 | همین 1081 | همین که 1082 | هنوز 1083 | هنگام 1084 | هنگامِ 1085 | هنگامی 1086 | هنگامی که 1087 | هوی 1088 | هی 1089 | هیچ 1090 | هیچ گاه 1091 | هیچکدام 1092 | هیچکس 1093 | هیچگاه 1094 | هیچگونه 1095 | هیچی 1096 | و 1097 | و لا غیر 1098 | وابسته اند 1099 | واقعا 1100 | واقعاً 1101 | واقعی 1102 | واقفند 1103 | واما 1104 | وای 1105 | وجود 1106 | وحشت زده 1107 | وسطِ 1108 | وضع 1109 | وقتی 1110 | وقتی که 1111 | وقتیکه 1112 | ولی 1113 | وگرنه 1114 | وگو 1115 | وی 1116 | ویا 1117 | ویژه 1118 | ّه 1119 | ٪ 1120 | پ 1121 | پارسال 1122 | پارسایانه 1123 | پاره‌ای 1124 | پاعینِ 1125 | پایین ترند 1126 | پدرانه 1127 | پرسان 1128 | پروردگارا 1129 | پریروز 1130 | پس 1131 | پس از 1132 | پس فردا 1133 | پشت 1134 | پشتوانه اند 1135 | پشیمونی 1136 | پنج 1137 | پهن شده 1138 | پی 1139 | پی درپی 1140 | پیدا 1141 | پیداست 1142 | پیرامون 1143 | پیش 1144 | پیشاپیش 1145 | پیشتر 1146 | پیشِ 1147 | پیوسته 1148 | چ 1149 | چاپلوسانه 1150 | چت 1151 | چته 1152 | چرا 1153 | چرا که 1154 | چشم بسته 1155 | چطور 1156 | چقدر 1157 | چنان 1158 | چنانچه 1159 | چنانکه 1160 | چند 1161 | چند روزه 1162 | چندان 1163 | چنده 1164 | چندین 1165 | چنین 1166 | چه 1167 | چه بسا 1168 | چه طور 1169 | چهار 1170 | چو 1171 | چون 1172 | چکار 1173 | چگونه 
1174 | چی 1175 | چیز 1176 | چیزی 1177 | چیزیست 1178 | چیست 1179 | چیه 1180 | ژ 1181 | ک 1182 | کارند 1183 | کاش 1184 | کاشکی 1185 | کامل 1186 | کاملا 1187 | کاملاً 1188 | کتبا 1189 | کجا 1190 | کجاست 1191 | کدام 1192 | کرد 1193 | کردم 1194 | کردن 1195 | کردند 1196 | کرده 1197 | کردی 1198 | کردید 1199 | کردیم 1200 | کس 1201 | کسانی 1202 | کسی 1203 | کل 1204 | کلا 1205 | کلی 1206 | کلیه 1207 | کم 1208 | کم کم 1209 | کمااینکه 1210 | کماکان 1211 | کمتر 1212 | کمتره 1213 | کمتری 1214 | کمی 1215 | کن 1216 | کنار 1217 | کنارش 1218 | کنارِ 1219 | کنایه‌ای 1220 | کند 1221 | کنم 1222 | کنند 1223 | کننده 1224 | کنون 1225 | کنونی 1226 | کنی 1227 | کنید 1228 | کنیم 1229 | که 1230 | کو 1231 | کَی 1232 | کی 1233 | گ 1234 | گاه 1235 | گاهی 1236 | گذاری 1237 | گذاشته 1238 | گذشته 1239 | گردد 1240 | گردند 1241 | گرفت 1242 | گرفتارند 1243 | گرفتم 1244 | گرفتن 1245 | گرفتند 1246 | گرفته 1247 | گرفتی 1248 | گرفتید 1249 | گرفتیم 1250 | گروهی 1251 | گرچه 1252 | گفت 1253 | گفتم 1254 | گفتن 1255 | گفتند 1256 | گفته 1257 | گفتی 1258 | گفتید 1259 | گفتیم 1260 | گه 1261 | گهگاه 1262 | گو 1263 | گونه 1264 | گوی 1265 | گویا 1266 | گوید 1267 | گویم 1268 | گویند 1269 | گویی 1270 | گویید 1271 | گوییم 1272 | گیر 1273 | گیرد 1274 | گیرم 1275 | گیرند 1276 | گیری 1277 | گیرید 1278 | گیریم 1279 | ی 1280 | یا 1281 | یاب 1282 | یابد 1283 | یابم 1284 | یابند 1285 | یابی 1286 | یابید 1287 | یابیم 1288 | یارب 1289 | یافت 1290 | یافتم 1291 | یافتن 1292 | یافته 1293 | یافتی 1294 | یافتید 1295 | یافتیم 1296 | یعنی 1297 | یقینا 1298 | یقیناً 1299 | یه 1300 | یواش یواش 1301 | یک 1302 | یک جوری 1303 | یک کم 1304 | یک کمی 1305 | یکدیگر 1306 | یکریز 1307 | یکسال 1308 | یکهزار 1309 | یکی 1310 | ۰ 1311 | ۱ 1312 | ۲ 1313 | ۳ 1314 | ۴ 1315 | ۵ 1316 | ۶ 1317 | ۷ 1318 | ۸ 1319 | ۹ 1320 | … 1321 | و 1322 | ‏‏‏علاقه مند 1323 | میخونم 1324 | میخوانم 1325 | می خوانم 1326 | میخونید 1327 | میخوانید 1328 | می خوانید 1329 | در آینده 1330 | بشم 1331 | بشی 1332 | بشید 1333 | بشین 1334 | یک چیزی 1335 | بهتون 1336 | اینم 1337 | بیفته 1338 | محض رضای خدا 1339 | هیچوقت 1340 | دونستن 1341 | میفرستین 1342 | میفرستی 1343 | میفرستم 1344 | عه 1345 | هستش 1346 | همه‌مون 1347 | همه مون 1348 | جدی 1349 | بدجور 1350 | بد جور 1351 | خداروشکر 1352 | شی 1353 | وجدانا 1354 | روم 1355 | بگین 1356 | هیچ جور 1357 | هیچجور 1358 | هیچ‌جور 1359 | مثل اینکه 1360 | دوهزاری 1361 | هستا 1362 | شون 1363 | هامو 1364 | هام رو 1365 | مارو 1366 | ما رو 1367 | رو 1368 | داره 1369 | این دفعه 1370 | دفعه -------------------------------------------------------------------------------- /stop_words/swear_words.txt: -------------------------------------------------------------------------------- 1 | آب کیر 2 | آشغال 3 | آلت تناسلی 4 | آلت 5 | ابله 6 | ابن یزید 7 | احمق 8 | اسب 9 | اسبی 10 | اسکل 11 | اسکل 12 | اسگل 13 | اسگول 14 | الاغ 15 | الاق 16 | انگل 17 | انی 18 | انی 19 | اوسکل 20 | اوسکل 21 | اوسگل 22 | اوصکل 23 | اوصگل 24 | ب ک 25 | باسن 26 | بخورش 27 | بدبخت 28 | بمال 29 | تخمم 30 | کیرم 31 | بپرروش 32 | بپرسرش 33 | کونی 34 | بکارت 35 | بکن توش 36 | بکنش 37 | بکنمت 38 | خایه 39 | بی عفت 40 | بی غیرت 41 | بی ناموس 42 | بی پدر 43 | بیابخورش 44 | بیشعور 45 | بیناموس 46 | تخم سگ 47 | تخمی 48 | ترک 49 | توله سگ 50 | جاکش 51 | جلق زدن 52 | جنده 53 | جنسی 54 | جوون 55 | جکس 56 | جیندا 57 | حرومزاده 58 | حشر 59 | حشری شدن 60 | حشری 61 | حیوانی 62 | خارکس ده 63 | خارکسده 64 | خارکسّه 65 | خانم جنده 66 | خایه خور 67 | خایه مال 68 | خایه 69 | خر 70 | خرفت 71 | خری 72 | خز 73 | خفه خون 74 | خفه شو 75 | خواهرجنده 76 | خی کاس 77 | داف ناز 78 | داف 79 | داگ استایل 80 | دخترجنده 81 | دخترقرتی 82 | 
درازگوش 83 | دله 84 | دهن سرویس 85 | گاییده 86 | دهنت سرویس 87 | دوجنسه 88 | دول 89 | دیوث 90 | دیوس خان 91 | دیوس 92 | دیوص 93 | رشتی 94 | ریدن 95 | ریدی 96 | زارت 97 | زباله 98 | زرنزن 99 | زن جنده 100 | زن کاسده 101 | زنا زاده 102 | زنا 103 | زنازاده 104 | زنتو 105 | زنشو 106 | زنیکه 107 | سادیسمی 108 | ساک 109 | ساکونی 110 | سرخور 111 | سرکیر 112 | سسکی 113 | سوراخ کون 114 | سوراخ کون 115 | سولاخ 116 | سکس چت 117 | سکس 118 | سکسی باش 119 | سکسی 120 | سکسیم 121 | سکسیی 122 | سگ تو روحت 123 | سگ دهن 124 | سگ صفت 125 | سگ پدر 126 | سگی 127 | سیکتیر 128 | شاسگول 129 | شاش 130 | شق کردن 131 | شل مغز 132 | شنگول 133 | شهوتی 134 | صیغه ای 135 | صیک 136 | عرب 137 | عرق خور 138 | عمتو 139 | عمه ننه 140 | عن تر 141 | عن 142 | عنتر 143 | عوضی 144 | غرمساق 145 | غرمصاق 146 | فاحشه خانم 147 | فاحشه 148 | فارس 149 | فاک فیس 150 | فیلم سوپر 151 | قرتی 152 | قرمساق 153 | قرمصاق 154 | قس 155 | لا پا 156 | لاس 157 | لاش گوشت 158 | لاشی 159 | لاکونی 160 | لجن 161 | لخت 162 | لختی 163 | لر 164 | لز 165 | مادر جنده 166 | مادرجنده 167 | مادرسگ 168 | مادرقهوه 169 | مادرکونی 170 | مالوندن 171 | ماچ کردنی 172 | مرتیکه 173 | مردیکه 174 | مرض داری 175 | مرضداری 176 | مشروب 177 | ملنگ 178 | ممه خور 179 | ممه 180 | منگل 181 | میخوریش 182 | نرکده 183 | نعشه 184 | نکبت 185 | نگاییدم 186 | هیز 187 | ولدزنا 188 | پدر سوخته 189 | پدر سگ 190 | پدر صلواتی 191 | پدرسگ 192 | پریود 193 | پستان 194 | پسون 195 | پشمام 196 | پفیوز 197 | پلشت 198 | پورن 199 | پپه 200 | چاغال 201 | چاقال 202 | چس خور 203 | چس 204 | کاسکش 205 | کث لیس 206 | کث 207 | کثافت 208 | کثافط 209 | کردن 210 | کردنی 211 | کرم 212 | کس خل 213 | کس خور 214 | کس خیس 215 | کس دادن 216 | کس لیس 217 | کس لیسیدن 218 | کس ننت 219 | کس و کیر 220 | کس کردن 221 | کس کش 222 | کس 223 | کسخل 224 | کسشعر 225 | کسکش 226 | کسکیر 227 | کص خل 228 | کص لیس 229 | کص 230 | کصافت 231 | کصافط 232 | کصخل 233 | کصکش 234 | کلفت 235 | کله کیری 236 | کوث لیس 237 | کوس خل 238 | کوس خور 239 | کوس لیس 240 | کوس 241 | کوص خل 242 | کوص لیس 243 | کوص 244 | کون تپل 245 | کون ده 246 | کون سوراخ 247 | کون پنیر 248 | کون گنده 249 | کون 250 | کونده خار 251 | کونده خوار 252 | کونده 253 | کونشو 254 | کونی 255 | کونی 256 | کیر 257 | کیردراز 258 | کیردوس 259 | کیرر 260 | کیرمکیدن 261 | کیرناز 262 | کیروکس 263 | کیروکس 264 | کیری 265 | گاو 266 | گاوی 267 | گاگول 268 | گایدن 269 | گایدی 270 | گاییدن 271 | گردن دراز 272 | گشاد 273 | گوز 274 | گوزو 275 | گوسفند 276 | گوش دراز 277 | گوه 278 | گوه 279 | گی زن 280 | گیخوار 281 | یبن زنا 282 | مادرتو 283 | ناموستو 284 | چنده 285 | باسنی 286 | سیکیم 287 | سگ ناموس 288 | نوب 289 | خایمال 290 | مادر به خطا 291 | کصلیس 292 | بکنت 293 | کصده 294 | گورومساخ 295 | پوفیوز 296 | پدرتو 297 | قورومساق 298 | سیهدیر 299 | اوبی 300 | مادر سگ 301 | نگایدم -------------------------------------------------------------------------------- /tools/Dataset.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Dataset Module to load custom datasets 3 | ''' 4 | 5 | from operator import index 6 | from textwrap import indent 7 | import numpy as np 8 | 9 | 10 | class Dataset: 11 | ''' 12 | This module is designed to help users load their own custom dataset. 
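    Expected layout under `path` (inferred from load_data and load_vocab below):

        <path>/data.tsv   one document per line, tab separated:
                          "<text>\t<split>" or "<text>\t<split>\t<label>",
                          where <split> is one of train, test or dev
        <path>/vocab.txt  one vocabulary token per line (only the first
                          whitespace-separated field of each line is used)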
13 | ''' 14 | def __init__(self, path:str, encoding:str='utf-8') -> None: 15 | ''' 16 | initialization of Dataset 17 | :param path : string, path to the dataset 18 | :param encoding : string, encoding to read data (default 'utf-8') 19 | ''' 20 | 21 | self.initialize_corpus( 22 | self.load_data(path, encoding)) # initialize train, test, dev 23 | self.load_vocab(path, encoding) # get vocabulary 24 | self.wordtoindex = {word: index for index, word in enumerate(self.vocab)} 25 | self.indextoword = {index: word for word, index in self.wordtoindex.items()} 26 | self.count_words() 27 | 28 | def initialize_corpus(self, data:dict) -> None: 29 | self.train_corpus = data['train_corpus'] 30 | self.test_corpus = data['test_corpus'] 31 | self.dev_corpus = data['dev_corpus'] 32 | 33 | self.train_labels = data['train_labels'] 34 | self.test_labels = data['test_labels'] 35 | self.dev_labels = data['dev_labels'] 36 | 37 | def load_data(self, path:str, encoding:str) -> None: 38 | data = { 39 | 'train_corpus' : [], 40 | 'test_corpus' : [], 41 | 'dev_corpus' : [], 42 | 'train_labels' : [], 43 | 'test_labels' : [], 44 | 'dev_labels' : [] 45 | } 46 | 47 | with open(f'{path}/data.tsv', 'r', encoding=encoding) as f: 48 | lines = f.readlines() 49 | for line in lines: 50 | _ = line.split('\t') 51 | _slice = _[1] 52 | if len(_) == 3: 53 | try: 54 | data[f'{_slice}_corpus'].append(_[0]) 55 | data[f'{_slice}_labels'].append(_[2]) 56 | except Exception: 57 | print(f'{_slice} is not in [train, test, dev]...') 58 | elif len(_) == 2: 59 | try: 60 | data[f'{_slice}_corpus'].append(_[0]) 61 | except Exception: 62 | print(f'{_slice} is not in [train, test, dev]...') 63 | else: 64 | raise Exception('data file must have at least 2 and at most 3 columns...') 65 | return data 66 | 67 | def load_vocab(self, path:str, encoding:str) -> None: 68 | self.vocab = ['UNK'] 69 | with open(f'{path}/vocab.txt', 'r', encoding=encoding) as f: 70 | lines = f.readlines() 71 | for line in lines: 72 | _ = line.split() 73 | self.vocab.append(_[0]) 74 | 75 | def count_words(self): 76 | self.words_count = {} 77 | 78 | for doc in self.train_corpus: 79 | tokenized = doc.split() 80 | for token in tokenized: 81 | if token in self.vocab: 82 | try: 83 | self.words_count[token] += 1 84 | except: 85 | self.words_count[token] = 1 86 | 87 | for i in list(self.words_count.keys()): 88 | if self.words_count[i] == 0: 89 | del self.words_count[i] 90 | del self.indextoword[self.wordtoindex[i]] 91 | del self.wordtoindex[i] 92 | -------------------------------------------------------------------------------- /tools/create_dataset.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is the function of the scraper that generates a dataset from a list of hashtags. 
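Example invocation (illustrative; the flag names are assumed to mirror the
attributes read from `args` in main(), i.e. max_results, lang, since, until
and with_replies):

    python create_dataset.py --max_results 1000 --lang fa --since <date> --until <date>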
3 | """ 4 | 5 | from scraper import TwitterScraper 6 | from hazm import word_tokenize, Normalizer, Lemmatizer 7 | import numpy as np 8 | import pandas as pd 9 | import argparse 10 | from tqdm import tqdm 11 | 12 | normalizer = Normalizer().normalize 13 | lemmatizer = Lemmatizer().lemmatize 14 | 15 | # Retrieved from https://github.com/kharazi/persian-stopwords 16 | stopwords = set(open('../stop_words/stop_words.txt', encoding='utf8').read().splitlines()) 17 | # Retrieved from https://github.com/amirshnll/Persian-Swear-Words 18 | swearing_words = set(open('../stop_words/swear_words.txt', encoding='utf8').read().splitlines()) 19 | 20 | bad_hashtags = set(['تا_آخوند_کفن_نشود_این_وطن_وطن_نشود', 21 | 'ایران_را_پس_میگیریم', 22 | 'جمهوری_اسلامی_نابود_باید_گردد', 23 | 'مرگ_بر_خامنه\\u200cای_جنایتکار', 24 | 'مرگ_بر_کلیت_و_تمامیت_جمهوری_اسلامی', 25 | 'جاویدشاه', 26 | 'نه_به_جمهورى_اسلامى', 27 | 'ریدم_تو_اسلام', 28 | 'براندازم', 29 | 'قيام_تا_سرنگونی', 30 | 'مريم_رجوی']) 31 | 32 | swearing_words.update(bad_hashtags) 33 | 34 | class const: 35 | farsi = ('ب', 'س', 'ش', 'ل', 'ت', 'ن', 'م', 'گ', 'ظ', 'ط', 'ز', 36 | 'ر', 'ژ', 'ذ', 'د', 'پ', 'چ', 'ج', 'ح', 'ع', 37 | 'خ', 'غ', 'ف', 'ق', 'ث', 'ص', 'ض','\u0020', 38 | '\u200C', '\u060c','؟', '!', '?', '.', ':','\n', '_') 39 | 40 | alef = ('ا', 'آ', 'ء', 'أ', 'إ') 41 | vav = ('و', 'ؤ') 42 | heh = ('ه', 'ة', 'ە') 43 | yah = ('ی', 'ي', 'ئ', 'ى') 44 | kaf = ('ک', 'ك') 45 | 46 | hashtags = { 47 | "economics": [ 48 | "بورس", 49 | "نفت", 50 | "دلار", 51 | "بازارکار", 52 | "اقتصادی", 53 | "اخبار_اقتصادی", 54 | "اقتصاد_ایران", 55 | "بازار_آزاد", 56 | "بانک_مرکزی", 57 | "ارز", 58 | "مالیات", 59 | "تورم", 60 | "نرخ", 61 | "تحریم", 62 | "طلا", 63 | "ارز۴۲۰۰", 64 | "گرانی", 65 | "بانک", 66 | "سهام_عدالت", 67 | "خودرو", 68 | "فارکس", 69 | "بنزین", 70 | "بازار", 71 | "نرخ_ارز", 72 | "یورو", 73 | "قیمت_نفت", 74 | "بودجه", 75 | "قیمت", 76 | "بازار_کار", 77 | "اقتصاد", 78 | "سکه", 79 | "فرابورس", 80 | "سهام", 81 | "بیمه", 82 | ], 83 | "health": [ 84 | "کرونا", 85 | "وزارت_بهداشت", 86 | "نه_به_واکسن_اجباری", 87 | "واکسن", 88 | "واکسن_بزنیم", 89 | "كرونا", 90 | "اومیکرون", 91 | "پزشکی", 92 | "واکسن_اجباری", 93 | "واکسن_کرونا", 94 | "پزشک", 95 | "امیکرون", 96 | "واکسیناسیون", 97 | "ماسک", 98 | "آمار_کرونا", 99 | "واکسن_میزنم", 100 | "وزات_بهداشت", 101 | "بهداشت", 102 | "کووید۱۹", 103 | "COVID19", 104 | "وزیر_بهداشت", 105 | "HIV", 106 | "اميكرون", 107 | "نه_به_واکسن", 108 | "بهترین_واکسن_در_دسترس_ترین_واکسن", 109 | "أوميكرون", 110 | "واکسن_حق_مردم", 111 | "واكسن", 112 | "برکت", 113 | ], 114 | "sport": [ 115 | "استقلال", 116 | "پرسپولیس", 117 | "فوتبال", 118 | "پرسپوليس", 119 | "ورزش", 120 | "HalaMadrid", 121 | "رئال_مادرید", 122 | "ورزش_سیاسی_نیست", 123 | "لیگ_برتر", 124 | "تیم_حکومتی", 125 | "تاج", 126 | "آرسنال", 127 | "پیروزی", 128 | "فرهاد_مجیدی", 129 | "والیبال", 130 | "المپیک", 131 | "حامد_لک", 132 | "فوتبال_پاک", 133 | "دربی", 134 | "فیفا", 135 | "لیورپول", 136 | "پنالتی", 137 | "فنرباغچه", 138 | "تراکتور", 139 | "لیگ", 140 | "فدراسیون_آبی", 141 | "ورزش_سیاسی", 142 | "چلسی", 143 | "RealPSG", 144 | "جام_جهانی", 145 | "مهدی_طارمی", 146 | "تیم", 147 | "تنیس", 148 | "باشگاه", 149 | ], 150 | "art": [ 151 | "شعر", 152 | "کتاب", 153 | "سینما", 154 | "تئاتر", 155 | "فیلم", 156 | "سریال", 157 | "كتاب", 158 | "موسیقی", 159 | "پیشنهاد_فیلم", 160 | "آهنگ", 161 | "حافظ", 162 | "سعدی", 163 | "معرفی_کتاب", 164 | "کارگردان", 165 | "خواننده", 166 | "جشنواره_فیلم_فجر", 167 | "film", 168 | "cinema", 169 | "actor", 170 | "drama", 171 | "moviestar", 172 | "Movietime", 173 | ], 174 | "tech": [ 
175 | "اینترنت", 176 | "اپل", 177 | "سامسونگ", 178 | "بازی", 179 | "گیم", 180 | "گوگل", 181 | "بیت_کوین", 182 | "کریپتو", 183 | "اتریوم", 184 | "ارزدیجیتال", 185 | "BTC", 186 | "همراه_اول", 187 | "Bitcoin", 188 | "ارز_دیجیتال", 189 | "بيتكوين", 190 | "سئو", 191 | "بیتکوین", 192 | "ایرانسل", 193 | "btc", 194 | "کاردانو", 195 | "دیجیکالا", 196 | "هوشمند", 197 | "استارلینک", 198 | ], 199 | "transport": [ 200 | "ترافیک", 201 | "اسنپ", 202 | "تپسی", 203 | "تاکسی", 204 | "هواپیما", 205 | "مترو", 206 | "اتوبوس", 207 | "طرح_ترافیک", 208 | "قطار", 209 | "فرودگاه", 210 | "سفر_استانی", 211 | "فرودگاه_مهرآباد", 212 | "جاده_چالوس", 213 | ], 214 | "education": [ 215 | "معلم", 216 | "آموزش", 217 | "دانشگاه", 218 | "کنکور", 219 | "دانشگاه_آزاد", 220 | "مدرسه", 221 | "دانش_آموز", 222 | "کنکور_سراسری", 223 | "سازمان_سنجش", 224 | "دانشگاه_تهران", 225 | "آموزش_و_پرورش", 226 | "دانشجو", 227 | "معلمان", 228 | "روز_معلم", 229 | "فرهنگیان", 230 | "مدارس", 231 | "دانشگاه_فرهنگیان", 232 | ], 233 | "religion": [ 234 | "یا_سید_الساجدین", 235 | "امام_سجاد", 236 | "اللهم_عجل_لوليك_الفرج", 237 | "امام_حسین", 238 | "خدا", 239 | "امام", 240 | "رمضان", 241 | "قرآن", 242 | "مسلمان", 243 | "اسلام", 244 | "عاشورا", 245 | "شیعه", 246 | "حج", 247 | "MuhammadForAll", 248 | "زين_العابدين", 249 | "امام_رضا", 250 | ], 251 | "lifestyle": [ 252 | "شیک", 253 | "زیبایی", 254 | "تقویم_آشپزی", 255 | "پوست", 256 | "آشپزی", 257 | "غذا", 258 | "قهوه", 259 | "رستوران", 260 | ], 261 | "social": [ 262 | "روز_جهانی_زن", 263 | "زن", 264 | "زنان", 265 | "روز_زن", 266 | "خانواده", 267 | "کشف_حجاب", 268 | "هشتم_مارس", 269 | "باحجاب_باوقار", 270 | "خودکشی", 271 | "ازدواج", 272 | "طلاق", 273 | "فقر", 274 | "مردان", 275 | "کودک_همسری", 276 | "زندانی_سیاسی", 277 | "حقوق_زنان", 278 | "حجاب", 279 | ], 280 | "ecology": [ 281 | "باران", 282 | "هوا", 283 | "آب", 284 | "زلزله", 285 | "کم_آبی", 286 | "آلودگی_هوا", 287 | "آلودگی", 288 | "ریزگرد", 289 | "هوای_تهران", 290 | "کولاک", 291 | "گردوخاک", 292 | "گردوغبار", 293 | "بارش", 294 | "سیلاب", 295 | "بارندگی", 296 | "آلودگی_هوای_تهران", 297 | "مدیریت_بحران", 298 | "برف", 299 | "سیل", 300 | "آتش", 301 | "آتش_سوزی", 302 | "خشکسالی", 303 | "محیط_زیست", 304 | "خاک", 305 | "هواشناسى", 306 | "هواشناسی_توییتر", 307 | ], 308 | } 309 | 310 | 311 | def remover(char): 312 | if char in const.farsi: 313 | return char 314 | if char in const.alef: 315 | return const.alef[0] 316 | if char in const.vav: 317 | return const.vav[0] 318 | if char in const.heh: 319 | return const.heh[0] 320 | if char in const.yah: 321 | return const.yah[0] 322 | if char in const.kaf: 323 | return const.kaf[0] 324 | return '' 325 | 326 | 327 | def pre_process(text): 328 | persian_words = map(remover, text) 329 | sentence = ''.join(persian_words) 330 | if (len(sentence) < 20): 331 | return None 332 | word_tokens = word_tokenize(sentence) 333 | 334 | for w in word_tokens: 335 | if w in swearing_words: 336 | return None 337 | 338 | filtered_stopwords = [w for w in word_tokens if w not in stopwords and len(w) > 1] 339 | 340 | if (len(filtered_stopwords) < 5): 341 | return None 342 | filtered_stopwords = ' '.join(filtered_stopwords) 343 | return filtered_stopwords 344 | 345 | 346 | def main(args): 347 | df = pd.DataFrame([]) 348 | for topic in tqdm(hashtags.keys(), desc='Scraping Topics'): 349 | scraper = TwitterScraper( 350 | max_results=args.max_results, 351 | hashtags=hashtags[topic], 352 | lang=args.lang, 353 | until=args.until, 354 | since=args.since, 355 | with_replies=args.with_replies, 356 | ) 357 | result = scraper.basic_mode() 358 | 
result['topic'] = topic
359 |         df = pd.concat([df, result], axis=0)
360 | 
361 |     # preprocess: keep rows with a username, normalize the text, drop empty results
362 |     df = df[df['username'].notna()]
363 |     tweets = map(pre_process, df.text)
364 |     tweets = list(tweets)
365 |     df['processed_text'] = tweets
366 |     df = df[df['processed_text'].notna()]
367 |     df = df.reset_index(drop=True)
368 | 
369 |     df = df.drop_duplicates(subset='tweet_id')
370 |     print('-- Dataframe shape: {}'.format(df.shape))
371 |     df = df.groupby('topic').apply(lambda x: x.sample(len(x) if len(x) < 10000 else 10000)).reset_index(drop=True)  # cap each topic at 10,000 tweets
372 |     df = df.reset_index(drop=True)
373 | 
374 |     df.to_csv("../datasets/twitter_dataset.tsv", index=False, sep='\t')
375 |     print('[ OK ] Dataset created.')
376 | 
377 | 
378 | if __name__ == "__main__":
379 |     parser = argparse.ArgumentParser()
380 |     parser.add_argument("--max_results", default=(2 * (10 ** 4)), type=int)
381 |     parser.add_argument("--lang", default="fa", type=str)
382 |     parser.add_argument("--until", default="2022-02-10", type=str)
383 |     parser.add_argument("--since", default="2019-06-01", type=str)
384 |     parser.add_argument("--with_replies", action="store_true")  # type=bool would treat any non-empty string (even "False") as True
385 |     args = parser.parse_args()
386 |     print(args)
387 |     main(args)
388 | 
--------------------------------------------------------------------------------
/tools/scraper.py:
--------------------------------------------------------------------------------
1 | # Query syntax reference: https://github.com/JustAnotherArchivist/snscrape/blob/master/snscrape/modules/twitter.py
2 | import snscrape.modules.twitter as sntwitter
3 | import pandas as pd
4 | from random import random
5 | from datetime import date
6 | from multiprocessing.dummy import Pool
7 | import time
8 | 
9 | 
10 | class TwitterScraper:
11 |     def __init__(
12 |         self,
13 |         max_results: int,
14 |         all_words=[],
15 |         exact_pharase=[],
16 |         any_words=[],
17 |         none_words=[],
18 |         hashtags=[],
19 |         mentioned_users=[],
20 |         from_users=[],
21 |         to_users=[],
22 |         with_links=True,
23 |         with_replies=True,
24 |         **kwargs,
25 |     ):
26 |         """
27 |         :param max_results: Maximum number of tweets to collect per query (or per user in user mode).
28 |         :param all_words: ['what’s', 'happening'] · contains both “what’s” and “happening”
29 |         :param exact_pharase: ['happy hour'] · contains the exact phrase “happy hour”
30 |         :param any_words: ['cats', 'dogs'] · contains either “cats” or “dogs” (or both)
31 |         :param none_words: ['cats', 'dogs'] · does not contain “cats” and does not contain “dogs”
32 |         :param hashtags: ['#ThrowbackThursday'] or ['ThrowbackThursday'] · contains the hashtag #ThrowbackThursday
33 |         :param mentioned_users: ['@SFBART', '@Caltrain'] or ['SFBART', 'Caltrain'] · mentions @SFBART or mentions @Caltrain
34 |         :param from_users: ['@Twitter'] or ['Twitter'] · sent from @Twitter
35 |         :param to_users: ['@Twitter'] or ['Twitter'] · sent in reply to @Twitter
36 |         :param with_links: If False, exclude tweets that contain links (adds -filter:links to the query)
37 |         :param with_replies: If False, exclude reply tweets (adds -filter:replies to the query)
38 |         :param kwargs: Extra query operators such as since, until, and lang; each is rendered as (key:value)
39 | 
40 |         """
41 | 
42 |         self.number_of_user = 0
43 |         self.max_results = max_results
44 |         self.all_words = TwitterScraper.all_of_these_words(all_words)
45 |         self.exact_pharase = TwitterScraper.any_of_these_exact_pharase(exact_pharase)
46 |         self.any_words = TwitterScraper.any_of_these_words(any_words)
47 |         self.none_words = TwitterScraper.none_of_these_words(none_words)
48 |         self.these_hashtags = TwitterScraper.any_of_these_hashtags(hashtags)
49 |         self.mentioned_users = TwitterScraper.mentioning_these_users(mentioned_users)
50 |         self.with_links = "-filter:links" if not with_links else ""
51 |         self.with_replies = "-filter:replies" if not with_replies else ""
52 | 
53 |         self.query_dict = {
54 |             "all_words": self.all_words,
55 |             "exact_pharase": self.exact_pharase,
56 |             "any_words": self.any_words,
57 |             "none_words": self.none_words,
58 |             "these_hashtags": self.these_hashtags,
59 |             "mentioned_users": self.mentioned_users,
60 |             "with_links": self.with_links,
61 |             "with_replies": self.with_replies,
62 |         }
63 | 
64 |         self.query_dict["from"] = TwitterScraper.f_or_t_users(from_users, "from")
65 |         self.query_dict["to"] = TwitterScraper.f_or_t_users(to_users, "to")
66 | 
67 |         for key, value in kwargs.items():
68 |             self.query_dict[key] = f"({key}:{value})"
69 | 
70 |     @staticmethod
71 |     def f_or_t_users(users, key):
72 |         if not users:
73 |             return ""
74 |         tmp_list = [f"{key}:{user}" for user in users]
75 |         return "(" + " OR ".join(tmp_list) + ")"
76 | 
77 |     @staticmethod
78 |     def all_of_these_words(all_words):
79 |         if not all_words:
80 |             return ""
81 |         return " ".join(all_words)
82 | 
83 |     @staticmethod
84 |     def any_of_these_words(any_words):
85 |         if not any_words:
86 |             return ""
87 |         return "(" + " OR ".join(any_words) + ")"
88 | 
89 |     @staticmethod
90 |     def any_of_these_exact_pharase(exact_pharase):
91 |         if not exact_pharase:
92 |             return ""
93 |         return '("' + '" OR "'.join(exact_pharase) + '")'
94 | 
95 |     @staticmethod
96 |     def none_of_these_words(none_words):
97 |         if not none_words:
98 |             return ""
99 |         return "-" + " -".join(none_words)
100 | 
101 |     @staticmethod
102 |     def any_of_these_hashtags(hashtags):
103 |         if not hashtags:
104 |             return ""
105 |         tmp_list = ["#" + h.replace("#", "") for h in hashtags]
106 |         return "(" + " OR ".join(tmp_list) + ")"
107 | 
108 |     @staticmethod
109 |     def mentioning_these_users(users):
110 |         if not users:
111 |             return ""
112 |         tmp_list = ["@" + h.replace("@", "") for h in users]
113 |         return "(" + " OR ".join(tmp_list) + ")"
114 | 
115 |     def create_query(self, query_dict):
116 |         """Join the non-empty query clauses into a single Twitter search string."""
117 |         res = {key: val for key, val in query_dict.items() if val}
118 |         del query_dict
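        # The surviving clauses in `res` are joined with single spaces below; an
        # illustrative result (assuming hashtags, reply filtering, and lang/until/since
        # kwargs were set, as create_dataset.py does) would look like:
        # '(#بورس OR #دلار) -filter:replies (lang:fa) (until:2022-02-10) (since:2019-06-01)'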
119 |         query = " ".join(res.values())
120 |         del res
121 | 
122 |         return query
123 | 
124 |     def crawler(self, query, error_counter=0):
125 |         """
126 |         Send the query to Twitter via snscrape and collect the matching tweets into a DataFrame.
127 | 
128 |         :param query: The Twitter search query built by create_query.
129 |         """
130 |         # Collect tweet records in a list before building the DataFrame
131 |         tweets_list = []
132 |         try:
133 |             # Using TwitterSearchScraper to scrape data and append tweets to the list
134 |             scraper = sntwitter.TwitterSearchScraper(query)
135 |             i = 0
136 |             for tweet in scraper.get_items():  # iterate over the matching tweets
137 |                 if i >= self.max_results:  # stop once max_results tweets have been collected
138 |                     break
139 | 
140 |                 tweets_list.append(
141 |                     [
142 |                         tweet.date,
143 |                         tweet.id,
144 |                         tweet.content,
145 |                         tweet.replyCount,
146 |                         tweet.retweetCount,
147 |                         tweet.likeCount,
148 |                         tweet.user.username,
149 |                         tweet.lang,
150 |                         tweet.media,
151 |                         tweet.hashtags,
152 |                     ]
153 |                 )  # the tweet attributes kept for the dataset
154 |                 i += 1
155 |         except Exception as e:
156 |             if "Unable to find guest token" in str(e):
157 |                 error_counter += 1
158 |                 if error_counter > 3:
159 |                     error_counter = 0
160 |                     print("Sleep Time!")
161 |                     time.sleep(30.3 * 60)  # back off for roughly 30 minutes before retrying
162 |                     print("Morning!")
163 | 
164 |                 return self.crawler(query, error_counter)
165 |             print(f"query: {query} , {e}")
166 | 
167 |         # Creating a dataframe from the tweets list above
168 |         tweets_df = pd.DataFrame(
169 |             tweets_list,
170 |             columns=[
171 |                 "datetime",
172 |                 "tweet_id",
173 |                 "text",
174 |                 "reply_count",
175 |                 "retweet_count",
176 |                 "like_count",
177 |                 "username",
178 |                 "lang",
179 |                 "media",
180 |                 "hashtags",
181 |             ],
182 |         )
183 |         return tweets_df
184 | 
185 |     def basic_mode(self):
186 |         query = self.create_query(self.query_dict)
187 |         return self.crawler(query)
188 | 
189 |     def user_crawler(self, user):
190 |         """
191 |         Build and run the query for a specific user.
192 | 
193 |         :param user: Twitter account username.
194 |         """
195 |         tmp_dict = self.query_dict.copy()
196 |         tmp_dict["from"] = f"(from:{user})"
197 |         query = self.create_query(tmp_dict)
198 |         del tmp_dict
199 |         return self.crawler(query)
200 | 
201 |     def user_mode(self, user_list):
202 |         """
203 |         Parallelize scraping across users with a thread pool (one query per user).
204 | 
205 |         :param user_list: List of usernames whose tweets will be collected.
206 |         """
207 |         user_crawler = self.user_crawler
208 |         pool = Pool(22)  # thread pool with 22 workers
209 |         df_list = pool.map(user_crawler, user_list)
210 |         pool.close()
211 |         pool.join()
212 |         result_df = pd.concat(df_list, ignore_index=True)
213 |         return result_df
214 | 
--------------------------------------------------------------------------------
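Usage sketch (not part of the repository): the snippet below shows how TwitterScraper from tools/scraper.py might be driven directly for a single topic, assuming snscrape and pandas are installed and the code is run from the tools/ directory. The hashtags, result cap, date range, and output path are illustrative assumptions, not values taken from the project.

# Illustrative only: collect a small sample of Persian tweets for two example hashtags.
from scraper import TwitterScraper

scraper = TwitterScraper(
    max_results=500,              # assumed cap; create_dataset.py defaults to 20,000
    hashtags=["بورس", "دلار"],    # example hashtags (any list works)
    lang="fa",                    # forwarded via **kwargs and rendered as (lang:fa)
    since="2021-01-01",           # rendered as (since:2021-01-01)
    until="2021-12-31",           # rendered as (until:2021-12-31)
    with_replies=False,           # adds -filter:replies to the query
)

df = scraper.basic_mode()         # pandas DataFrame with datetime, tweet_id, text, ... columns
df.to_csv("sample_economics.tsv", sep="\t", index=False)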
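Similarly, a minimal sketch of how the Dataset helper in tools/Dataset.py appears to be consumed, assuming a directory that contains data.tsv (one text<TAB>split[<TAB>label] row per document, with split in train/test/dev) and vocab.txt (one token per line). The directory path and the probe word are hypothetical.

# Illustrative only: load a prepared corpus with the Dataset helper.
from Dataset import Dataset

ds = Dataset(path="../datasets/my_corpus")   # hypothetical directory holding data.tsv and vocab.txt

print(len(ds.train_corpus), "training documents")
print(len(ds.vocab), "vocabulary entries (index 0 is 'UNK')")
print(ds.wordtoindex.get("اقتصاد", 0))       # unseen words fall back to index 0, i.e. 'UNK'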