├── .github └── workflows │ └── python-publish.yml ├── .gitignore ├── README.md ├── models ├── BTM.py ├── CTM.py ├── LDA.py ├── NMF.py ├── contextualized_topic_models │ ├── LICENSE │ ├── __init__.py │ ├── contextualized_topic_models.py │ ├── datasets │ │ ├── __init__.py │ │ └── dataset.py │ ├── models │ │ ├── __init__.py │ │ └── ctm.py │ ├── networks │ │ ├── __init__.py │ │ ├── decoding_network.py │ │ └── inference_network.py │ └── utils │ │ ├── __init__.py │ │ ├── data_preparation.py │ │ └── preprocessing.py ├── model.py └── pytorchtools.py ├── requirements.txt ├── setup.cfg ├── setup.py ├── stop_words ├── stop_words.txt └── swear_words.txt └── tools ├── Dataset.py ├── create_dataset.py └── scraper.py /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | # This workflow will upload a Python Package using Twine when a release is created 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries 3 | 4 | # This workflow uses actions that are not certified by GitHub. 5 | # They are provided by a third-party and are governed by 6 | # separate terms of service, privacy policy, and support 7 | # documentation. 8 | 9 | name: Upload Python Package 10 | 11 | on: 12 | release: 13 | types: [published] 14 | 15 | permissions: 16 | contents: read 17 | 18 | jobs: 19 | deploy: 20 | 21 | runs-on: ubuntu-latest 22 | 23 | steps: 24 | - uses: actions/checkout@v3 25 | - name: Set up Python 26 | uses: actions/setup-python@v3 27 | with: 28 | python-version: '3.x' 29 | - name: Install dependencies 30 | run: | 31 | python -m pip install --upgrade pip 32 | pip install build 33 | - name: Build package 34 | run: python -m build 35 | - name: Publish package 36 | uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29 37 | with: 38 | user: __token__ 39 | password: ${{ secrets.PYPI_API_TOKEN }} 40 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Topic Modeling Tool for Persian Short Texts 2 | 3 | The tool for topic modeling provided by the **[Data Science Innovation Center](http://dslab.aut.ac.ir/fa/)** extracts topics from digitized **Persian texts** and compares their performance in short texts using a variety of topic modeling techniques. 4 | 5 | Visit the **[website](http://dslab.aut.ac.ir/fa/products/%d9%be%d8%b1%d8%af%d8%a7%d8%b2%d8%b4-%d9%85%d8%aa%d9%86-%d9%88-%d8%b2%d8%a8%d8%a7%d9%86-%d8%b7%d8%a8%db%8c%d8%b9%db%8c/%d8%a7%d8%a8%d8%b2%d8%a7%d8%b1-%d8%af%d8%b3%d8%aa%d9%87-%d8%a8%d9%86%d8%af%db%8c-%d9%85%d9%88%d8%b6%d9%88%d8%b9%db%8c/)** to view the description in Persian. 6 | 7 | ## Installation 8 | We recommend **Python 3.6** or higher, **[gensim 4.2](https://radimrehurek.com/gensim/)** or higher. 
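You can quickly verify that your environment meets these requirements before installing. The snippet below is a minimal sketch and only assumes that `gensim` is already installed:

````python
import sys
import gensim

# The toolkit targets Python 3.6+ and gensim 4.2+
print(sys.version_info)      # expect major=3, minor>=6
print(gensim.__version__)    # expect '4.2' or newer
````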
9 | 10 | **Install from sources** 11 | 12 | You can clone the latest version from the repository and install it directly from the source code: 13 | 14 | ``` 15 | git clone https://github.com/DSInCenter/topicmodel.git 16 | cd topicmodel 17 | pip install -r requirements.txt 18 | ``` 19 | 20 | ## Getting Started 21 | To get started, you can try the demo of the GSDMM algorithm at this link: 22 | - [GSDMM demo](https://dsic-topic-model.herokuapp.com/) 23 | 24 | These examples demonstrate how to clone and execute a model on Google Colab: 25 | - [Run NMF model on Google colab](https://colab.research.google.com/drive/1l7Fs6yYrbIy9fXyTBflMXGaVQjh10RPn?usp=sharing) 26 | - [Run LDA model on Google colab](https://colab.research.google.com/drive/1yhNeh6J177fSQxEZE7OTLJMWtvff7LDA?usp=sharing) 27 | 28 | **LDA demonstration**: 29 | 30 | First, import the Dataset class from Dataset.py and the LDA model from LDA.py: 31 | ````python 32 | from tools.Dataset import Dataset 33 | from LDA import LDA 34 | ```` 35 | 36 | Create objects from the Dataset and LDA classes and train the model: 37 | ````python 38 | lda = LDA(num_topics=11, iterations=5) 39 | dataset = Dataset('Dataset', 'utf-8') 40 | lda_result = lda.train_model(dataset, hyperparams=None, top_words=10) 41 | print(lda_result) 42 | ```` 43 | 44 | ## Citing & Authors 45 | If you find this repository helpful, feel free to cite this work: 46 | 47 | ```bibtex 48 | @article{karimi2023comparative, 49 | title={Comparative Analysis of Topic Modeling Algorithms for Short Texts in Persian Tweets}, 50 | author={Karimi, Amir Hossein and Akbari, Masoud and Akbari, Mohammad}, 51 | year={2023} 52 | } 53 | ``` 54 | 55 | Don't hesitate to send us an e-mail or open an issue if something is broken (and it shouldn't be) or if you have further questions. 56 | 57 | -------------------------------------------------------------------------------- /models/BTM.py: -------------------------------------------------------------------------------- 1 | from sklearn.metrics import normalized_mutual_info_score 2 | from models.model import AbstractModel 3 | from tools.Dataset import Dataset 4 | import numpy as np 5 | import pandas as pd 6 | import bitermplus as btm 7 | import argparse 8 | import pickle 9 | 10 | 11 | 12 | class BTM(AbstractModel): 13 | ''' 14 | Bitermplus implements the Biterm Topic Model for short texts 15 | introduced by Xiaohui Yan, Jiafeng Guo, Yanyan Lan, and Xueqi Cheng. 16 | It is a cythonized implementation of the original BTM. 17 | 18 | Source code: https://github.com/maximtrp/bitermplus 19 | 20 | ''' 21 | 22 | def __init__(self, 23 | num_topics: int = 10, 24 | iterations: int = 20, 25 | num_top_words: int = 20, 26 | alpha: float = 1, 27 | beta: float = 0.01, 28 | seed: int = 123 29 | ): 30 | """ 31 | Initialization of BTM 32 | 33 | :param num_topics : int, Number of topics. 34 | :param iterations : int, Number of iterations for the model fitting process. 35 | :param num_top_words : int, Number of top words for coherence calculation. 36 | :param alpha : float, Dirichlet prior on the topic distribution. 37 | :param beta : float, Dirichlet prior on the topic-word distribution. 38 | :param seed : int, Random state seed.
39 | 40 | see https://bitermplus.readthedocs.io/en/latest/bitermplus.html 41 | """ 42 | super().__init__() 43 | self.hyperparameters['num_topics'] = num_topics 44 | self.hyperparameters['iterations'] = iterations 45 | self.hyperparameters['num_top_words'] = num_top_words 46 | self.hyperparameters['alpha'] = alpha 47 | self.hyperparameters['beta'] = beta 48 | self.hyperparameters['seed'] = seed 49 | self.model = None 50 | self.vocab = None 51 | 52 | 53 | def hyperparameters_info(self): 54 | """ 55 | Returns hyperparameters informations 56 | """ 57 | return self.hyperparameters 58 | 59 | 60 | def set_hyperparameters(self, **kwargs): 61 | """ 62 | Set model hyperparameters 63 | """ 64 | super().set_hyperparameters(**kwargs) 65 | 66 | 67 | def train_model(self, dataset, hyperparameters=None, top_words=10): 68 | ''' 69 | Train the model 70 | 71 | :param dataset: Dataset 72 | :param hyperparameters: dictionary in the form {hyperparameter name: value} 73 | :param top_words: number of top significant words for each topic (default: 10) 74 | ''' 75 | 76 | if hyperparameters is None: 77 | hyperparameters = {} 78 | self.hyperparameters.update(hyperparameters) 79 | 80 | ######################### Need Dataset.texts 81 | 82 | # Obtaining terms frequency in a sparse matrix and corpus vocabulary 83 | X, vocabulary, vocab_dict = btm.get_words_freqs() 84 | 85 | # Vectorizing documents 86 | docs_vec = btm.get_vectorized_docs(Dataset.texts, vocabulary) 87 | # Generating biterms 88 | biterms = btm.get_biterms(docs_vec) 89 | 90 | # Initializing and running model 91 | model = btm.BTM(X, vocabulary, seed=12321, T=11, M=10, alpha=50/8, beta=0.01) 92 | model.fit(biterms, iterations=20) 93 | 94 | #Now, we will calculate documents vs topics probability matrix (make an inference). 95 | p_zd = model.transform(docs_vec) 96 | 97 | self.model = model 98 | 99 | 100 | def _select_words(self, topic_id: int, words_num): 101 | probs = self.model.matrix_topics_words_[topic_id, :] 102 | idx = np.argsort(probs)[:-words_num-1:-1] 103 | result = pd.Series(self.model.vocabulary_[idx]) 104 | result.name = 'topic{}'.format(topic_id) 105 | return result 106 | 107 | 108 | def _get_topics_words(self, words_num=20): 109 | topics_num = self.model.topics_num_ 110 | topics_idx = np.arange(topics_num) 111 | top_words_btm = pd.concat(map(lambda x: self._select_words(x, words_num), topics_idx), axis=1) 112 | return top_words_btm 113 | 114 | 115 | 116 | 117 | # def __save_pickle(file, path): 118 | # with open(path, 'wb') as handle: 119 | # pickle.dump(file, handle, protocol=pickle.HIGHEST_PROTOCOL) 120 | 121 | 122 | # def __get_data(path:str, encoding:str) -> pd.DataFrame : 123 | # return pd.read_csv(path, encoding=encoding) 124 | 125 | # def __run_btm(corpus, labels, seed, num_of_topics, iterations): 126 | # print('preparing data...') 127 | # X, vocabulary, vocab_dict = btm.get_words_freqs(corpus) 128 | # tf = np.array(X.sum(axis=0)).ravel() 129 | 130 | # # Vectorizing documents 131 | # docs_vec = btm.get_vectorized_docs(texts, vocabulary) 132 | # docs_lens = list(map(len, docs_vec)) 133 | # # Generating biterms 134 | # biterms = btm.get_biterms(docs_vec) 135 | 136 | # print('running model...') 137 | # # INITIALIZING AND RUNNING MODEL 138 | # model = btm.BTM(X, vocabulary, seed=12321, T=num_of_topics, M=10, alpha=50/8, beta=0.01) 139 | # model.fit(biterms, iterations=iterations) 140 | # #Now, we will calculate documents vs topics probability matrix (make an inference). 
141 | # p_zd = model.transform(docs_vec) 142 | 143 | # # Get index of max probability for each document 144 | # top_prob = [np.argmax(i) for i in p_zd] 145 | 146 | # print('*****************************') 147 | # print('Evaluating model performance:') 148 | # print('NMI : {}'.format(normalized_mutual_info_score(labels, top_prob))) 149 | # print('*****************************') 150 | # print('savin results...') 151 | # _save_pickle(p_zd, 'btm_result.pickle') 152 | # print('saving model...') 153 | # _save_pickle(model, 'btm_model.pickle') 154 | 155 | 156 | 157 | 158 | # if __name__ == '__main__': 159 | 160 | # parser = argparse.ArgumentParser(description='Run btm model') 161 | # parser.add_argument('--data', help='path to dataset', nargs='?', default='./data/new_dataset.csv', type=str) 162 | # parser.add_argument('--num_of_topics', help='number of topics', nargs='?', default=11, type=int) 163 | # parser.add_argument('--seed', nargs='?', default=12321, type=int) 164 | # parser.add_argument('--M', nargs='?', default=10, type=int) 165 | # parser.add_argument('--alpha', nargs='?', default=50/8, type=float) 166 | # parser.add_argument('--beta', nargs='?', default=0.01, type=float) 167 | # parser.add_argument('--iterations', nargs='?', default=20, type=int) 168 | # parser.add_argument('--encoding', help='encoding to read dataset', nargs='?', default='utf-8', type=str) 169 | 170 | # args = parser.parse_args() 171 | 172 | # data = __get_data(args.data, args.encoding) 173 | # __run_btm( 174 | # corpus=data['processed_text'].str.strip().tolist(), 175 | # labels=data['topic'], 176 | # seed=args.seed, 177 | # num_of_topics=args.num_of_topics, 178 | # iterations=args.iterations) 179 | 180 | 181 | 182 | 183 | -------------------------------------------------------------------------------- /models/CTM.py: -------------------------------------------------------------------------------- 1 | from sklearn.feature_extraction.text import CountVectorizer 2 | 3 | from models.model import AbstractModel 4 | from models.contextualized_topic_models.datasets import dataset 5 | from models.contextualized_topic_models.models import ctm 6 | from models.contextualized_topic_models.utils.data_preparation import bert_embeddings_from_list 7 | 8 | import os 9 | import pickle as pkl 10 | 11 | 12 | class CTM(AbstractModel): 13 | 14 | def __init__(self, num_topics=10, model_type='prodLDA', activation='softplus', 15 | dropout=0.2, learn_priors=True, batch_size=64, lr=2e-3, momentum=0.99, 16 | solver='adam', num_epochs=100, reduce_on_plateau=False, prior_mean=0.0, 17 | prior_variance=None, num_layers=2, num_neurons=100, use_partitions=True, num_samples=10, 18 | inference_type="zeroshot", bert_path="", bert_model="m3hrdadfi/bert-zwnj-wnli-mean-tokens"): 19 | """ 20 | initialization of CTM 21 | :param num_topics : int, number of topic components, (default 10) 22 | :param model_type : string, 'prodLDA' or 'LDA' (default 'prodLDA') 23 | :param activation : string, 'softplus', 'relu', 'sigmoid', 'swish', 'tanh', 'leakyrelu', 'rrelu', 'elu', 24 | 'selu' (default 'softplus') 25 | :param num_layers : int, number of layers (default 2) 26 | :param dropout : float, dropout to use (default 0.2) 27 | :param learn_priors : bool, make priors a learnable parameter (default True) 28 | :param batch_size : int, size of batch to use for training (default 64) 29 | :param lr : float, learning rate to use for training (default 2e-3) 30 | :param momentum : float, momentum to use for training (default 0.99) 31 | :param solver : string, optimizer 'adam' or 
'sgd' (default 'adam') 32 | :param num_epochs : int, number of epochs to train for, (default 100) 33 | :param num_samples: int, number of times theta needs to be sampled (default: 10) 34 | :param use_partitions: bool, if true the model will be trained on the training set and evaluated on the test 35 | set (default: true) 36 | :param reduce_on_plateau : bool, reduce learning rate by 10x on plateau of 10 epochs (default False) 37 | :param inference_type: the type of the CTM model. It can be "zeroshot" or "combined" (default zeroshot) 38 | :param bert_path: path to store the document contextualized representations 39 | :param bert_model: name of the contextualized model (default: m3hrdadfi/bert-zwnj-wnli-mean-tokens). 40 | see https://www.sbert.net/docs/pretrained_models.html 41 | """ 42 | 43 | super().__init__() 44 | 45 | self.hyperparameters['num_topics'] = num_topics 46 | self.hyperparameters['model_type'] = model_type 47 | self.hyperparameters['activation'] = activation 48 | self.hyperparameters['dropout'] = dropout 49 | self.hyperparameters['inference_type'] = inference_type 50 | self.hyperparameters['learn_priors'] = learn_priors 51 | self.hyperparameters['batch_size'] = batch_size 52 | self.hyperparameters['lr'] = lr 53 | self.hyperparameters['num_samples'] = num_samples 54 | self.hyperparameters['momentum'] = momentum 55 | self.hyperparameters['solver'] = solver 56 | self.hyperparameters['num_epochs'] = num_epochs 57 | self.hyperparameters['reduce_on_plateau'] = reduce_on_plateau 58 | self.hyperparameters["prior_mean"] = prior_mean 59 | self.hyperparameters["prior_variance"] = prior_variance 60 | self.hyperparameters["num_neurons"] = num_neurons 61 | self.hyperparameters["bert_path"] = bert_path 62 | self.hyperparameters["num_layers"] = num_layers 63 | self.hyperparameters["bert_model"] = bert_model 64 | self.use_partitions = use_partitions 65 | 66 | hidden_sizes = tuple([num_neurons for _ in range(num_layers)]) 67 | self.hyperparameters['hidden_sizes'] = tuple(hidden_sizes) 68 | 69 | self.model = None 70 | self.vocab = None 71 | 72 | def train_model(self, dataset, hyperparameters=None, top_words=10): 73 | """ 74 | trains CTM model 75 | :param dataset: octis Dataset for training the model 76 | :param hyperparameters: dict, with optionally) the following information: 77 | :param top_words: number of top-n words of the topics (default 10) 78 | """ 79 | if hyperparameters is None: 80 | hyperparameters = {} 81 | 82 | self.set_params(hyperparameters) 83 | self.vocab = dataset.vocab 84 | 85 | if self.use_partitions: 86 | train, validation, test = dataset.train_corpus, dataset.dev_corpus, dataset.test_corpus 87 | 88 | data_corpus_train = [' '.join(i) for i in train] 89 | data_corpus_test = [' '.join(i) for i in test] 90 | data_corpus_validation = [' '.join(i) for i in validation] 91 | 92 | x_train, x_test, x_valid, input_size = self.preprocess( 93 | self.vocab, data_corpus_train, test=data_corpus_test, validation=data_corpus_validation, 94 | bert_train_path=self.hyperparameters['bert_path'] + "_train.pkl", 95 | bert_test_path=self.hyperparameters['bert_path'] + "_test.pkl", 96 | bert_val_path=self.hyperparameters['bert_path'] + "_val.pkl", 97 | bert_model=self.hyperparameters["bert_model"]) 98 | self.model = ctm.CTM(input_size=input_size, bert_input_size=x_train.X_bert.shape[1], model_type='prodLDA', 99 | num_topics=self.hyperparameters['num_topics'], dropout=self.hyperparameters['dropout'], 100 | activation=self.hyperparameters['activation'], lr=self.hyperparameters['lr'], 101 | 
inference_type=self.hyperparameters['inference_type'], 102 | hidden_sizes=self.hyperparameters['hidden_sizes'], 103 | solver=self.hyperparameters['solver'], 104 | momentum=self.hyperparameters['momentum'], 105 | num_epochs=self.hyperparameters['num_epochs'], 106 | learn_priors=self.hyperparameters['learn_priors'], 107 | batch_size=self.hyperparameters['batch_size'], 108 | num_samples=self.hyperparameters['num_samples'], 109 | topic_prior_mean=self.hyperparameters["prior_mean"], 110 | reduce_on_plateau=self.hyperparameters['reduce_on_plateau'], 111 | topic_prior_variance=self.hyperparameters["prior_variance"]) 112 | self.model.fit(x_train, x_valid, verbose=False) 113 | result = self.inference(x_test) 114 | return result 115 | 116 | else: 117 | data_corpus = [' '.join(i) for i in dataset.train_corpus()] 118 | x_train, input_size = self.preprocess( 119 | self.vocab, train=data_corpus, bert_train_path=self.hyperparameters['bert_path'] + "_train.pkl", 120 | bert_model=self.hyperparameters["bert_model"]) 121 | 122 | self.model = ctm.CTM(input_size=input_size, bert_input_size=x_train.X_bert.shape[1], model_type='prodLDA', 123 | num_topics=self.hyperparameters['num_topics'], dropout=self.hyperparameters['dropout'], 124 | activation=self.hyperparameters['activation'], lr=self.hyperparameters['lr'], 125 | inference_type=self.hyperparameters['inference_type'], 126 | hidden_sizes=self.hyperparameters['hidden_sizes'], solver=self.hyperparameters['solver'], 127 | momentum=self.hyperparameters['momentum'], num_epochs=self.hyperparameters['num_epochs'], 128 | learn_priors=self.hyperparameters['learn_priors'], 129 | batch_size=self.hyperparameters['batch_size'], 130 | num_samples=self.hyperparameters['num_samples'], 131 | topic_prior_mean=self.hyperparameters["prior_mean"], 132 | reduce_on_plateau=self.hyperparameters['reduce_on_plateau'], 133 | topic_prior_variance=self.hyperparameters["prior_variance"]) 134 | 135 | 136 | self.model.fit(x_train, None, verbose=False) 137 | result = self.model.get_info() 138 | return result 139 | 140 | def set_params(self, hyperparameters): 141 | for k in hyperparameters.keys(): 142 | if k in self.hyperparameters.keys() and k != 'hidden_sizes': 143 | self.hyperparameters[k] = hyperparameters.get(k, self.hyperparameters[k]) 144 | 145 | self.hyperparameters['hidden_sizes'] = tuple( 146 | [self.hyperparameters["num_neurons"] for _ in range(self.hyperparameters["num_layers"])]) 147 | 148 | def inference(self, x_test): 149 | assert isinstance(self.use_partitions, bool) and self.use_partitions 150 | results = self.model.predict(x_test) 151 | return results 152 | 153 | def partitioning(self, use_partitions=False): 154 | self.use_partitions = use_partitions 155 | 156 | @staticmethod 157 | def preprocess(vocab, train, bert_model, test=None, validation=None, 158 | bert_train_path=None, bert_test_path=None, bert_val_path=None): 159 | vocab2id = {w: i for i, w in enumerate(vocab)} 160 | vec = CountVectorizer( 161 | vocabulary=vocab2id, token_pattern=r'(?u)\b[\w+|\-]+\b') 162 | entire_dataset = train.copy() 163 | if test is not None: 164 | entire_dataset.extend(test) 165 | if validation is not None: 166 | entire_dataset.extend(validation) 167 | 168 | vec.fit(entire_dataset) 169 | idx2token = {v: k for (k, v) in vec.vocabulary_.items()} 170 | 171 | x_train = vec.transform(train) 172 | b_train = CTM.load_bert_data(bert_train_path, train, bert_model) 173 | 174 | train_data = dataset.CTMDataset(x_train.toarray(), b_train, idx2token) 175 | input_size = len(idx2token.keys()) 176 | 177 | if 
test is not None and validation is not None: 178 | x_test = vec.transform(test) 179 | b_test = CTM.load_bert_data(bert_test_path, test, bert_model) 180 | test_data = dataset.CTMDataset(x_test.toarray(), b_test, idx2token) 181 | 182 | x_valid = vec.transform(validation) 183 | b_val = CTM.load_bert_data(bert_val_path, validation, bert_model) 184 | valid_data = dataset.CTMDataset(x_valid.toarray(), b_val, idx2token) 185 | return train_data, test_data, valid_data, input_size 186 | if test is None and validation is not None: 187 | x_valid = vec.transform(validation) 188 | b_val = CTM.load_bert_data(bert_val_path, validation, bert_model) 189 | valid_data = dataset.CTMDataset(x_valid.toarray(), b_val, idx2token) 190 | return train_data, valid_data, input_size 191 | if test is not None and validation is None: 192 | x_test = vec.transform(test) 193 | b_test = CTM.load_bert_data(bert_test_path, test, bert_model) 194 | test_data = dataset.CTMDataset(x_test.toarray(), b_test, idx2token) 195 | return train_data, test_data, input_size 196 | if test is None and validation is None: 197 | return train_data, input_size 198 | 199 | @staticmethod 200 | def load_bert_data(bert_path, texts, bert_model): 201 | if bert_path is not None: 202 | if os.path.exists(bert_path): 203 | bert_ouput = pkl.load(open(bert_path, 'rb')) 204 | else: 205 | bert_ouput = bert_embeddings_from_list(texts, bert_model) 206 | pkl.dump(bert_ouput, open(bert_path, 'wb')) 207 | else: 208 | bert_ouput = bert_embeddings_from_list(texts, bert_model) 209 | return bert_ouput 210 | -------------------------------------------------------------------------------- /models/LDA.py: -------------------------------------------------------------------------------- 1 | from models.model import AbstractModel 2 | import numpy as np 3 | from gensim.models import ldamodel 4 | import gensim.corpora as corpora 5 | #import octis.configuration.citations as citations 6 | #import octis.configuration.defaults as defaults 7 | 8 | 9 | class LDA(AbstractModel): 10 | 11 | id2word = None 12 | id_corpus = None 13 | use_partitions = True 14 | update_with_test = False 15 | 16 | def __init__(self, num_topics=100, distributed=False, chunksize=2000, passes=1, update_every=1, alpha="symmetric", 17 | eta=None, decay=0.5, offset=1.0, eval_every=10, iterations=50, gamma_threshold=0.001, 18 | random_state=None): 19 | """ 20 | Initialize LDA model 21 | 22 | Parameters 23 | ---------- 24 | num_topics (int, optional) – The number of requested latent topics to be 25 | extracted from the training corpus. 26 | 27 | distributed (bool, optional) – Whether distributed computing should be 28 | used to accelerate training. 29 | 30 | chunksize (int, optional) – Number of documents to be used in each 31 | training chunk. 32 | 33 | passes (int, optional) – Number of passes through the corpus during 34 | training. 35 | 36 | update_every (int, optional) – Number of documents to be iterated 37 | through for each update. Set to 0 for batch learning, > 1 for 38 | online iterative learning. 39 | 40 | alpha ({numpy.ndarray, str}, optional) – Can be set to an 1D array of 41 | length equal to the number of expected topics that expresses our 42 | a-priori belief for the each topics’ probability. Alternatively 43 | default prior selecting strategies can be employed by supplying 44 | a string: 45 | 46 | ’asymmetric’: Uses a fixed normalized asymmetric prior of 47 | 1.0 / topicno. 48 | 49 | ’auto’: Learns an asymmetric prior from the corpus 50 | (not available if distributed==True). 
51 | 52 | eta ({float, np.array, str}, optional) – A-priori belief on word 53 | probability, this can be: 54 | 55 | scalar for a symmetric prior over topic/word probability, 56 | 57 | vector of length num_words to denote an asymmetric user defined 58 | probability for each word, 59 | 60 | matrix of shape (num_topics, num_words) to assign a probability 61 | for each word-topic combination, 62 | 63 | the string ‘auto’ to learn the asymmetric prior from the data. 64 | 65 | decay (float, optional) – A number between (0.5, 1] to weight what 66 | percentage of the previous lambda value is forgotten when each new 67 | document is examined. 68 | 69 | offset (float, optional) – Hyper-parameter that controls how much 70 | we will slow down the first steps the first few iterations. 71 | 72 | eval_every (int, optional) – Log perplexity is estimated every 73 | that many updates. Setting this to one slows down training by ~2x. 74 | 75 | iterations (int, optional) – Maximum number of iterations through the 76 | corpus when inferring the topic distribution of a corpus. 77 | 78 | gamma_threshold (float, optional) – Minimum change in the value of the 79 | gamma parameters to continue iterating. 80 | 81 | random_state ({np.random.RandomState, int}, optional) – Either a 82 | randomState object or a seed to generate one. Useful for reproducibility. 83 | 84 | 85 | """ 86 | super().__init__() 87 | self.hyperparameters = dict() 88 | self.hyperparameters["num_topics"] = num_topics 89 | self.hyperparameters["distributed"] = distributed 90 | self.hyperparameters["chunksize"] = chunksize 91 | self.hyperparameters["passes"] = passes 92 | self.hyperparameters["update_every"] = update_every 93 | self.hyperparameters["alpha"] = alpha 94 | self.hyperparameters["eta"] = eta 95 | self.hyperparameters["decay"] = decay 96 | self.hyperparameters["offset"] = offset 97 | self.hyperparameters["eval_every"] = eval_every 98 | self.hyperparameters["iterations"] = iterations 99 | self.hyperparameters["gamma_threshold"] = gamma_threshold 100 | self.hyperparameters["random_state"] = random_state 101 | 102 | # def info(self): 103 | # """ 104 | # Returns model informations 105 | # """ 106 | # return { 107 | # "citation": citations.models_LDA, 108 | # "name": "LDA, Latent Dirichlet Allocation" 109 | # } 110 | 111 | # def hyperparameters_info(self): 112 | # """ 113 | # Returns hyperparameters informations 114 | # """ 115 | # return defaults.LDA_hyperparameters_info 116 | 117 | def set_hyperparameters(self, **kwargs): 118 | """ 119 | Set model hyperparameters 120 | """ 121 | super().set_hyperparameters(**kwargs) 122 | # Allow alpha to be a float in case of symmetric alpha 123 | if "alpha" in kwargs: 124 | if isinstance(kwargs["alpha"], float): 125 | self.hyperparameters["alpha"] = [ 126 | kwargs["alpha"] 127 | ] * self.hyperparameters["num_topics"] 128 | 129 | def partitioning(self, use_partitions, update_with_test=False): 130 | """ 131 | Handle the partitioning system to use and reset the model to perform 132 | new evaluations 133 | 134 | Parameters 135 | ---------- 136 | use_partitions: True if train/set partitioning is needed, False 137 | otherwise 138 | update_with_test: True if the model should be updated with the test set, 139 | False otherwise 140 | """ 141 | self.use_partitions = use_partitions 142 | self.update_with_test = update_with_test 143 | self.id2word = None 144 | self.id_corpus = None 145 | 146 | def train_model(self, dataset, hyperparams=None, top_words=10): 147 | """ 148 | Train the model and return output 149 | 150 | 
Parameters 151 | ---------- 152 | dataset : dataset to use to build the model 153 | hyperparams : hyperparameters to build the model 154 | top_words : if greater than 0 returns the most significant words for each topic in the output 155 | (Default True) 156 | Returns 157 | ------- 158 | result : dictionary with up to 3 entries, 159 | 'topics', 'topic-word-matrix' and 160 | 'topic-document-matrix' 161 | """ 162 | if hyperparams is None: 163 | hyperparams = {} 164 | 165 | if self.use_partitions: 166 | train_corpus = dataset.train_corpus 167 | test_corpus = dataset.test_corpus 168 | else: 169 | train_corpus = dataset.train_corpus + dataset.test_corpus 170 | 171 | if self.id2word is None: 172 | _corpus = dataset.train_corpus + dataset.test_corpus 173 | self.id2word = corpora.Dictionary([doc.split() for doc in _corpus]) 174 | 175 | if self.id_corpus is None: 176 | self.id_corpus = [self.id2word.doc2bow(document.split()) 177 | for document in train_corpus] 178 | 179 | if "num_topics" not in hyperparams: 180 | hyperparams["num_topics"] = self.hyperparameters["num_topics"] 181 | 182 | # Allow alpha to be a float in case of symmetric alpha 183 | if "alpha" in hyperparams: 184 | if isinstance(hyperparams["alpha"], float): 185 | hyperparams["alpha"] = [ 186 | hyperparams["alpha"] 187 | ] * hyperparams["num_topics"] 188 | 189 | hyperparams["corpus"] = self.id_corpus 190 | hyperparams["id2word"] = self.id2word 191 | self.hyperparameters.update(hyperparams) 192 | 193 | self.trained_model = ldamodel.LdaModel(**self.hyperparameters) 194 | 195 | result = {} 196 | 197 | result["topic-word-matrix"] = self.trained_model.get_topics() 198 | 199 | if top_words > 0: 200 | topics_output = [] 201 | for topic in result["topic-word-matrix"]: 202 | top_k = np.argsort(topic)[-top_words:] 203 | top_k_words = list(reversed([self.id2word[i] for i in top_k])) 204 | topics_output.append(top_k_words) 205 | result["topics"] = topics_output 206 | 207 | result["topic-document-matrix"] = self._get_topic_document_matrix() 208 | 209 | if self.use_partitions: 210 | new_corpus = [self.id2word.doc2bow(document.split()) for document in test_corpus] 211 | if self.update_with_test: 212 | self.trained_model.update(new_corpus) 213 | self.id_corpus.extend(new_corpus) 214 | 215 | result["test-topic-word-matrix"] = self.trained_model.get_topics() 216 | 217 | if top_words > 0: 218 | topics_output = [] 219 | for topic in result["test-topic-word-matrix"]: 220 | top_k = np.argsort(topic)[-top_words:] 221 | top_k_words = list( 222 | reversed([self.id2word[i] for i in top_k])) 223 | topics_output.append(top_k_words) 224 | result["test-topics"] = topics_output 225 | 226 | result["test-topic-document-matrix"] = self._get_topic_document_matrix() 227 | 228 | else: 229 | test_document_topic_matrix = [] 230 | for document in new_corpus: 231 | document_topics_tuples = self.trained_model[document] 232 | document_topics = np.zeros( 233 | self.hyperparameters["num_topics"]) 234 | for single_tuple in document_topics_tuples: 235 | document_topics[single_tuple[0]] = single_tuple[1] 236 | 237 | test_document_topic_matrix.append(document_topics) 238 | result["test-topic-document-matrix"] = np.array( 239 | test_document_topic_matrix).transpose() 240 | return result 241 | 242 | def _get_topics_words(self, topk): 243 | """ 244 | Return the most significative words for each topic. 
245 | """ 246 | topic_terms = [] 247 | for i in range(self.hyperparameters["num_topics"]): 248 | topic_words_list = [] 249 | for word_tuple in self.trained_model.get_topic_terms(i, topk): 250 | topic_words_list.append(self.id2word[word_tuple[0]]) 251 | topic_terms.append(topic_words_list) 252 | return topic_terms 253 | 254 | def _get_topic_document_matrix(self): 255 | """ 256 | Return the topic representation of the 257 | corpus 258 | """ 259 | doc_topic_tuples = [] 260 | for document in self.id_corpus: 261 | doc_topic_tuples.append( 262 | self.trained_model.get_document_topics(document, 263 | minimum_probability=0)) 264 | 265 | topic_document = np.zeros(( 266 | self.hyperparameters["num_topics"], 267 | len(doc_topic_tuples))) 268 | 269 | for ndoc in range(len(doc_topic_tuples)): 270 | document = doc_topic_tuples[ndoc] 271 | for topic_tuple in document: 272 | topic_document[topic_tuple[0]][ndoc] = topic_tuple[1] 273 | return topic_document -------------------------------------------------------------------------------- /models/NMF.py: -------------------------------------------------------------------------------- 1 | from models.model import AbstractModel 2 | import numpy as np 3 | from gensim.models import nmf 4 | import gensim.corpora as corpora 5 | 6 | 7 | class NMF(AbstractModel): 8 | 9 | def __init__(self, num_topics=100, chunksize=2000, passes=1, kappa=1.0, minimum_probability=0.01, w_max_iter=200, 10 | w_stop_condition=0.0001, h_max_iter=50, h_stop_condition=0.001, eval_every=10, normalize=True, 11 | random_state=None, use_partitions=True): 12 | """ 13 | Initialize NMF model 14 | Parameters 15 | ---------- 16 | num_topics (int, optional) – Number of topics to extract. 17 | chunksize (int, optional) – Number of documents to be used in each 18 | training chunk. 19 | passes (int, optional) – Number of full passes over the 20 | training corpus. Leave at default passes=1 if your input 21 | is an iterator. 22 | kappa (float, optional) – Gradient descent step size. 23 | Larger value makes the model train faster, but could 24 | lead to non-convergence if set too large. 25 | minimum_probability – If normalize is True, topics with 26 | smaller probabilities are filtered out. If normalize is False, 27 | topics with smaller factors are filtered out. If set to None, 28 | a value of 1e-8 is used to prevent 0s. 29 | w_max_iter (int, optional) – Maximum number of iterations to 30 | train W per each batch. 31 | w_stop_condition (float, optional) – If error difference gets less 32 | than that, training of W stops for the current batch. 33 | h_max_iter (int, optional) – Maximum number of iterations to train 34 | h per each batch. 35 | h_stop_condition (float) – If error difference gets less than that, 36 | training of h stops for the current batch. 37 | eval_every (int, optional) – Number of batches after which l2 norm 38 | of (v - Wh) is computed. Decreases performance if set too low. 39 | normalize (bool or None, optional) – Whether to normalize the result. 40 | random_state ({np.random.RandomState, int}, optional) – Seed for 41 | random generator. Needed for reproducibility. 
42 | """ 43 | super().__init__() 44 | self.hyperparameters["num_topics"] = num_topics 45 | self.hyperparameters["chunksize"] = chunksize 46 | self.hyperparameters["passes"] = passes 47 | self.hyperparameters["kappa"] = kappa 48 | self.hyperparameters["minimum_probability"] = minimum_probability 49 | self.hyperparameters["w_max_iter"] = w_max_iter 50 | self.hyperparameters["w_stop_condition"] = w_stop_condition 51 | self.hyperparameters["h_max_iter"] = h_max_iter 52 | self.hyperparameters["h_stop_condition"] = h_stop_condition 53 | self.hyperparameters["eval_every"] = eval_every 54 | self.hyperparameters["normalize"] = normalize 55 | self.hyperparameters["random_state"] = random_state 56 | self.use_partitions = use_partitions 57 | 58 | self.id2word = None 59 | self.id_corpus = None 60 | self.update_with_test = False 61 | 62 | def info(self): 63 | """ 64 | Returns model informations 65 | """ 66 | return { 67 | "citation": citations.models_NMF, 68 | "name": "NMF, Non-negative Matrix Factorization" 69 | } 70 | 71 | def hyperparameters_info(self): 72 | """ 73 | Returns hyperparameters informations 74 | """ 75 | return defaults.NMF_gensim_hyperparameters_info 76 | 77 | def partitioning(self, use_partitions, update_with_test=False): 78 | """ 79 | Handle the partitioning system to use and reset the model to perform 80 | new evaluations 81 | Parameters 82 | ---------- 83 | use_partitions: True if train/set partitioning is needed, False 84 | otherwise 85 | update_with_test: True if the model should be updated with the test set, 86 | False otherwise 87 | """ 88 | self.use_partitions = use_partitions 89 | self.update_with_test = update_with_test 90 | self.id2word = None 91 | self.id_corpus = None 92 | 93 | def train_model(self, dataset, hyperparameters=None, top_words=10): 94 | """ 95 | Train the model and return output 96 | Parameters 97 | ---------- 98 | dataset : dataset to use to build the model 99 | hyperparameters : hyperparameters to build the model 100 | top_words : if greather than 0 returns the most significant words 101 | for each topic in the output 102 | Default True 103 | Returns 104 | ------- 105 | result : dictionary with up to 3 entries, 106 | 'topics', 'topic-word-matrix' and 107 | 'topic-document-matrix' 108 | """ 109 | if hyperparameters is None: 110 | hyperparameters = {} 111 | if self.use_partitions: 112 | partition = [dataset.train_corpus, dataset.test_corpus] 113 | else: 114 | partition = [dataset.train, []] 115 | 116 | if self.id2word is None: 117 | _corpus = dataset.train_corpus + dataset.test_corpus 118 | self.id2word = corpora.Dictionary([doc.split() for doc in _corpus]) 119 | if self.id_corpus is None: 120 | self.id_corpus = [self.id2word.doc2bow( 121 | document.split()) for document in partition[0]] 122 | 123 | hyperparameters["corpus"] = self.id_corpus 124 | hyperparameters["id2word"] = self.id2word 125 | self.hyperparameters.update(hyperparameters) 126 | 127 | self.trained_model = nmf.Nmf(**self.hyperparameters) 128 | 129 | result = {} 130 | 131 | result["topic-word-matrix"] = self.trained_model.get_topics() 132 | 133 | if top_words > 0: 134 | topics_output = [] 135 | for topic in result["topic-word-matrix"]: 136 | top_k = np.argsort(topic)[-top_words:] 137 | top_k_words = list(reversed([self.id2word[i] for i in top_k])) 138 | topics_output.append(top_k_words) 139 | result["topics"] = topics_output 140 | 141 | result["topic-document-matrix"] = self._get_topic_document_matrix() 142 | 143 | if self.use_partitions: 144 | new_corpus = [self.id2word.doc2bow( 145 | 
document.split()) for document in partition[1]] 146 | if self.update_with_test: 147 | self.trained_model.update(new_corpus) 148 | self.id_corpus.extend(new_corpus) 149 | 150 | result["test-topic-word-matrix"] = self.trained_model.get_topics() 151 | 152 | if top_words > 0: 153 | topics_output = [] 154 | for topic in result["test-topic-word-matrix"]: 155 | top_k = np.argsort(topic)[-top_words:] 156 | top_k_words = list( 157 | reversed([self.id2word[i] for i in top_k])) 158 | topics_output.append(top_k_words) 159 | result["test-topics"] = topics_output 160 | 161 | result["test-topic-document-matrix"] = self._get_topic_document_matrix() 162 | else: 163 | result["test-topic-document-matrix"] = self._get_topic_document_matrix(new_corpus) 164 | return result 165 | 166 | def _get_topics_words(self, topk): 167 | """ 168 | Return the most significative words for each topic. 169 | """ 170 | topic_terms = [] 171 | for i in range(self.hyperparameters["num_topics"]): 172 | topic_words_list = [] 173 | for word_tuple in self.trained_model.get_topic_terms(i, topk): 174 | topic_words_list.append(self.id2word[word_tuple[0]]) 175 | topic_terms.append(topic_words_list) 176 | return topic_terms 177 | 178 | def _get_topic_document_matrix(self, test_corpus=None): 179 | """ 180 | Return the topic representation of the 181 | corpus 182 | """ 183 | doc_topic_tuples = [] 184 | 185 | if test_corpus is None: 186 | for document in self.id_corpus: 187 | doc_topic_tuples.append( 188 | self.trained_model.get_document_topics(document, minimum_probability=0)) 189 | else: 190 | for document in test_corpus: 191 | doc_topic_tuples.append( 192 | self.trained_model.get_document_topics(document, minimum_probability=0)) 193 | topic_document = np.zeros(( 194 | self.hyperparameters["num_topics"], 195 | len(doc_topic_tuples))) 196 | 197 | for ndoc in range(len(doc_topic_tuples)): 198 | document = doc_topic_tuples[ndoc] 199 | for topic_tuple in document: 200 | topic_document[topic_tuple[0]][ndoc] = topic_tuple[1] 201 | return topic_document 202 | 203 | -------------------------------------------------------------------------------- /models/contextualized_topic_models/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020, Federico Bianchi 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | 23 | -------------------------------------------------------------------------------- /models/contextualized_topic_models/__init__.py: -------------------------------------------------------------------------------- 1 | """Top-level package for Contextualized Topic Models.""" 2 | 3 | __author__ = """Federico Bianchi""" 4 | __email__ = 'f.bianchi@unibocconi.it' 5 | __version__ = '1.7.0' 6 | -------------------------------------------------------------------------------- /models/contextualized_topic_models/contextualized_topic_models.py: -------------------------------------------------------------------------------- 1 | """Main module.""" 2 | -------------------------------------------------------------------------------- /models/contextualized_topic_models/datasets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DSInCenter/pySTTM/9771af0dfa85a2996fbb90122ae22649cd076a51/models/contextualized_topic_models/datasets/__init__.py -------------------------------------------------------------------------------- /models/contextualized_topic_models/datasets/dataset.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.utils.data import Dataset 3 | import scipy.sparse 4 | 5 | 6 | class CTMDataset(Dataset): 7 | 8 | """Class to load BOW dataset.""" 9 | 10 | def __init__(self, X, X_bert, idx2token): 11 | """ 12 | Args 13 | X : array-like, shape=(n_samples, n_features) 14 | Document word matrix. 15 | """ 16 | if X.shape[0] != len(X_bert): 17 | raise Exception("Wait! BoW and Contextual Embeddings have different sizes! " 18 | "You might want to check if the BoW preparation method has removed some documents. ") 19 | 20 | self.X = X 21 | self.X_bert = X_bert 22 | self.idx2token = idx2token 23 | 24 | def __len__(self): 25 | """Return length of dataset.""" 26 | return self.X.shape[0] 27 | 28 | def __getitem__(self, i): 29 | """Return sample from dataset at index i.""" 30 | if type(self.X[i]) == scipy.sparse.csr.csr_matrix: 31 | X = torch.FloatTensor(self.X[i].todense()) 32 | X_bert = torch.FloatTensor(self.X_bert[i]) 33 | else: 34 | X = torch.FloatTensor(self.X[i]) 35 | X_bert = torch.FloatTensor(self.X_bert[i]) 36 | 37 | return {'X': X, 'X_bert': X_bert} 38 | 39 | 40 | -------------------------------------------------------------------------------- /models/contextualized_topic_models/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DSInCenter/pySTTM/9771af0dfa85a2996fbb90122ae22649cd076a51/models/contextualized_topic_models/models/__init__.py -------------------------------------------------------------------------------- /models/contextualized_topic_models/models/ctm.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import os 3 | from collections import defaultdict 4 | 5 | import numpy as np 6 | import torch 7 | from torch import optim 8 | from torch.optim.lr_scheduler import ReduceLROnPlateau 9 | from torch.utils.data import DataLoader 10 | 11 | from models.contextualized_topic_models.networks.decoding_network import DecoderNetwork 12 | from models.pytorchtools import EarlyStopping 13 | 14 | 15 | class CTM(object): 16 | """Class to train the contextualized topic model 17 | """ 18 | 19 | def __init__(self, input_size, bert_input_size, inference_type="zeroshot", num_topics=10, model_type='prodLDA', 20 | 
hidden_sizes=(100, 100), activation='softplus', dropout=0.2, learn_priors=True, batch_size=64, 21 | lr=2e-3, momentum=0.99, solver='adam', num_epochs=100, num_samples=10, 22 | reduce_on_plateau=False, topic_prior_mean=0.0, topic_prior_variance=None, num_data_loader_workers=0): 23 | """ 24 | :param input_size: int, dimension of input 25 | :param bert_input_size: int, dimension of input that comes from BERT embeddings 26 | :param inference_type: string, you can choose between the contextual model and the combined model 27 | :param num_topics: int, number of topic components, (default 10) 28 | :param model_type: string, 'prodLDA' or 'LDA' (default 'prodLDA') 29 | :param hidden_sizes: tuple, length = n_layers, (default (100, 100)) 30 | :param activation: string, 'softplus', 'relu', 'sigmoid', 'swish', 'tanh', 'leakyrelu', 'rrelu', 'elu', 31 | 'selu' (default 'softplus') 32 | :param dropout: float, dropout to use (default 0.2) 33 | :param learn_priors: bool, make priors a learnable parameter (default True) 34 | :param batch_size: int, size of batch to use for training (default 64) 35 | :param lr: float, learning rate to use for training (default 2e-3) 36 | :param momentum: float, momentum to use for training (default 0.99) 37 | :param solver: string, optimizer 'adam' or 'sgd' (default 'adam') 38 | :param num_samples: int, number of times theta needs to be sampled 39 | :param num_epochs: int, number of epochs to train for, (default 100) 40 | :param reduce_on_plateau: bool, reduce learning rate by 10x on plateau of 10 epochs (default False) 41 | :param num_data_loader_workers: int, number of data loader workers (default cpu_count). set it to 0 if you are using Windows 42 | """ 43 | 44 | assert isinstance(input_size, int) and input_size > 0, \ 45 | "input_size must by type int > 0." 46 | assert (isinstance(num_topics, int) or isinstance(num_topics, np.int64)) and num_topics > 0, \ 47 | "num_topics must by type int > 0." 48 | assert model_type in ['LDA', 'prodLDA'], \ 49 | "model must be 'LDA' or 'prodLDA'." 50 | assert isinstance(hidden_sizes, tuple), \ 51 | "hidden_sizes must be type tuple." 52 | assert activation in ['softplus', 'relu', 'sigmoid', 'swish', 'tanh', 'leakyrelu', 53 | 'rrelu', 'elu', 'selu'], \ 54 | "activation must be 'softplus', 'relu', 'sigmoid', 'swish', 'leakyrelu'," \ 55 | " 'rrelu', 'elu', 'selu' or 'tanh'." 56 | assert dropout >= 0, "dropout must be >= 0." 57 | # assert isinstance(learn_priors, bool), "learn_priors must be boolean." 58 | assert isinstance(batch_size, int) and batch_size > 0, \ 59 | "batch_size must be int > 0." 60 | assert lr > 0, "lr must be > 0." 61 | assert isinstance(momentum, float) and momentum > 0 and momentum <= 1, \ 62 | "momentum must be 0 < float <= 1." 63 | assert solver in ['adagrad', 'adam', 'sgd', 'adadelta', 'rmsprop'], \ 64 | "solver must be 'adam', 'adadelta', 'sgd', 'rmsprop' or 'adagrad'" 65 | assert isinstance(reduce_on_plateau, bool), \ 66 | "reduce_on_plateau must be type bool." 
67 | assert isinstance(topic_prior_mean, float), \ 68 | "topic_prior_mean must be type float" 69 | # and topic_prior_variance >= 0, \ 70 | # assert isinstance(topic_prior_variance, float), \ 71 | # "topic prior_variance must be type float" 72 | 73 | self.input_size = input_size 74 | self.num_topics = num_topics 75 | self.model_type = model_type 76 | self.hidden_sizes = hidden_sizes 77 | self.activation = activation 78 | self.dropout = dropout 79 | self.learn_priors = learn_priors 80 | self.batch_size = batch_size 81 | self.lr = lr 82 | self.num_samples = num_samples 83 | self.bert_size = bert_input_size 84 | self.momentum = momentum 85 | self.solver = solver 86 | self.num_epochs = num_epochs 87 | self.reduce_on_plateau = reduce_on_plateau 88 | self.num_data_loader_workers = num_data_loader_workers 89 | self.topic_prior_mean = topic_prior_mean 90 | self.topic_prior_variance = topic_prior_variance 91 | # init inference avitm network 92 | self.model = DecoderNetwork( 93 | input_size, self.bert_size, inference_type, num_topics, model_type, hidden_sizes, activation, 94 | dropout, self.learn_priors, self.topic_prior_mean, self.topic_prior_variance) 95 | self.early_stopping = EarlyStopping(patience=5, verbose=False) 96 | # init optimizer 97 | if self.solver == 'adam': 98 | self.optimizer = optim.Adam(self.model.parameters(), lr=lr, betas=(self.momentum, 0.99)) 99 | elif self.solver == 'sgd': 100 | self.optimizer = optim.SGD(self.model.parameters(), lr=lr, momentum=self.momentum) 101 | elif self.solver == 'adagrad': 102 | self.optimizer = optim.Adagrad(self.model.parameters(), lr=lr) 103 | elif self.solver == 'adadelta': 104 | self.optimizer = optim.Adadelta(self.model.parameters(), lr=lr) 105 | elif self.solver == 'rmsprop': 106 | self.optimizer = optim.RMSprop(self.model.parameters(), lr=lr, momentum=self.momentum) 107 | # init lr scheduler 108 | if self.reduce_on_plateau: 109 | self.scheduler = ReduceLROnPlateau(self.optimizer, patience=10) 110 | 111 | # performance attributes 112 | self.best_loss_train = float('inf') 113 | 114 | # training attributes 115 | self.model_dir = None 116 | self.train_data = None 117 | self.nn_epoch = None 118 | 119 | # learned topics 120 | self.best_components = None 121 | 122 | # Use cuda if available 123 | if torch.cuda.is_available(): 124 | self.USE_CUDA = True 125 | else: 126 | self.USE_CUDA = False 127 | if self.USE_CUDA: 128 | self.model = self.model.cuda() 129 | 130 | def _loss(self, inputs, word_dists, prior_mean, prior_variance, 131 | posterior_mean, posterior_variance, posterior_log_variance): 132 | # KL term 133 | # var division term 134 | var_division = torch.sum(posterior_variance / prior_variance, dim=1) 135 | # diff means term 136 | diff_means = prior_mean - posterior_mean 137 | diff_term = torch.sum( 138 | (diff_means * diff_means) / prior_variance, dim=1) 139 | # logvar det division term 140 | logvar_det_division = \ 141 | prior_variance.log().sum() - posterior_log_variance.sum(dim=1) 142 | # combine terms 143 | KL = 0.5 * (var_division + diff_term - self.num_topics + logvar_det_division) 144 | # Reconstruction term 145 | RL = -torch.sum(inputs * torch.log(word_dists + 1e-10), dim=1) 146 | loss = KL + RL 147 | 148 | return loss.sum() 149 | 150 | def _train_epoch(self, loader): 151 | """Train epoch.""" 152 | self.model.train() 153 | train_loss = 0 154 | samples_processed = 0 155 | topic_doc_list = [] 156 | for batch_samples in loader: 157 | # batch_size x vocab_size 158 | X = batch_samples['X'] 159 | X = X.reshape(X.shape[0], -1) 160 | X_bert = 
batch_samples['X_bert'] 161 | if self.USE_CUDA: 162 | X = X.cuda() 163 | X_bert = X_bert.cuda() 164 | 165 | # forward pass 166 | self.model.zero_grad() 167 | prior_mean, prior_variance, \ 168 | posterior_mean, posterior_variance, posterior_log_variance, \ 169 | word_dists, topic_word, topic_document = self.model(X, X_bert) 170 | topic_doc_list.extend(topic_document) 171 | 172 | # backward pass 173 | loss = self._loss(X, word_dists, prior_mean, prior_variance, 174 | posterior_mean, posterior_variance, posterior_log_variance) 175 | loss.backward() 176 | self.optimizer.step() 177 | 178 | # compute train loss 179 | samples_processed += X.size()[0] 180 | train_loss += loss.item() 181 | 182 | train_loss /= samples_processed 183 | 184 | return samples_processed, train_loss, topic_word, topic_doc_list 185 | 186 | def _validation(self, loader): 187 | """Train epoch.""" 188 | self.model.eval() 189 | val_loss = 0 190 | samples_processed = 0 191 | for batch_samples in loader: 192 | # batch_size x vocab_size 193 | X = batch_samples['X'] 194 | X = X.reshape(X.shape[0], -1) 195 | X_bert = batch_samples['X_bert'] 196 | 197 | if self.USE_CUDA: 198 | X = X.cuda() 199 | X_bert = X_bert.cuda() 200 | 201 | # forward pass 202 | self.model.zero_grad() 203 | prior_mean, prior_variance, \ 204 | posterior_mean, posterior_variance, posterior_log_variance, \ 205 | word_dists, topic_word, topic_document = self.model(X, X_bert) 206 | 207 | loss = self._loss(X, word_dists, prior_mean, prior_variance, 208 | posterior_mean, posterior_variance, posterior_log_variance) 209 | 210 | # compute train loss 211 | samples_processed += X.size()[0] 212 | val_loss += loss.item() 213 | 214 | val_loss /= samples_processed 215 | 216 | return samples_processed, val_loss 217 | 218 | def fit(self, train_dataset, validation_dataset=None, save_dir=None, verbose=True): 219 | """ 220 | Train the CTM model. 221 | 222 | :param train_dataset: PyTorch Dataset class for training data. 223 | :param validation_dataset: PyTorch Dataset class for validation data 224 | :param save_dir: directory to save checkpoint models to. 
225 | :param verbose: verbose 226 | """ 227 | # Print settings to output file 228 | if verbose: 229 | print("Settings: \n\ 230 | N Components: {}\n\ 231 | Topic Prior Mean: {}\n\ 232 | Topic Prior Variance: {}\n\ 233 | Model Type: {}\n\ 234 | Hidden Sizes: {}\n\ 235 | Activation: {}\n\ 236 | Dropout: {}\n\ 237 | Learn Priors: {}\n\ 238 | Learning Rate: {}\n\ 239 | Momentum: {}\n\ 240 | Reduce On Plateau: {}\n\ 241 | Save Dir: {}".format( 242 | self.num_topics, self.topic_prior_mean, 243 | self.topic_prior_variance, self.model_type, 244 | self.hidden_sizes, self.activation, self.dropout, self.learn_priors, 245 | self.lr, self.momentum, self.reduce_on_plateau, save_dir)) 246 | 247 | self.model_dir = save_dir 248 | self.train_data = train_dataset 249 | self.validation_data = validation_dataset 250 | 251 | train_loader = DataLoader(self.train_data, batch_size=self.batch_size, shuffle=True, 252 | num_workers=self.num_data_loader_workers) 253 | 254 | # init training variables 255 | train_loss = 0 256 | samples_processed = 0 257 | 258 | # train loop 259 | for epoch in range(self.num_epochs): 260 | self.nn_epoch = epoch 261 | # train epoch 262 | s = datetime.datetime.now() 263 | sp, train_loss, topic_word, topic_document = self._train_epoch(train_loader) 264 | samples_processed += sp 265 | e = datetime.datetime.now() 266 | 267 | if verbose: 268 | print("Epoch: [{}/{}]\tSamples: [{}/{}]\tTrain Loss: {}\tTime: {}".format( 269 | epoch + 1, self.num_epochs, samples_processed, 270 | len(self.train_data) * self.num_epochs, train_loss, e - s)) 271 | 272 | self.best_components = self.model.beta 273 | self.final_topic_word = topic_word 274 | self.final_topic_document = topic_document 275 | self.best_loss_train = train_loss 276 | if self.validation_data is not None: 277 | validation_loader = DataLoader( 278 | self.validation_data, batch_size=self.batch_size, shuffle=True, 279 | num_workers=self.num_data_loader_workers) 280 | # train epoch 281 | s = datetime.datetime.now() 282 | val_samples_processed, val_loss = self._validation(validation_loader) 283 | e = datetime.datetime.now() 284 | 285 | if verbose: 286 | print("Epoch: [{}/{}]\tSamples: [{}/{}]\tValidation Loss: {}\tTime: {}".format( 287 | epoch + 1, self.num_epochs, val_samples_processed, 288 | len(self.validation_data) * self.num_epochs, val_loss, e - s)) 289 | 290 | if np.isnan(val_loss) or np.isnan(train_loss): 291 | break 292 | else: 293 | self.early_stopping(val_loss, self.model) 294 | if self.early_stopping.early_stop: 295 | if verbose: 296 | print("Early stopping") 297 | if save_dir is not None: 298 | self.save(save_dir) 299 | break 300 | 301 | def predict(self, dataset): 302 | """Predict input.""" 303 | self.model.eval() 304 | 305 | loader = DataLoader(dataset, batch_size=self.batch_size, shuffle=False, 306 | num_workers=self.num_data_loader_workers) 307 | 308 | topic_document_mat = [] 309 | with torch.no_grad(): 310 | for batch_samples in loader: 311 | # batch_size x vocab_size 312 | X = batch_samples['X'] 313 | X = X.reshape(X.shape[0], -1) 314 | X_bert = batch_samples['X_bert'] 315 | 316 | if self.USE_CUDA: 317 | X = X.cuda() 318 | X_bert = X_bert.cuda() 319 | # forward pass 320 | self.model.zero_grad() 321 | _, _, _, _, _, _, _, topic_document = self.model(X, X_bert) 322 | topic_document_mat.append(topic_document) 323 | 324 | results = self.get_info() 325 | results['test-topic-document-matrix'] = np.asarray(self.get_thetas(dataset)).T 326 | 327 | return results 328 | 329 | def get_topic_word_mat(self): 330 | top_wor = 
self.final_topic_word.cpu().detach().numpy() 331 | return top_wor 332 | 333 | def get_topic_document_mat(self): 334 | top_doc = self.final_topic_document 335 | top_doc_arr = np.array([i.cpu().detach().numpy() for i in top_doc]) 336 | return top_doc_arr 337 | 338 | def get_topics(self, k=10): 339 | """ 340 | Retrieve topic words. 341 | 342 | Args 343 | k : (int) number of words to return per topic, default 10. 344 | """ 345 | assert k <= self.input_size, "k must be <= input size." 346 | component_dists = self.best_components 347 | topics = defaultdict(list) 348 | topics_list = [] 349 | if self.num_topics is not None: 350 | for i in range(self.num_topics): 351 | _, idxs = torch.topk(component_dists[i], k) 352 | component_words = [self.train_data.idx2token[idx] 353 | for idx in idxs.cpu().numpy()] 354 | topics[i] = component_words 355 | topics_list.append(component_words) 356 | 357 | return topics_list 358 | 359 | def get_info(self): 360 | info = {} 361 | topic_word = self.get_topics() 362 | topic_word_dist = self.get_topic_word_mat() 363 | topic_document_dist = self.get_topic_document_mat() 364 | info['topics'] = topic_word 365 | 366 | info['topic-document-matrix'] = np.asarray(self.get_thetas(self.train_data)).T 367 | 368 | info['topic-word-matrix'] = topic_word_dist 369 | return info 370 | 371 | def _format_file(self): 372 | model_dir = "AVITM_nc_{}_tpm_{}_tpv_{}_hs_{}_ac_{}_do_{}_lr_{}_mo_{}_rp_{}". \ 373 | format(self.num_topics, 0.0, 1 - (1. / self.num_topics), 374 | self.model_type, self.hidden_sizes, self.activation, 375 | self.dropout, self.lr, self.momentum, 376 | self.reduce_on_plateau) 377 | return model_dir 378 | 379 | def save(self, models_dir=None): 380 | """ 381 | Save model. 382 | 383 | :param models_dir: path to directory for saving NN models. 384 | """ 385 | if (self.model is not None) and (models_dir is not None): 386 | 387 | model_dir = self._format_file() 388 | if not os.path.isdir(os.path.join(models_dir, model_dir)): 389 | os.makedirs(os.path.join(models_dir, model_dir)) 390 | 391 | filename = "epoch_{}".format(self.nn_epoch) + '.pth' 392 | fileloc = os.path.join(models_dir, model_dir, filename) 393 | with open(fileloc, 'wb') as file: 394 | torch.save({'state_dict': self.model.state_dict(), 395 | 'dcue_dict': self.__dict__}, file) 396 | 397 | def load(self, model_dir, epoch): 398 | """ 399 | Load a previously trained model. 400 | 401 | :param model_dir: directory where models are saved. 402 | :param epoch: epoch of model to load. 403 | """ 404 | epoch_file = "epoch_" + str(epoch) + ".pth" 405 | model_file = os.path.join(model_dir, epoch_file) 406 | with open(model_file, 'rb') as model_dict: 407 | checkpoint = torch.load(model_dict) 408 | 409 | for (k, v) in checkpoint['dcue_dict'].items(): 410 | setattr(self, k, v) 411 | 412 | self.model.load_state_dict(checkpoint['state_dict']) 413 | 414 | def get_thetas(self, dataset): 415 | """ 416 | Get the document-topic distribution for a dataset of topics. Includes multiple sampling to reduce variation via 417 | the parameter num_samples. 
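        The returned matrix is the element-wise average of num_samples stochastic draws of theta:
        for example, with num_samples=20 each document's topic distribution is the mean of 20 encodings.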
418 | :param dataset: a PyTorch Dataset containing the documents 419 | """ 420 | self.model.eval() 421 | 422 | loader = DataLoader( 423 | dataset, batch_size=self.batch_size, shuffle=False, num_workers=self.num_data_loader_workers) 424 | final_thetas = [] 425 | for sample_index in range(self.num_samples): 426 | with torch.no_grad(): 427 | collect_theta = [] 428 | for batch_samples in loader: 429 | # batch_size x vocab_size 430 | x = batch_samples['X'] 431 | x = x.reshape(x.shape[0], -1) 432 | x_bert = batch_samples['X_bert'] 433 | if self.USE_CUDA: 434 | x = x.cuda() 435 | x_bert = x_bert.cuda() 436 | # forward pass 437 | self.model.zero_grad() 438 | collect_theta.extend(self.model.get_theta(x, x_bert).cpu().numpy().tolist()) 439 | 440 | final_thetas.append(np.array(collect_theta)) 441 | return np.sum(final_thetas, axis=0) / self.num_samples 442 | -------------------------------------------------------------------------------- /models/contextualized_topic_models/networks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DSInCenter/pySTTM/9771af0dfa85a2996fbb90122ae22649cd076a51/models/contextualized_topic_models/networks/__init__.py -------------------------------------------------------------------------------- /models/contextualized_topic_models/networks/decoding_network.py: -------------------------------------------------------------------------------- 1 | """PyTorch class for feed foward AVITM network.""" 2 | 3 | import torch 4 | from torch import nn 5 | from torch.nn import functional as F 6 | import numpy as np 7 | 8 | from models.contextualized_topic_models.networks.inference_network import CombinedInferenceNetwork, ContextualInferenceNetwork 9 | 10 | 11 | class DecoderNetwork(nn.Module): 12 | 13 | """AVITM Network.""" 14 | 15 | def __init__(self, input_size, bert_size, infnet, n_components=10, model_type='prodLDA', 16 | hidden_sizes=(100,100), activation='softplus', dropout=0.2, 17 | learn_priors=True, topic_prior_mean=0.0, topic_prior_variance=None): 18 | """ 19 | Initialize InferenceNetwork. 20 | 21 | Args 22 | input_size : int, dimension of input 23 | n_components : int, number of topic components, (default 10) 24 | model_type : string, 'prodLDA' or 'LDA' (default 'prodLDA') 25 | hidden_sizes : tuple, length = n_layers, (default (100, 100)) 26 | activation : string, 'softplus', 'relu', (default 'softplus') 27 | learn_priors : bool, make priors learnable parameter 28 | topic_prior_mean: double, mean parameter of the prior 29 | topic_prior_variance: double, variance parameter of the prior 30 | """ 31 | super(DecoderNetwork, self).__init__() 32 | assert isinstance(input_size, int), "input_size must by type int." 33 | assert (isinstance(n_components, int) or isinstance(n_components, np.int64)) and n_components > 0, \ 34 | "n_components must be type int > 0." 35 | assert model_type in ['prodLDA', 'LDA'], \ 36 | "model type must be 'prodLDA' or 'LDA'" 37 | assert isinstance(hidden_sizes, tuple), \ 38 | "hidden_sizes must be type tuple." 39 | assert activation in ['softplus', 'relu', 'sigmoid', 'tanh', 'leakyrelu', 40 | 'rrelu', 'elu', 'selu'], \ 41 | "activation must be 'softplus', 'relu', 'sigmoid', 'leakyrelu'," \ 42 | " 'rrelu', 'elu', 'selu' or 'tanh'." 43 | assert dropout >= 0, "dropout must be >= 0." 
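        # When topic_prior_variance is left as None it is set further below to
        # 1. - (1. / n_components), e.g. 0.9 for the default n_components=10.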
44 | assert isinstance(topic_prior_mean, float), \ 45 | "topic_prior_mean must be type float" 46 | # and topic_prior_variance >= 0, \ 47 | #assert isinstance(topic_prior_variance, float), \ 48 | # "topic prior_variance must be type float" 49 | 50 | self.input_size = input_size 51 | self.n_components = n_components 52 | self.model_type = model_type 53 | self.hidden_sizes = hidden_sizes 54 | self.activation = activation 55 | self.dropout = dropout 56 | self.learn_priors = learn_priors 57 | 58 | if infnet == "zeroshot": 59 | self.inf_net = ContextualInferenceNetwork( 60 | input_size, bert_size, n_components, hidden_sizes, activation) 61 | elif infnet == "combined": 62 | self.inf_net = CombinedInferenceNetwork( 63 | input_size, bert_size, n_components, hidden_sizes, activation) 64 | else: 65 | raise Exception('Missing infnet parameter, options are zeroshot and combined') 66 | if torch.cuda.is_available(): 67 | self.inf_net = self.inf_net.cuda() 68 | # init prior parameters 69 | # \mu_1k = log \alpha_k + 1/K \sum_i log \alpha_i; 70 | # \alpha = 1 \forall \alpha 71 | #self.topic_prior_mean = topic_prior_mean 72 | self.prior_mean = torch.tensor( 73 | [topic_prior_mean] * n_components) 74 | if torch.cuda.is_available(): 75 | self.prior_mean = self.prior_mean.cuda() 76 | if self.learn_priors: 77 | self.prior_mean = nn.Parameter(self.prior_mean) 78 | 79 | 80 | # \Sigma_1kk = 1 / \alpha_k (1 - 2/K) + 1/K^2 \sum_i 1 / \alpha_k; 81 | # \alpha = 1 \forall \alpha 82 | if topic_prior_variance is None: 83 | topic_prior_variance = 1. - (1. / self.n_components) 84 | self.prior_variance = torch.tensor( 85 | [topic_prior_variance] * n_components) 86 | if torch.cuda.is_available(): 87 | self.prior_variance = self.prior_variance.cuda() 88 | if self.learn_priors: 89 | self.prior_variance = nn.Parameter(self.prior_variance) 90 | 91 | self.beta = torch.Tensor(n_components, input_size) 92 | if torch.cuda.is_available(): 93 | self.beta = self.beta.cuda() 94 | self.beta = nn.Parameter(self.beta) 95 | nn.init.xavier_uniform_(self.beta) 96 | 97 | self.beta_batchnorm = nn.BatchNorm1d(input_size, affine=False) 98 | 99 | # dropout on theta 100 | self.drop_theta = nn.Dropout(p=self.dropout) 101 | 102 | @staticmethod 103 | def reparameterize(mu, logvar): 104 | """Reparameterize the theta distribution.""" 105 | std = torch.exp(0.5*logvar) 106 | eps = torch.randn_like(std) 107 | return eps.mul(std).add_(mu) 108 | 109 | def forward(self, x, x_bert): 110 | """Forward pass.""" 111 | # batch_size x n_components 112 | posterior_mu, posterior_log_sigma = self.inf_net(x, x_bert) 113 | posterior_sigma = torch.exp(posterior_log_sigma) 114 | 115 | # generate samples from theta 116 | theta = F.softmax(self.reparameterize(posterior_mu, posterior_log_sigma), dim=1) 117 | 118 | topic_doc = theta 119 | theta = self.drop_theta(theta) 120 | 121 | # prodLDA vs LDA 122 | if self.model_type == 'prodLDA': 123 | # in: batch_size x input_size x n_components 124 | word_dist = F.softmax( 125 | self.beta_batchnorm(torch.matmul(theta, self.beta)), dim=1) 126 | topic_word = self.beta 127 | # word_dist: batch_size x input_size 128 | #self.topic_word_matrix = self.beta 129 | elif self.model_type == 'LDA': 130 | # simplex constrain on Beta 131 | beta = F.softmax(self.beta_batchnorm(self.beta), dim=1) 132 | topic_word = beta 133 | word_dist = torch.matmul(theta, beta) 134 | # word_dist: batch_size x input_size 135 | 136 | return self.prior_mean, self.prior_variance, \ 137 | posterior_mu, posterior_sigma, posterior_log_sigma, word_dist, topic_word, topic_doc 
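    # Illustrative smoke test (not part of the original module; the sizes below
    # are assumptions chosen only to show how forward() is wired). On a CUDA
    # machine the two inputs must be moved to the GPU first, since __init__
    # places the inference network and the priors on CUDA when it is available.
    #
    #   net = DecoderNetwork(input_size=2000, bert_size=768, infnet="combined",
    #                        n_components=10, model_type="prodLDA")
    #   x = torch.rand(32, 2000)        # batch of bag-of-words vectors
    #   x_bert = torch.rand(32, 768)    # batch of SBERT embeddings
    #   (prior_mean, prior_variance, posterior_mu, posterior_sigma,
    #    posterior_log_sigma, word_dist, topic_word, topic_doc) = net(x, x_bert)
    #   # word_dist: (32, 2000) with rows summing to 1; topic_doc: (32, 10)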
138 | 139 | def get_theta(self, x, x_bert): 140 | with torch.no_grad(): 141 | # batch_size x n_components 142 | posterior_mu, posterior_log_sigma = self.inf_net(x, x_bert) 143 | posterior_sigma = torch.exp(posterior_log_sigma) 144 | 145 | # generate samples from theta 146 | theta = F.softmax( 147 | self.reparameterize(posterior_mu, posterior_log_sigma), dim=1) 148 | 149 | return theta 150 | -------------------------------------------------------------------------------- /models/contextualized_topic_models/networks/inference_network.py: -------------------------------------------------------------------------------- 1 | """PyTorch class for feed foward inference network.""" 2 | 3 | from collections import OrderedDict 4 | from torch import nn 5 | import torch 6 | import numpy as np 7 | 8 | class ContextualInferenceNetwork(nn.Module): 9 | 10 | """Inference Network.""" 11 | 12 | def __init__(self, input_size, bert_size, output_size, hidden_sizes, 13 | activation='softplus', dropout=0.2): 14 | """ 15 | Initialize InferenceNetwork. 16 | 17 | Args 18 | input_size : int, dimension of input 19 | output_size : int, dimension of output 20 | hidden_sizes : tuple, length = n_layers 21 | activation : string, 'softplus' or 'relu', default 'softplus' 22 | dropout : float, default 0.2, default 0.2 23 | """ 24 | super(ContextualInferenceNetwork, self).__init__() 25 | assert isinstance(input_size, int), "input_size must by type int." 26 | assert isinstance(output_size, int), "output_size must be type int." 27 | assert isinstance(hidden_sizes, tuple), \ 28 | "hidden_sizes must be type tuple." 29 | assert activation in ['softplus', 'relu', 'sigmoid', 'tanh', 'leakyrelu', 30 | 'rrelu', 'elu', 'selu'], \ 31 | "activation must be 'softplus', 'relu', 'sigmoid', 'leakyrelu'," \ 32 | " 'rrelu', 'elu', 'selu' or 'tanh'." 33 | assert dropout >= 0, "dropout must be >= 0." 
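        # hidden_sizes is consumed pairwise below: e.g. hidden_sizes=(100, 100)
        # builds one 100 -> 100 hidden block on top of the adapted SBERT input.
        # Note that forward() of this "zeroshot" network only uses x_bert; the
        # bag-of-words input x is accepted but ignored.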
34 | 35 | self.input_size = input_size 36 | self.output_size = output_size 37 | self.hidden_sizes = hidden_sizes 38 | self.dropout = dropout 39 | 40 | if activation == 'softplus': 41 | self.activation = nn.Softplus() 42 | elif activation == 'relu': 43 | self.activation = nn.ReLU() 44 | elif activation == 'sigmoid': 45 | self.activation = nn.Sigmoid() 46 | elif activation == 'tanh': 47 | self.activation = nn.Tanh() 48 | elif activation == 'leakyrelu': 49 | self.activation = nn.LeakyReLU() 50 | elif activation == 'rrelu': 51 | self.activation = nn.RReLU() 52 | elif activation == 'elu': 53 | self.activation = nn.ELU() 54 | elif activation == 'selu': 55 | self.activation = nn.SELU() 56 | 57 | self.input_layer = nn.Linear(input_size+input_size, hidden_sizes[0]) 58 | self.adapt_bert = nn.Linear(bert_size, hidden_sizes[0]) 59 | 60 | self.hiddens = nn.Sequential(OrderedDict([ 61 | ('l_{}'.format(i), nn.Sequential(nn.Linear(h_in, h_out), self.activation)) 62 | for i, (h_in, h_out) in enumerate(zip(hidden_sizes[:-1], hidden_sizes[1:]))])) 63 | 64 | self.f_mu = nn.Linear(hidden_sizes[-1], output_size) 65 | self.f_mu_batchnorm = nn.BatchNorm1d(output_size, affine=False) 66 | 67 | self.f_sigma = nn.Linear(hidden_sizes[-1], output_size) 68 | self.f_sigma_batchnorm = nn.BatchNorm1d(output_size, affine=False) 69 | 70 | self.dropout_enc = nn.Dropout(p=self.dropout) 71 | 72 | def forward(self, x, x_bert): 73 | """Forward pass.""" 74 | x_bert = self.adapt_bert(x_bert) 75 | 76 | x = self.activation(x_bert) 77 | x = self.hiddens(x) 78 | x = self.dropout_enc(x) 79 | mu = self.f_mu_batchnorm(self.f_mu(x)) 80 | log_sigma = self.f_sigma_batchnorm(self.f_sigma(x)) 81 | 82 | return mu, log_sigma 83 | 84 | 85 | class CombinedInferenceNetwork(nn.Module): 86 | 87 | """Inference Network.""" 88 | 89 | def __init__(self, input_size, bert_size, output_size, hidden_sizes, 90 | activation='softplus', dropout=0.2): 91 | """ 92 | Initialize InferenceNetwork. 93 | 94 | Args 95 | input_size : int, dimension of input 96 | output_size : int, dimension of output 97 | hidden_sizes : tuple, length = n_layers 98 | activation : string, 'softplus' or 'relu', default 'softplus' 99 | dropout : float, default 0.2, default 0.2 100 | """ 101 | super(CombinedInferenceNetwork, self).__init__() 102 | assert isinstance(input_size, int), "input_size must by type int." 103 | assert (isinstance(output_size, int) or isinstance(output_size, np.int64)), "output_size must be type int." 104 | assert isinstance(hidden_sizes, tuple), \ 105 | "hidden_sizes must be type tuple." 106 | assert activation in ['softplus', 'relu', 'sigmoid', 'tanh', 'leakyrelu', 107 | 'rrelu', 'elu', 'selu'], \ 108 | "activation must be 'softplus', 'relu', 'sigmoid', 'leakyrelu'," \ 109 | " 'rrelu', 'elu', 'selu' or 'tanh'." 110 | 111 | assert dropout >= 0, "dropout must be >= 0." 
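        # Shape sketch (illustrative numbers, not from the original code): with
        # input_size=2000, bert_size=768, output_size=10 and hidden_sizes=(100, 100),
        # forward() projects x_bert to input_size, concatenates it with x, and maps
        # (batch, 4000) -> (batch, 100) -> (batch, 100) -> mu, log_sigma of shape (batch, 10).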
112 | 113 | self.input_size = input_size 114 | self.output_size = output_size 115 | self.hidden_sizes = hidden_sizes 116 | self.dropout = dropout 117 | 118 | if activation == 'softplus': 119 | self.activation = nn.Softplus() 120 | elif activation == 'relu': 121 | self.activation = nn.ReLU() 122 | elif activation == 'sigmoid': 123 | self.activation = nn.Sigmoid() 124 | elif activation == 'tanh': 125 | self.activation = nn.Tanh() 126 | elif activation == 'leakyrelu': 127 | self.activation = nn.LeakyReLU() 128 | elif activation == 'rrelu': 129 | self.activation = nn.RReLU() 130 | elif activation == 'elu': 131 | self.activation = nn.ELU() 132 | elif activation == 'selu': 133 | self.activation = nn.SELU() 134 | 135 | self.input_layer = nn.Linear(input_size+input_size, hidden_sizes[0]) 136 | self.adapt_bert = nn.Linear(bert_size, input_size) 137 | self.bert_layer = nn.Linear(hidden_sizes[0], hidden_sizes[0]) 138 | 139 | self.hiddens = nn.Sequential(OrderedDict([ 140 | ('l_{}'.format(i), nn.Sequential(nn.Linear(h_in, h_out), self.activation)) 141 | for i, (h_in, h_out) in enumerate(zip(hidden_sizes[:-1], hidden_sizes[1:]))])) 142 | 143 | self.f_mu = nn.Linear(hidden_sizes[-1], output_size) 144 | self.f_mu_batchnorm = nn.BatchNorm1d(output_size, affine=False) 145 | 146 | self.f_sigma = nn.Linear(hidden_sizes[-1], output_size) 147 | self.f_sigma_batchnorm = nn.BatchNorm1d(output_size, affine=False) 148 | 149 | self.dropout_enc = nn.Dropout(p=self.dropout) 150 | 151 | def forward(self, x, x_bert): 152 | """Forward pass.""" 153 | x_bert = self.adapt_bert(x_bert) 154 | x = torch.cat((x, x_bert), 1) 155 | x = self.input_layer(x) 156 | 157 | x = self.activation(x) 158 | x = self.hiddens(x) 159 | x = self.dropout_enc(x) 160 | mu = self.f_mu_batchnorm(self.f_mu(x)) 161 | log_sigma = self.f_sigma_batchnorm(self.f_sigma(x)) 162 | 163 | return mu, log_sigma 164 | -------------------------------------------------------------------------------- /models/contextualized_topic_models/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DSInCenter/pySTTM/9771af0dfa85a2996fbb90122ae22649cd076a51/models/contextualized_topic_models/utils/__init__.py -------------------------------------------------------------------------------- /models/contextualized_topic_models/utils/data_preparation.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sentence_transformers import SentenceTransformer 3 | import scipy.sparse 4 | import warnings 5 | from models.contextualized_topic_models.datasets.dataset import CTMDataset 6 | import os 7 | import pickle as pkl 8 | 9 | def get_bag_of_words(data, min_length): 10 | """ 11 | Creates the bag of words 12 | """ 13 | vect = [np.bincount(x[x != np.array(None)].astype('int'), minlength=min_length) 14 | for x in data if np.sum(x[x != np.array(None)]) != 0] 15 | 16 | vect = scipy.sparse.csr_matrix(vect) 17 | return vect 18 | 19 | def bert_embeddings_from_file(text_file, sbert_model_to_load, batch_size=200): 20 | """ 21 | Creates SBERT Embeddings from an input file 22 | """ 23 | model = SentenceTransformer(sbert_model_to_load) 24 | with open(text_file, encoding="utf-8") as filino: 25 | train_text = list(map(lambda x: x, filino.readlines())) 26 | 27 | return np.array(model.encode(train_text, show_progress_bar=True, batch_size=batch_size)) 28 | 29 | 30 | def bert_embeddings_from_list(texts, sbert_model_to_load="bert-base-nli-mean-tokens", batch_size=100): 
31 | """ 32 | Creates SBERT Embeddings from a list 33 | """ 34 | model = SentenceTransformer(sbert_model_to_load) 35 | return np.array(model.encode(texts, show_progress_bar=True, batch_size=batch_size)) 36 | 37 | 38 | class QuickText: 39 | """ 40 | Integrated class to handle all the text preprocessing needed 41 | """ 42 | def __init__(self, bert_model, text_for_bow, text_for_bert=None, bert_path=None): 43 | """ 44 | :param bert_model: string, bert model to use 45 | :param text_for_bert: list, list of sentences with the unpreprocessed text 46 | :param text_for_bow: list, list of sentences with the preprocessed text 47 | """ 48 | self.vocab_dict = {} 49 | self.vocab = [] 50 | self.index_dd = None 51 | self.idx2token = None 52 | self.bow = None 53 | self.bert_model = bert_model 54 | self.text_handler = "" 55 | self.data_bert = None 56 | self.text_for_bow = text_for_bow 57 | 58 | if text_for_bert is not None: 59 | self.text_for_bert = text_for_bert 60 | else: 61 | self.text_for_bert = None 62 | self.bert_path = bert_path 63 | 64 | def prepare_bow(self): 65 | indptr = [0] 66 | indices = [] 67 | data = [] 68 | vocabulary = {} 69 | 70 | if self.text_for_bow is not None: 71 | docs = self.text_for_bow 72 | else: 73 | docs = self.text_for_bert 74 | 75 | for d in docs: 76 | for term in d.split(): 77 | index = vocabulary.setdefault(term, len(vocabulary)) 78 | indices.append(index) 79 | data.append(1) 80 | indptr.append(len(indices)) 81 | 82 | self.vocab_dict = vocabulary 83 | self.vocab = list(vocabulary.keys()) 84 | 85 | warnings.simplefilter('always', DeprecationWarning) 86 | if len(self.vocab) > 2000: 87 | warnings.warn("The vocab you are using has more than 2000 words, reconstructing high-dimensional vectors requires" 88 | "significantly more training epochs and training samples. " 89 | "Consider reducing the number of vocabulary items. " 90 | "See https://github.com/MilaNLProc/contextualized-topic-models#preprocessing " 91 | "and https://github.com/MilaNLProc/contextualized-topic-models#tldr", Warning) 92 | 93 | self.idx2token = {v: k for (k, v) in self.vocab_dict.items()} 94 | self.bow = scipy.sparse.csr_matrix((data, indices, indptr), dtype=int) 95 | 96 | def load_contextualized_embeddings(self, embeddings): 97 | self.data_bert = embeddings 98 | 99 | def load_dataset(self): 100 | self.prepare_bow() 101 | if self.bert_path is not None: 102 | if os.path.exists(self.bert_path): 103 | self.data_bert = pkl.load(open(self.bert_path, 'r')) 104 | else: 105 | if self.data_bert is None: 106 | if self.text_for_bert is not None: 107 | self.data_bert = bert_embeddings_from_list(self.text_for_bert, self.bert_model) 108 | else: 109 | self.data_bert = bert_embeddings_from_list(self.text_for_bow, self.bert_model) 110 | pkl.dump(self.data_bert, open(self.bert_path, 'w')) 111 | 112 | training_dataset = CTMDataset(self.bow, self.data_bert, self.idx2token) 113 | return training_dataset 114 | 115 | class TextHandler: 116 | """ 117 | Class used to handle the text preparation and the BagOfWord 118 | """ 119 | def __init__(self, file_name=None, sentences=None): 120 | self.file_name = file_name 121 | self.sentences = sentences 122 | self.vocab_dict = {} 123 | self.vocab = [] 124 | self.index_dd = None 125 | self.idx2token = None 126 | self.bow = None 127 | 128 | warnings.simplefilter('always', DeprecationWarning) 129 | if len(self.vocab) > 2000: 130 | warnings.warn("TextHandler class is deprecated and will be removed in version 2.0. 
Use QuickText.", Warning) 131 | 132 | def prepare(self): 133 | indptr = [0] 134 | indices = [] 135 | data = [] 136 | vocabulary = {} 137 | 138 | if self.sentences is None and self.file_name is None: 139 | raise Exception("Sentences and file_names cannot both be none") 140 | 141 | if self.sentences is not None: 142 | docs = self.sentences 143 | elif self.file_name is not None: 144 | with open(self.file_name, encoding="utf-8") as filino: 145 | docs = filino.readlines() 146 | else: 147 | raise Exception("One parameter between sentences and file_name should be selected") 148 | 149 | for d in docs: 150 | for term in d.split(): 151 | index = vocabulary.setdefault(term, len(vocabulary)) 152 | indices.append(index) 153 | data.append(1) 154 | indptr.append(len(indices)) 155 | 156 | self.vocab_dict = vocabulary 157 | self.vocab = list(vocabulary.keys()) 158 | 159 | warnings.simplefilter('always', DeprecationWarning) 160 | if len(self.vocab) > 2000: 161 | warnings.warn("The vocab you are using has more than 2000 words, reconstructing high-dimensional vectors requires" 162 | "significantly more training epochs and training samples. " 163 | "Consider reducing the number of vocabulary items. " 164 | "See https://github.com/MilaNLProc/contextualized-topic-models#preprocessing " 165 | "and https://github.com/MilaNLProc/contextualized-topic-models#tldr", Warning) 166 | 167 | self.idx2token = {v: k for (k, v) in self.vocab_dict.items()} 168 | self.bow = scipy.sparse.csr_matrix((data, indices, indptr), dtype=int) 169 | -------------------------------------------------------------------------------- /models/contextualized_topic_models/utils/preprocessing.py: -------------------------------------------------------------------------------- 1 | from sklearn.feature_extraction.text import CountVectorizer 2 | import string 3 | from nltk.corpus import stopwords as stop_words 4 | import warnings 5 | 6 | class WhiteSpacePreprocessing(): 7 | """ 8 | Provides a very simple preprocessing script that filters infrequent tokens from text 9 | """ 10 | def __init__(self, documents, stopwords_language="english", vocabulary_size=2000): 11 | """ 12 | 13 | :param documents: list of strings 14 | :param stopwords_language: string of the language of the stopwords (see nltk stopwords) 15 | :param vocabulary_size: the number of most frequent words to include in the documents. Infrequent words will be discarded from the list of preprocessed documents 16 | """ 17 | self.documents = documents 18 | self.stopwords = set(stop_words.words(stopwords_language)) 19 | self.vocabulary_size = vocabulary_size 20 | 21 | def preprocess(self): 22 | """ 23 | Note that if after filtering some documents do not contain words we remove them. That is why we return also the 24 | list of unpreprocessed documents. 
25 | 26 | :return: preprocessed documents, unpreprocessed documents and the vocabulary list 27 | """ 28 | preprocessed_docs_tmp = self.documents 29 | preprocessed_docs_tmp = [doc.lower() for doc in preprocessed_docs_tmp] 30 | preprocessed_docs_tmp = [doc.translate( 31 | str.maketrans(string.punctuation, ' ' * len(string.punctuation))) for doc in preprocessed_docs_tmp] 32 | preprocessed_docs_tmp = [' '.join([w for w in doc.split() if len(w) > 0 and w not in self.stopwords]) 33 | for doc in preprocessed_docs_tmp] 34 | 35 | vectorizer = CountVectorizer(max_features=self.vocabulary_size, token_pattern=r'\b[a-zA-Z]{2,}\b') 36 | vectorizer.fit_transform(preprocessed_docs_tmp) 37 | vocabulary = set(vectorizer.get_feature_names()) 38 | preprocessed_docs_tmp = [' '.join([w for w in doc.split() if w in vocabulary]) 39 | for doc in preprocessed_docs_tmp] 40 | 41 | preprocessed_docs, unpreprocessed_docs = [], [] 42 | for i, doc in enumerate(preprocessed_docs_tmp): 43 | if len(doc) > 0: 44 | preprocessed_docs.append(doc) 45 | unpreprocessed_docs.append(self.documents[i]) 46 | 47 | return preprocessed_docs, unpreprocessed_docs, list(vocabulary) 48 | 49 | 50 | class SimplePreprocessing(WhiteSpacePreprocessing): 51 | def __init__(self, documents, stopwords_language="english"): 52 | super().__init__(documents, stopwords_language) 53 | warnings.simplefilter('always', DeprecationWarning) 54 | 55 | if self.__class__.__name__ == "CTM": 56 | 57 | warnings.warn("SimplePrepocessing is deprecated and will be removed in version 2.0, " 58 | "use WhiteSpacePreprocessing", DeprecationWarning) 59 | 60 | 61 | -------------------------------------------------------------------------------- /models/model.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | import os 3 | import numpy as np 4 | import json 5 | 6 | 7 | class AbstractModel(ABC): 8 | """ 9 | Class structure of a generic Topic Modeling implementation 10 | """ 11 | 12 | def __init__(self): 13 | """ 14 | Create a blank model to initialize 15 | """ 16 | self.hyperparameters = dict() 17 | 18 | def set_hyperparameters(self, **kwargs): 19 | """ 20 | Set model hyperparameters 21 | :param **kwargs: a dictionary of in the form {hyperparameter name: value} 22 | """ 23 | for key, value in kwargs.items(): 24 | self.hyperparameters[key] = value 25 | 26 | @abstractmethod 27 | def train_model(self, dataset, hyperparameters, top_words=10): 28 | """ 29 | Train the model. 30 | :param dataset: Dataset 31 | :param hyperparameters: dictionary in the form {hyperparameter name: value} 32 | :param top_words: number of top significant words for each topic (default: 10) 33 | :return model_output: a dictionary containing up to 4 keys: *topics*, *topic-word-matrix*, 34 | *topic-document-matrix*, *test-topic-document-matrix*. *topics* is the list of the most significant words for 35 | each topic (list of lists of strings). *topic-word-matrix* is the matrix (num topics x ||vocabulary||) 36 | containing the probabilities of a word in a given topic. *topic-document-matrix* is the matrix (||topics|| x 37 | ||training documents||) containing the probabilities of the topics in a given training document. 38 | *test-topic-document-matrix* is the matrix (||topics|| x ||testing documents||) containing the probabilities 39 | of the topics in a given testing document. 
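        A minimal illustrative value of model_output for a two-topic model (shapes are
        examples only): {"topics": [["word1", "word2"], ["word3", "word4"]],
        "topic-word-matrix": array of shape (2, ||vocabulary||),
        "topic-document-matrix": array of shape (2, ||training documents||)}.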
40 | """ 41 | pass 42 | 43 | 44 | def save_model_output(model_output, path=os.curdir, appr_order=7): 45 | """ 46 | Saves the model output in the chosen directory 47 | :param model_output: output of the model 48 | :param path: path in which the file will be saved and name of the file 49 | :param appr_order: approximation order (used to round model_output values) 50 | """ 51 | 52 | to_save = {} 53 | try: 54 | for single_output in model_output.keys(): 55 | if single_output != "topics" and single_output != "test-topics": 56 | to_save[single_output] = ( 57 | model_output[single_output].round(appr_order)) 58 | else: 59 | to_save[single_output] = (model_output[single_output]) 60 | np.savez_compressed(path, **to_save) 61 | except: 62 | raise Exception("error in saving the output model file") 63 | 64 | 65 | def load_model_output(output_path, vocabulary_path=None, top_words=10): 66 | """ 67 | Loads a model output from the choosen directory 68 | Parameters 69 | ---------- 70 | :param output_path: path in which th model output is saved 71 | :param vocabulary_path: path in which the vocabulary is saved (optional, used to retrieve the top k words of each 72 | topic) 73 | :param top_words: top k words to retrieve for each topic (in case a vocabulary path is given) 74 | """ 75 | output = dict(np.load(output_path, allow_pickle=True)) 76 | if vocabulary_path is not None: 77 | vocabulary_file = open(vocabulary_path, 'r') 78 | vocabulary = json.load(vocabulary_file) 79 | index2vocab = vocabulary 80 | 81 | topics_output = [] 82 | for topic in output["topic-word-matrix"]: 83 | top_k = np.argsort(topic)[-top_words:] 84 | top_k_words = list( 85 | reversed([[index2vocab[str(i)], float(topic[i])] for i in top_k])) 86 | topics_output.append(top_k_words) 87 | 88 | output["topic-word-matrix"] = output["topic-word-matrix"].tolist() 89 | output["topic-document-matrix"] = output["topic-document-matrix"].tolist() 90 | if "test-topic-word-matrix" in output: 91 | output["test-topic-word-matrix"] = output["test-topic-word-matrix"].tolist() 92 | if "test-topic-document-matrix" in output: 93 | output["test-topic-document-matrix"] = output["test-topic-document-matrix"].tolist() 94 | 95 | output["topics"] = topics_output 96 | return output -------------------------------------------------------------------------------- /models/pytorchtools.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | class EarlyStopping: 5 | """Early stops the training if validation loss doesn't improve after a given patience.""" 6 | def __init__(self, patience=7, verbose=False, delta=0, path='checkpoint.pt', trace_func=print): 7 | """ 8 | Args: 9 | patience (int): How long to wait after last time validation loss improved. 10 | Default: 7 11 | verbose (bool): If True, prints a message for each validation loss improvement. 12 | Default: False 13 | delta (float): Minimum change in the monitored quantity to qualify as an improvement. 14 | Default: 0 15 | path (str): Path for the checkpoint to be saved to. 16 | Default: 'checkpoint.pt' 17 | trace_func (function): trace print function. 
18 | Default: print 19 | """ 20 | self.patience = patience 21 | self.verbose = verbose 22 | self.counter = 0 23 | self.best_score = None 24 | self.early_stop = False 25 | self.val_loss_min = np.Inf 26 | self.delta = delta 27 | self.path = path 28 | self.trace_func = trace_func 29 | 30 | def __call__(self, val_loss, model): 31 | 32 | score = -val_loss 33 | 34 | if self.best_score is None: 35 | self.best_score = score 36 | self.save_checkpoint(val_loss, model) 37 | elif score < self.best_score + self.delta: 38 | self.counter += 1 39 | if self.verbose: 40 | self.trace_func(f'EarlyStopping counter: {self.counter} out of {self.patience}') 41 | if self.counter >= self.patience: 42 | self.early_stop = True 43 | else: 44 | self.best_score = score 45 | self.save_checkpoint(val_loss, model) 46 | self.counter = 0 47 | 48 | def save_checkpoint(self, val_loss, model): 49 | '''Saves model when validation loss decrease.''' 50 | if self.verbose: 51 | self.trace_func(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}). Saving model ...') 52 | torch.save(model.state_dict(), self.path) 53 | self.val_loss_min = val_loss -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | snscrape 2 | hazm 3 | bitermplus==0.6.12 4 | Cython==0.29.30 5 | joblib==1.1.0 6 | numpy==1.22.4 7 | pandas==1.4.2 8 | python-dateutil==2.8.2 9 | pytz==2022.1 10 | scikit-learn==1.1.1 11 | scipy==1.8.1 12 | six==1.16.0 13 | threadpoolctl==3.1.0 14 | tqdm==4.64.0 15 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.md -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from distutils.core import setup 2 | 3 | setup( 4 | name = 'my_python_package', 5 | packages = ['my_python_package'], 6 | version = 'version number', # Ideally should be same as your GitHub release tag varsion 7 | description = 'description', 8 | author = '', 9 | author_email = '', 10 | url = 'github package source url', 11 | download_url = 'download link you saved', 12 | keywords = ['tag1', 'tag2'], 13 | classifiers = [], 14 | ) -------------------------------------------------------------------------------- /stop_words/stop_words.txt: -------------------------------------------------------------------------------- 1 | ! 2 | " 3 | # 4 | ( 5 | ) 6 | * 7 | , 8 | - 9 | . 
10 | / 11 | : 12 | [ 13 | ] 14 | « 15 | » 16 | ، 17 | ؛ 18 | ؟ 19 | آباد 20 | آخ 21 | آخر 22 | آخرها 23 | آخه 24 | آدمهاست 25 | آرام 26 | آرام آرام 27 | آره 28 | آری 29 | آزادانه 30 | آسان 31 | آسیب پذیرند 32 | آشنایند 33 | آشکارا 34 | آقا 35 | آقای 36 | آقایان 37 | آمد 38 | آمدن 39 | آمده 40 | آمرانه 41 | آن 42 | آن گاه 43 | آنان 44 | آنانی 45 | آنجا 46 | آنرا 47 | آنطور 48 | آنقدر 49 | آنها 50 | آنهاست 51 | آنچنان 52 | آنچنان که 53 | اونجور 54 | اونجوری 55 | اونجوری که 56 | آنچه 57 | آنکه 58 | آنگاه 59 | آن‌ها 60 | آهان 61 | آهای 62 | آور 63 | آورد 64 | آوردن 65 | آورده 66 | آوه 67 | آی 68 | آیا 69 | آید 70 | آیند 71 | ا 72 | اتفاقا 73 | اثرِ 74 | اجراست 75 | احتراما 76 | احتمالا 77 | احیاناً 78 | اخیر 79 | اخیراً 80 | اری 81 | از 82 | از آن پس 83 | از بس که 84 | از جمله 85 | ازاین رو 86 | ازجمله 87 | ازش 88 | اساسا 89 | اساساً 90 | است 91 | استفاد 92 | استفاده 93 | اسلامی اند 94 | اش 95 | اشتباها 96 | اشکارا 97 | اصلا 98 | اصلاً 99 | اصولا 100 | اصولاً 101 | اعلام 102 | اغلب 103 | افزود 104 | افسوس 105 | اقل 106 | اقلیت 107 | الا 108 | الان 109 | البته 110 | البتّه 111 | الهی 112 | الی 113 | ام 114 | اما 115 | امروز 116 | امروزه 117 | امسال 118 | امشب 119 | امور 120 | امیدوارم 121 | امیدوارند 122 | امیدواریم 123 | ان 124 | ان شاأالله 125 | انشالا 126 | انتها 127 | انجام 128 | اند 129 | اندکی 130 | انشاالله 131 | انصافا 132 | انطور 133 | انقدر 134 | انها 135 | انچنان 136 | انکه 137 | انگار 138 | او 139 | اوست 140 | اول 141 | اولا 142 | اولاً 143 | اولین 144 | اون 145 | اکثر 146 | اکثرا 147 | اکثراً 148 | اکثریت 149 | اکنون 150 | اگر 151 | اگر چه 152 | اگرچه 153 | اگه 154 | ای 155 | ایا 156 | اید 157 | ایشان 158 | ایم 159 | این 160 | این جوری 161 | این قدر 162 | این گونه 163 | اینان 164 | اینجا 165 | اینجاست 166 | ایند 167 | اینطور 168 | اینقدر 169 | اینها 170 | اینهاست 171 | اینو 172 | اینچنین 173 | اینک 174 | اینکه 175 | اینگونه 176 | ب 177 | با 178 | بااین حال 179 | بااین وجود 180 | باد 181 | بار 182 | بارة 183 | باره 184 | بارها 185 | باز 186 | باز هم 187 | بازهم 188 | بازی کنان 189 | بازیگوشانه 190 | باش 191 | باشد 192 | باشم 193 | باشند 194 | باشی 195 | باشید 196 | باشیم 197 | بالا 198 | بالاخره 199 | بالاخص 200 | بالاست 201 | بالای 202 | بالایِ 203 | بالطبع 204 | بالعکس 205 | باوجودی که 206 | باورند 207 | باید 208 | بتدریج 209 | بتوان 210 | بتواند 211 | بتوانی 212 | بتوانیم 213 | بجز 214 | بخش 215 | بخشه 216 | بخشی 217 | بخصوص 218 | بخواه 219 | بخواهد 220 | بخواهم 221 | بخواهند 222 | بخواهی 223 | بخواهید 224 | بخواهیم 225 | بخوبی 226 | بد 227 | بدان 228 | بدانجا 229 | بدانها 230 | بدهید 231 | بدون 232 | بدین 233 | بدین ترتیب 234 | بدینجا 235 | بر 236 | برآنند 237 | برا 238 | برابر 239 | برابرِ 240 | براحتی 241 | براساس 242 | براستی 243 | برای 244 | برایت 245 | برایش 246 | برایشان 247 | برایم 248 | برایمان 249 | برایِ 250 | برخوردار 251 | برخوردارند 252 | برخی 253 | برداری 254 | برعکس 255 | برنامه سازهاست 256 | بروز 257 | بروشنی 258 | بزرگ 259 | بزودی 260 | بس 261 | بسا 262 | بسادگی 263 | بسختی 264 | بسوی 265 | بسی 266 | بسیار 267 | بسیاری 268 | بشدت 269 | بطور 270 | بطوری که 271 | بعد 272 | بعد از این که 273 | بعدا 274 | بعدازظهر 275 | بعداً 276 | بعدها 277 | بعری 278 | بعضا 279 | بعضی 280 | بعضی شان 281 | بعضیهایشان 282 | بعضی‌ها 283 | بعلاوه 284 | بعید 285 | بفهمی نفهمی 286 | بلافاصله 287 | بله 288 | بلکه 289 | بلی 290 | بماند 291 | بنابراین 292 | بندی 293 | به 294 | به آسانی 295 | به تازگی 296 | به تدریج 297 | به تمامی 298 | به جای 299 | به جز 300 | به خوبی 301 | به درشتی 302 | به دلخواه 303 | به راستی 304 | به رغم 305 | به روشنی 306 | به زودی 307 | به سادگی 308 | به سرعت 309 | 
به شان 310 | به شدت 311 | به طور کلی 312 | به طوری که 313 | به علاوه 314 | به قدری 315 | به مراتب 316 | به ناچار 317 | به هرحال 318 | به هیچ وجه 319 | به وضوح 320 | به ویژه 321 | به کرات 322 | به گرمی 323 | بهت 324 | بهتر 325 | بهترین 326 | بهش 327 | بود 328 | بودم 329 | بودن 330 | بودند 331 | بوده 332 | بودی 333 | بودید 334 | بودیم 335 | بویژه 336 | بپا 337 | بکار 338 | بکن 339 | بکند 340 | بکنم 341 | بکنند 342 | بکنی 343 | بکنید 344 | بکنیم 345 | بگذاریم 346 | بگو 347 | بگوید 348 | بگویم 349 | بگویند 350 | بگویی 351 | بگویید 352 | بگوییم 353 | بگیر 354 | بگیرد 355 | بگیرم 356 | بگیرند 357 | بگیری 358 | بگیرید 359 | بگیریم 360 | بی 361 | بی آنکه 362 | بی اطلاعند 363 | بی تردید 364 | بی تفاوتند 365 | بی نیازمندانه 366 | بی هدف 367 | بیا 368 | بیاب 369 | بیابد 370 | بیابم 371 | بیابند 372 | بیابی 373 | بیابید 374 | بیابیم 375 | بیاور 376 | بیاورد 377 | بیاورم 378 | بیاورند 379 | بیاوری 380 | بیاورید 381 | بیاوریم 382 | بیاید 383 | بیایم 384 | بیایند 385 | بیایی 386 | بیایید 387 | بیاییم 388 | بیرون 389 | بیرونِ 390 | بیست 391 | بیش 392 | بیشتر 393 | بیشتری 394 | بین 395 | بیگمان 396 | ت 397 | تا 398 | تازه 399 | تان 400 | تاکنون 401 | تحت 402 | تحریم هاست 403 | تر 404 | تر براساس 405 | تریلیارد 406 | تریلیون 407 | ترین 408 | تصریحاً 409 | تعدادی 410 | تعمدا 411 | تقریبا 412 | تقریباً 413 | تلویحا 414 | تلویحاً 415 | تمام 416 | تمام قد 417 | تماما 418 | تمامشان 419 | تمامی 420 | تند تند 421 | تنها 422 | تو 423 | توؤماً 424 | توان 425 | تواند 426 | توانست 427 | توانستم 428 | توانستن 429 | توانستند 430 | توانسته 431 | توانستی 432 | توانستیم 433 | توانم 434 | توانند 435 | توانی 436 | توانید 437 | توانیم 438 | توسط 439 | تولِ 440 | توی 441 | تویِ 442 | تک تک 443 | ث 444 | ثالثاً 445 | ثانیا 446 | ثانیاً 447 | ج 448 | جا 449 | جای 450 | جایی 451 | جدا 452 | جداً 453 | جداگانه 454 | جدید 455 | جدیدا 456 | جرمزاست 457 | جریان 458 | جز 459 | جلو 460 | جلوگیری 461 | جلوی 462 | جلویِ 463 | جمع اند 464 | جمعا 465 | جمعی 466 | جنابعالی 467 | جناح 468 | جنس اند 469 | جهت 470 | جور 471 | ح 472 | حاشیه‌ای 473 | حاضر 474 | حاضرم 475 | حال 476 | حالا 477 | حاکیست 478 | حتما 479 | حتماً 480 | حتی 481 | حداقل 482 | حداکثر 483 | حدود 484 | حدودا 485 | حدودِ 486 | حسابگرانه 487 | حضرتعالی 488 | حق 489 | حقیرانه 490 | حقیقتا 491 | حول 492 | حکماً 493 | خ 494 | خارجِ 495 | خالصانه 496 | خب 497 | خداحافظ 498 | خداست 499 | خدمات 500 | خسته‌ای 501 | خصوصا 502 | خصوصاً 503 | خلاصه 504 | خواست 505 | خواستم 506 | خواستن 507 | خواستند 508 | خواسته 509 | خواستی 510 | خواستید 511 | خواستیم 512 | خواه 513 | خواهد 514 | خواهم 515 | خواهند 516 | خواهی 517 | خواهید 518 | خواهیم 519 | خوب 520 | خود 521 | خود به خود 522 | خودبه خودی 523 | خودت 524 | خودتان 525 | خودتو 526 | خودش 527 | خودشان 528 | خودم 529 | خودمان 530 | خودمو 531 | خوش 532 | خوشبختانه 533 | خویش 534 | خویشتن 535 | خویشتنم 536 | خیاه 537 | خیر 538 | خیره 539 | خیلی 540 | د 541 | دا 542 | داام 543 | دااما 544 | داخل 545 | داد 546 | دادم 547 | دادن 548 | دادند 549 | داده 550 | دادی 551 | دادید 552 | دادیم 553 | دار 554 | داراست 555 | دارد 556 | دارم 557 | دارند 558 | داری 559 | دارید 560 | داریم 561 | داشت 562 | داشتم 563 | داشتن 564 | داشتند 565 | داشته 566 | داشتی 567 | داشتید 568 | داشتیم 569 | دامم 570 | دانست 571 | دانند 572 | دایم 573 | دایما 574 | در 575 | در باره 576 | در بارهٌ 577 | در ثانی 578 | در مجموع 579 | در نهایت 580 | در واقع 581 | در کل 582 | در کنار 583 | دراین میان 584 | درباره 585 | درحالی که 586 | درحالیکه 587 | درست 588 | درست و حسابی 589 | درسته 590 | درصورتی که 591 | درعین حال 592 | درمجموع 593 | درواقع 594 | درون 595 | دریغ 596 | دریغا 
597 | درین 598 | دسته دسته 599 | دشمنیم 600 | دقیقا 601 | دم 602 | دنبالِ 603 | ده 604 | دهد 605 | دهم 606 | دهند 607 | دهی 608 | دهید 609 | دهیم 610 | دو 611 | دو روزه 612 | دوباره 613 | دوم 614 | دیده 615 | دیر 616 | دیرت 617 | دیرم 618 | دیروز 619 | دیشب 620 | دیوانه‌ای 621 | دیوی 622 | دیگر 623 | دیگران 624 | دیگری 625 | دیگه 626 | ذ 627 | ذاتاً 628 | ر 629 | را 630 | راجع به 631 | راحت 632 | راسا 633 | راست 634 | راستی 635 | راه 636 | رسما 637 | رسید 638 | رسیده 639 | رشته 640 | رفت 641 | رفتارهاست 642 | رفته 643 | رنجند 644 | رهگشاست 645 | رو 646 | رواست 647 | روب 648 | روبروست 649 | روز 650 | روز به روز 651 | روزانه 652 | روزه ایم 653 | روزه ست 654 | روزه م 655 | روزهای 656 | روزه‌ای 657 | روش 658 | روی 659 | رویش 660 | رویِ 661 | ریزی 662 | ز 663 | زشتکارانند 664 | زمان 665 | زمانی 666 | زمینه 667 | زنند 668 | زهی 669 | زود 670 | زودتر 671 | زیاد 672 | زیاده 673 | زیر 674 | زیرا 675 | زیرِ 676 | زیرچشمی 677 | س 678 | سابق 679 | ساخته 680 | ساده اند 681 | سازی 682 | سالانه 683 | سالته 684 | سالم‌تر 685 | سالهاست 686 | سالیانه 687 | ساکنند 688 | سایر 689 | سخت 690 | سخته 691 | سر 692 | سراسر 693 | سرانجام 694 | سراپا 695 | سری 696 | سریع 697 | سریعا 698 | سریعاً 699 | سریِ 700 | سعی 701 | سمتِ 702 | سه باره 703 | سهواً 704 | سوم 705 | سوی 706 | سویِ 707 | سپس 708 | سیاه چاله هاست 709 | سیخ 710 | ش 711 | شان 712 | شاهدند 713 | شاهدیم 714 | شاید 715 | شبهاست 716 | شخصا 717 | شخصاً 718 | شد 719 | شدم 720 | شدن 721 | شدند 722 | شده 723 | شدی 724 | شدید 725 | شدیدا 726 | شدیداً 727 | شدیم 728 | شش 729 | شش نداشته 730 | شما 731 | شماری 732 | شماست 733 | شمایند 734 | شناسی 735 | شو 736 | شود 737 | شوراست 738 | شوقم 739 | شوم 740 | شوند 741 | شونده 742 | شوی 743 | شوید 744 | شویم 745 | شیرین 746 | شیرینه 747 | شیک 748 | ص 749 | صد 750 | صددرصد 751 | صرفا 752 | صرفاً 753 | صریحاً 754 | صندوق هاست 755 | صورت 756 | ض 757 | ضدِّ 758 | ضدِّ 759 | ضمن 760 | ضمناً 761 | ط 762 | طبعا 763 | طبعاً 764 | طبقِ 765 | طبیعتا 766 | طرف 767 | طریق 768 | طلبکارانه 769 | طور 770 | طی 771 | ظ 772 | ظاهرا 773 | ظاهراً 774 | ع 775 | عاجزانه 776 | عاقبت 777 | عبارتند 778 | عجب 779 | عجولانه 780 | عدم 781 | عرفانی 782 | عقب 783 | عقبِ 784 | علاوه بر 785 | علاوه بر آن 786 | علاوه برآن 787 | علناً 788 | علّتِ 789 | علی الظاهر 790 | علی رغم 791 | علیرغم 792 | علیه 793 | عمدا 794 | عمداً 795 | عمدتا 796 | عمدتاً 797 | عمده 798 | عمل 799 | عملا 800 | عملاً 801 | عملی اند 802 | عموم 803 | عموما 804 | عموماً 805 | عنقریب 806 | عنوان 807 | عنوانِ 808 | عیناً 809 | غ 810 | غالبا 811 | غزالان 812 | غیر 813 | غیرقانونی 814 | ف 815 | فاقد 816 | فبها 817 | فر 818 | فردا 819 | فعلا 820 | فعلاً 821 | فقط 822 | فلان 823 | فلذا 824 | فوق 825 | فکر 826 | ق 827 | قاالند 828 | قابل 829 | قاطبه 830 | قاطعانه 831 | قاعدتاً 832 | قانوناً 833 | قبل 834 | قبلا 835 | قبلاً 836 | قبلند 837 | قدر 838 | قدری 839 | قصدِ 840 | قضایاست 841 | قطعا 842 | قطعاً 843 | ل 844 | لااقل 845 | لاجرم 846 | لب 847 | لذا 848 | لزوماً 849 | لطفا 850 | لطفاً 851 | لیکن 852 | م 853 | ما 854 | مادامی 855 | ماست 856 | مامان مامان گویان 857 | مان 858 | مانند 859 | مانندِ 860 | مبادا 861 | متؤسفانه 862 | متاسفانه 863 | متعاقبا 864 | متفاوتند 865 | مثل 866 | مثلا 867 | مثلِ 868 | مجانی 869 | مجبورند 870 | مجددا 871 | مجدداً 872 | مجموعا 873 | مجموعاً 874 | محتاجند 875 | محکم 876 | محکم‌تر 877 | مخالفند 878 | مختلف 879 | مخصوصاً 880 | مدام 881 | مدت 882 | مدتهاست 883 | مدّتی 884 | مذهبی اند 885 | مرا 886 | مرتب 887 | مردانه 888 | مردم 889 | مردم اند 890 | مرسی 891 | مستحضرید 892 | مستقیما 893 | مستند 894 | مسلما 895 | مشت 896 | مشترکاً 897 | مشغولند 898 | مطمانا 899 
| مطمانم 900 | مطمینا 901 | مع الاسف 902 | مع ذلک 903 | معتقدم 904 | معتقدند 905 | معتقدیم 906 | معدود 907 | معذوریم 908 | معلومه 909 | معمولا 910 | معمولاً 911 | معمولی 912 | مغرضانه 913 | مفیدند 914 | مقابل 915 | مقدار 916 | مقصرند 917 | مقصری 918 | ملیارد 919 | ملیون 920 | ممکن 921 | ممیزیهاست 922 | من 923 | منتهی 924 | منطقی 925 | منی 926 | مواجهند 927 | موارد 928 | موجودند 929 | مورد 930 | موقتا 931 | مکرر 932 | مکرراً 933 | مگر 934 | مگر آن که 935 | مگر این که 936 | مگو 937 | می 938 | میان 939 | میزان 940 | میلیارد 941 | میلیون 942 | میکند 943 | میکنم 944 | میکنند 945 | میکنی 946 | میکنید 947 | میکنیم 948 | می‌تواند 949 | می‌خواهیم 950 | می‌داند 951 | می‌رسد 952 | می‌رود 953 | می‌شود 954 | می‌کنم 955 | می‌کنند 956 | می‌کنیم 957 | ن 958 | ناامید 959 | ناخواسته 960 | ناراضی اند 961 | ناشی 962 | نام 963 | ناگاه 964 | ناگزیر 965 | ناگهان 966 | ناگهانی 967 | نباید 968 | نبش 969 | نبود 970 | نخست 971 | نخستین 972 | نخواهد 973 | نخواهم 974 | نخواهند 975 | نخواهی 976 | نخواهید 977 | نخواهیم 978 | نخودی 979 | ندارد 980 | ندارم 981 | ندارند 982 | نداری 983 | ندارید 984 | نداریم 985 | نداشت 986 | نداشتم 987 | نداشتند 988 | نداشته 989 | نداشتی 990 | نداشتید 991 | نداشتیم 992 | نزد 993 | نزدِ 994 | نزدیک 995 | نزدیکِ 996 | نسبتا 997 | نشان 998 | نشده 999 | نظیر 1000 | نفرند 1001 | نماید 1002 | نموده 1003 | نمی 1004 | نمی‌شود 1005 | نمی‌کند 1006 | نه 1007 | نه تنها 1008 | نهایتا 1009 | نهایتاً 1010 | نوع 1011 | نوعاً 1012 | نوعی 1013 | نکرده 1014 | نکن 1015 | نکند 1016 | نکنم 1017 | نکنند 1018 | نکنی 1019 | نکنید 1020 | نکنیم 1021 | نگاه 1022 | نگو 1023 | نیازمندند 1024 | نیز 1025 | نیست 1026 | نیستم 1027 | نیستند 1028 | نیستیم 1029 | نیمی 1030 | ه 1031 | ها 1032 | های 1033 | هایی 1034 | هبچ 1035 | هر 1036 | هر از گاهی 1037 | هر چند 1038 | هر چند که 1039 | هر چه 1040 | هرچند 1041 | هرچه 1042 | هرکس 1043 | هرگاه 1044 | هرگز 1045 | هزار 1046 | هست 1047 | هستم 1048 | هستند 1049 | هستی 1050 | هستید 1051 | هستیم 1052 | هفت 1053 | هق هق کنان 1054 | هم 1055 | هم اکنون 1056 | هم اینک 1057 | همان 1058 | همان طور که 1059 | همان گونه که 1060 | همانا 1061 | همانند 1062 | همانها 1063 | همدیگر 1064 | همزمان 1065 | همه 1066 | همه روزه 1067 | همه ساله 1068 | همه شان 1069 | همهٌ 1070 | همه‌اش 1071 | همواره 1072 | همچنان 1073 | همچنان که 1074 | همچنین 1075 | همچون 1076 | همچین 1077 | همگان 1078 | همگی 1079 | همیشه 1080 | همین 1081 | همین که 1082 | هنوز 1083 | هنگام 1084 | هنگامِ 1085 | هنگامی 1086 | هنگامی که 1087 | هوی 1088 | هی 1089 | هیچ 1090 | هیچ گاه 1091 | هیچکدام 1092 | هیچکس 1093 | هیچگاه 1094 | هیچگونه 1095 | هیچی 1096 | و 1097 | و لا غیر 1098 | وابسته اند 1099 | واقعا 1100 | واقعاً 1101 | واقعی 1102 | واقفند 1103 | واما 1104 | وای 1105 | وجود 1106 | وحشت زده 1107 | وسطِ 1108 | وضع 1109 | وقتی 1110 | وقتی که 1111 | وقتیکه 1112 | ولی 1113 | وگرنه 1114 | وگو 1115 | وی 1116 | ویا 1117 | ویژه 1118 | ّه 1119 | ٪ 1120 | پ 1121 | پارسال 1122 | پارسایانه 1123 | پاره‌ای 1124 | پاعینِ 1125 | پایین ترند 1126 | پدرانه 1127 | پرسان 1128 | پروردگارا 1129 | پریروز 1130 | پس 1131 | پس از 1132 | پس فردا 1133 | پشت 1134 | پشتوانه اند 1135 | پشیمونی 1136 | پنج 1137 | پهن شده 1138 | پی 1139 | پی درپی 1140 | پیدا 1141 | پیداست 1142 | پیرامون 1143 | پیش 1144 | پیشاپیش 1145 | پیشتر 1146 | پیشِ 1147 | پیوسته 1148 | چ 1149 | چاپلوسانه 1150 | چت 1151 | چته 1152 | چرا 1153 | چرا که 1154 | چشم بسته 1155 | چطور 1156 | چقدر 1157 | چنان 1158 | چنانچه 1159 | چنانکه 1160 | چند 1161 | چند روزه 1162 | چندان 1163 | چنده 1164 | چندین 1165 | چنین 1166 | چه 1167 | چه بسا 1168 | چه طور 1169 | چهار 1170 | چو 1171 | چون 1172 | چکار 1173 | چگونه 
1174 | چی 1175 | چیز 1176 | چیزی 1177 | چیزیست 1178 | چیست 1179 | چیه 1180 | ژ 1181 | ک 1182 | کارند 1183 | کاش 1184 | کاشکی 1185 | کامل 1186 | کاملا 1187 | کاملاً 1188 | کتبا 1189 | کجا 1190 | کجاست 1191 | کدام 1192 | کرد 1193 | کردم 1194 | کردن 1195 | کردند 1196 | کرده 1197 | کردی 1198 | کردید 1199 | کردیم 1200 | کس 1201 | کسانی 1202 | کسی 1203 | کل 1204 | کلا 1205 | کلی 1206 | کلیه 1207 | کم 1208 | کم کم 1209 | کمااینکه 1210 | کماکان 1211 | کمتر 1212 | کمتره 1213 | کمتری 1214 | کمی 1215 | کن 1216 | کنار 1217 | کنارش 1218 | کنارِ 1219 | کنایه‌ای 1220 | کند 1221 | کنم 1222 | کنند 1223 | کننده 1224 | کنون 1225 | کنونی 1226 | کنی 1227 | کنید 1228 | کنیم 1229 | که 1230 | کو 1231 | کَی 1232 | کی 1233 | گ 1234 | گاه 1235 | گاهی 1236 | گذاری 1237 | گذاشته 1238 | گذشته 1239 | گردد 1240 | گردند 1241 | گرفت 1242 | گرفتارند 1243 | گرفتم 1244 | گرفتن 1245 | گرفتند 1246 | گرفته 1247 | گرفتی 1248 | گرفتید 1249 | گرفتیم 1250 | گروهی 1251 | گرچه 1252 | گفت 1253 | گفتم 1254 | گفتن 1255 | گفتند 1256 | گفته 1257 | گفتی 1258 | گفتید 1259 | گفتیم 1260 | گه 1261 | گهگاه 1262 | گو 1263 | گونه 1264 | گوی 1265 | گویا 1266 | گوید 1267 | گویم 1268 | گویند 1269 | گویی 1270 | گویید 1271 | گوییم 1272 | گیر 1273 | گیرد 1274 | گیرم 1275 | گیرند 1276 | گیری 1277 | گیرید 1278 | گیریم 1279 | ی 1280 | یا 1281 | یاب 1282 | یابد 1283 | یابم 1284 | یابند 1285 | یابی 1286 | یابید 1287 | یابیم 1288 | یارب 1289 | یافت 1290 | یافتم 1291 | یافتن 1292 | یافته 1293 | یافتی 1294 | یافتید 1295 | یافتیم 1296 | یعنی 1297 | یقینا 1298 | یقیناً 1299 | یه 1300 | یواش یواش 1301 | یک 1302 | یک جوری 1303 | یک کم 1304 | یک کمی 1305 | یکدیگر 1306 | یکریز 1307 | یکسال 1308 | یکهزار 1309 | یکی 1310 | ۰ 1311 | ۱ 1312 | ۲ 1313 | ۳ 1314 | ۴ 1315 | ۵ 1316 | ۶ 1317 | ۷ 1318 | ۸ 1319 | ۹ 1320 | … 1321 | و 1322 | ‏‏‏علاقه مند 1323 | میخونم 1324 | میخوانم 1325 | می خوانم 1326 | میخونید 1327 | میخوانید 1328 | می خوانید 1329 | در آینده 1330 | بشم 1331 | بشی 1332 | بشید 1333 | بشین 1334 | یک چیزی 1335 | بهتون 1336 | اینم 1337 | بیفته 1338 | محض رضای خدا 1339 | هیچوقت 1340 | دونستن 1341 | میفرستین 1342 | میفرستی 1343 | میفرستم 1344 | عه 1345 | هستش 1346 | همه‌مون 1347 | همه مون 1348 | جدی 1349 | بدجور 1350 | بد جور 1351 | خداروشکر 1352 | شی 1353 | وجدانا 1354 | روم 1355 | بگین 1356 | هیچ جور 1357 | هیچجور 1358 | هیچ‌جور 1359 | مثل اینکه 1360 | دوهزاری 1361 | هستا 1362 | شون 1363 | هامو 1364 | هام رو 1365 | مارو 1366 | ما رو 1367 | رو 1368 | داره 1369 | این دفعه 1370 | دفعه -------------------------------------------------------------------------------- /stop_words/swear_words.txt: -------------------------------------------------------------------------------- 1 | آب کیر 2 | آشغال 3 | آلت تناسلی 4 | آلت 5 | ابله 6 | ابن یزید 7 | احمق 8 | اسب 9 | اسبی 10 | اسکل 11 | اسکل 12 | اسگل 13 | اسگول 14 | الاغ 15 | الاق 16 | انگل 17 | انی 18 | انی 19 | اوسکل 20 | اوسکل 21 | اوسگل 22 | اوصکل 23 | اوصگل 24 | ب ک 25 | باسن 26 | بخورش 27 | بدبخت 28 | بمال 29 | تخمم 30 | کیرم 31 | بپرروش 32 | بپرسرش 33 | کونی 34 | بکارت 35 | بکن توش 36 | بکنش 37 | بکنمت 38 | خایه 39 | بی عفت 40 | بی غیرت 41 | بی ناموس 42 | بی پدر 43 | بیابخورش 44 | بیشعور 45 | بیناموس 46 | تخم سگ 47 | تخمی 48 | ترک 49 | توله سگ 50 | جاکش 51 | جلق زدن 52 | جنده 53 | جنسی 54 | جوون 55 | جکس 56 | جیندا 57 | حرومزاده 58 | حشر 59 | حشری شدن 60 | حشری 61 | حیوانی 62 | خارکس ده 63 | خارکسده 64 | خارکسّه 65 | خانم جنده 66 | خایه خور 67 | خایه مال 68 | خایه 69 | خر 70 | خرفت 71 | خری 72 | خز 73 | خفه خون 74 | خفه شو 75 | خواهرجنده 76 | خی کاس 77 | داف ناز 78 | داف 79 | داگ استایل 80 | دخترجنده 81 | دخترقرتی 82 | 
درازگوش 83 | دله 84 | دهن سرویس 85 | گاییده 86 | دهنت سرویس 87 | دوجنسه 88 | دول 89 | دیوث 90 | دیوس خان 91 | دیوس 92 | دیوص 93 | رشتی 94 | ریدن 95 | ریدی 96 | زارت 97 | زباله 98 | زرنزن 99 | زن جنده 100 | زن کاسده 101 | زنا زاده 102 | زنا 103 | زنازاده 104 | زنتو 105 | زنشو 106 | زنیکه 107 | سادیسمی 108 | ساک 109 | ساکونی 110 | سرخور 111 | سرکیر 112 | سسکی 113 | سوراخ کون 114 | سوراخ کون 115 | سولاخ 116 | سکس چت 117 | سکس 118 | سکسی باش 119 | سکسی 120 | سکسیم 121 | سکسیی 122 | سگ تو روحت 123 | سگ دهن 124 | سگ صفت 125 | سگ پدر 126 | سگی 127 | سیکتیر 128 | شاسگول 129 | شاش 130 | شق کردن 131 | شل مغز 132 | شنگول 133 | شهوتی 134 | صیغه ای 135 | صیک 136 | عرب 137 | عرق خور 138 | عمتو 139 | عمه ننه 140 | عن تر 141 | عن 142 | عنتر 143 | عوضی 144 | غرمساق 145 | غرمصاق 146 | فاحشه خانم 147 | فاحشه 148 | فارس 149 | فاک فیس 150 | فیلم سوپر 151 | قرتی 152 | قرمساق 153 | قرمصاق 154 | قس 155 | لا پا 156 | لاس 157 | لاش گوشت 158 | لاشی 159 | لاکونی 160 | لجن 161 | لخت 162 | لختی 163 | لر 164 | لز 165 | مادر جنده 166 | مادرجنده 167 | مادرسگ 168 | مادرقهوه 169 | مادرکونی 170 | مالوندن 171 | ماچ کردنی 172 | مرتیکه 173 | مردیکه 174 | مرض داری 175 | مرضداری 176 | مشروب 177 | ملنگ 178 | ممه خور 179 | ممه 180 | منگل 181 | میخوریش 182 | نرکده 183 | نعشه 184 | نکبت 185 | نگاییدم 186 | هیز 187 | ولدزنا 188 | پدر سوخته 189 | پدر سگ 190 | پدر صلواتی 191 | پدرسگ 192 | پریود 193 | پستان 194 | پسون 195 | پشمام 196 | پفیوز 197 | پلشت 198 | پورن 199 | پپه 200 | چاغال 201 | چاقال 202 | چس خور 203 | چس 204 | کاسکش 205 | کث لیس 206 | کث 207 | کثافت 208 | کثافط 209 | کردن 210 | کردنی 211 | کرم 212 | کس خل 213 | کس خور 214 | کس خیس 215 | کس دادن 216 | کس لیس 217 | کس لیسیدن 218 | کس ننت 219 | کس و کیر 220 | کس کردن 221 | کس کش 222 | کس 223 | کسخل 224 | کسشعر 225 | کسکش 226 | کسکیر 227 | کص خل 228 | کص لیس 229 | کص 230 | کصافت 231 | کصافط 232 | کصخل 233 | کصکش 234 | کلفت 235 | کله کیری 236 | کوث لیس 237 | کوس خل 238 | کوس خور 239 | کوس لیس 240 | کوس 241 | کوص خل 242 | کوص لیس 243 | کوص 244 | کون تپل 245 | کون ده 246 | کون سوراخ 247 | کون پنیر 248 | کون گنده 249 | کون 250 | کونده خار 251 | کونده خوار 252 | کونده 253 | کونشو 254 | کونی 255 | کونی 256 | کیر 257 | کیردراز 258 | کیردوس 259 | کیرر 260 | کیرمکیدن 261 | کیرناز 262 | کیروکس 263 | کیروکس 264 | کیری 265 | گاو 266 | گاوی 267 | گاگول 268 | گایدن 269 | گایدی 270 | گاییدن 271 | گردن دراز 272 | گشاد 273 | گوز 274 | گوزو 275 | گوسفند 276 | گوش دراز 277 | گوه 278 | گوه 279 | گی زن 280 | گیخوار 281 | یبن زنا 282 | مادرتو 283 | ناموستو 284 | چنده 285 | باسنی 286 | سیکیم 287 | سگ ناموس 288 | نوب 289 | خایمال 290 | مادر به خطا 291 | کصلیس 292 | بکنت 293 | کصده 294 | گورومساخ 295 | پوفیوز 296 | پدرتو 297 | قورومساق 298 | سیهدیر 299 | اوبی 300 | مادر سگ 301 | نگایدم -------------------------------------------------------------------------------- /tools/Dataset.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Dataset Module to load custom datasets 3 | ''' 4 | 5 | from operator import index 6 | from textwrap import indent 7 | import numpy as np 8 | 9 | 10 | class Dataset: 11 | ''' 12 | This module is designed to help users load their own custom dataset. 
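    Expected layout under `path` (inferred from load_data and load_vocab below):

        <path>/data.tsv   one document per line, tab separated:
                          "<text>\t<split>" or "<text>\t<split>\t<label>",
                          where <split> is one of train, test or dev
        <path>/vocab.txt  one vocabulary token per line (only the first
                          whitespace-separated field of each line is used)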
13 | ''' 14 | def __init__(self, path:str, encoding:str='utf-8') -> None: 15 | ''' 16 | initialization of Dataset 17 | :param path : string, path to the dataset 18 | :param encoding : string, encoding to read data (default 'utf-8') 19 | ''' 20 | 21 | self.initialize_corpus( 22 | self.load_data(path, encoding)) # initialize train, test, dev 23 | self.load_vocab(path, encoding) # get vocabulary 24 | self.wordtoindex = {word: index for index, word in enumerate(self.vocab)} 25 | self.indextoword = {index: word for word, index in self.wordtoindex.items()} 26 | self.count_words() 27 | 28 | def initialize_corpus(self, data:dict) -> None: 29 | self.train_corpus = data['train_corpus'] 30 | self.test_corpus = data['test_corpus'] 31 | self.dev_corpus = data['dev_corpus'] 32 | 33 | self.train_labels = data['train_labels'] 34 | self.test_labels = data['test_labels'] 35 | self.dev_labels = data['dev_labels'] 36 | 37 | def load_data(self, path:str, encoding:str) -> None: 38 | data = { 39 | 'train_corpus' : [], 40 | 'test_corpus' : [], 41 | 'dev_corpus' : [], 42 | 'train_labels' : [], 43 | 'test_labels' : [], 44 | 'dev_labels' : [] 45 | } 46 | 47 | with open(f'{path}/data.tsv', 'r', encoding=encoding) as f: 48 | lines = f.readlines() 49 | for line in lines: 50 | _ = line.split('\t') 51 | _slice = _[1] 52 | if len(_) == 3: 53 | try: 54 | data[f'{_slice}_corpus'].append(_[0]) 55 | data[f'{_slice}_labels'].append(_[2]) 56 | except Exception: 57 | print(f'{_slice} is not in [train, test, dev]...') 58 | elif len(_) == 2: 59 | try: 60 | data[f'{_slice}_corpus'].append(_[0]) 61 | except Exception: 62 | print(f'{_slice} is not in [train, test, dev]...') 63 | else: 64 | raise Exception('data file must have at least 2 and at most 3 columns...') 65 | return data 66 | 67 | def load_vocab(self, path:str, encoding:str) -> None: 68 | self.vocab = ['UNK'] 69 | with open(f'{path}/vocab.txt', 'r', encoding=encoding) as f: 70 | lines = f.readlines() 71 | for line in lines: 72 | _ = line.split() 73 | self.vocab.append(_[0]) 74 | 75 | def count_words(self): 76 | self.words_count = {} 77 | 78 | for doc in self.train_corpus: 79 | tokenized = doc.split() 80 | for token in tokenized: 81 | if token in self.vocab: 82 | try: 83 | self.words_count[token] += 1 84 | except: 85 | self.words_count[token] = 1 86 | 87 | for i in list(self.words_count.keys()): 88 | if self.words_count[i] == 0: 89 | del self.words_count[i] 90 | del self.indextoword[self.wordtoindex[i]] 91 | del self.wordtoindex[i] 92 | -------------------------------------------------------------------------------- /tools/create_dataset.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is the function of the scraper that generates a dataset from a list of hashtags. 
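Example invocation (illustrative; the flag names are assumed to mirror the
attributes read from `args` in main(), i.e. max_results, lang, since, until
and with_replies):

    python create_dataset.py --max_results 1000 --lang fa --since <date> --until <date>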
3 | """ 4 | 5 | from scraper import TwitterScraper 6 | from hazm import word_tokenize, Normalizer, Lemmatizer 7 | import numpy as np 8 | import pandas as pd 9 | import argparse 10 | from tqdm import tqdm 11 | 12 | normalizer = Normalizer().normalize 13 | lemmatizer = Lemmatizer().lemmatize 14 | 15 | # Retrieved from https://github.com/kharazi/persian-stopwords 16 | stopwords = set(open('../stop_words/stop_words.txt', encoding='utf8').read().splitlines()) 17 | # Retrieved from https://github.com/amirshnll/Persian-Swear-Words 18 | swearing_words = set(open('../stop_words/swear_words.txt', encoding='utf8').read().splitlines()) 19 | 20 | bad_hashtags = set(['تا_آخوند_کفن_نشود_این_وطن_وطن_نشود', 21 | 'ایران_را_پس_میگیریم', 22 | 'جمهوری_اسلامی_نابود_باید_گردد', 23 | 'مرگ_بر_خامنه\\u200cای_جنایتکار', 24 | 'مرگ_بر_کلیت_و_تمامیت_جمهوری_اسلامی', 25 | 'جاویدشاه', 26 | 'نه_به_جمهورى_اسلامى', 27 | 'ریدم_تو_اسلام', 28 | 'براندازم', 29 | 'قيام_تا_سرنگونی', 30 | 'مريم_رجوی']) 31 | 32 | swearing_words.update(bad_hashtags) 33 | 34 | class const: 35 | farsi = ('ب', 'س', 'ش', 'ل', 'ت', 'ن', 'م', 'گ', 'ظ', 'ط', 'ز', 36 | 'ر', 'ژ', 'ذ', 'د', 'پ', 'چ', 'ج', 'ح', 'ع', 37 | 'خ', 'غ', 'ف', 'ق', 'ث', 'ص', 'ض','\u0020', 38 | '\u200C', '\u060c','؟', '!', '?', '.', ':','\n', '_') 39 | 40 | alef = ('ا', 'آ', 'ء', 'أ', 'إ') 41 | vav = ('و', 'ؤ') 42 | heh = ('ه', 'ة', 'ە') 43 | yah = ('ی', 'ي', 'ئ', 'ى') 44 | kaf = ('ک', 'ك') 45 | 46 | hashtags = { 47 | "economics": [ 48 | "بورس", 49 | "نفت", 50 | "دلار", 51 | "بازارکار", 52 | "اقتصادی", 53 | "اخبار_اقتصادی", 54 | "اقتصاد_ایران", 55 | "بازار_آزاد", 56 | "بانک_مرکزی", 57 | "ارز", 58 | "مالیات", 59 | "تورم", 60 | "نرخ", 61 | "تحریم", 62 | "طلا", 63 | "ارز۴۲۰۰", 64 | "گرانی", 65 | "بانک", 66 | "سهام_عدالت", 67 | "خودرو", 68 | "فارکس", 69 | "بنزین", 70 | "بازار", 71 | "نرخ_ارز", 72 | "یورو", 73 | "قیمت_نفت", 74 | "بودجه", 75 | "قیمت", 76 | "بازار_کار", 77 | "اقتصاد", 78 | "سکه", 79 | "فرابورس", 80 | "سهام", 81 | "بیمه", 82 | ], 83 | "health": [ 84 | "کرونا", 85 | "وزارت_بهداشت", 86 | "نه_به_واکسن_اجباری", 87 | "واکسن", 88 | "واکسن_بزنیم", 89 | "كرونا", 90 | "اومیکرون", 91 | "پزشکی", 92 | "واکسن_اجباری", 93 | "واکسن_کرونا", 94 | "پزشک", 95 | "امیکرون", 96 | "واکسیناسیون", 97 | "ماسک", 98 | "آمار_کرونا", 99 | "واکسن_میزنم", 100 | "وزات_بهداشت", 101 | "بهداشت", 102 | "کووید۱۹", 103 | "COVID19", 104 | "وزیر_بهداشت", 105 | "HIV", 106 | "اميكرون", 107 | "نه_به_واکسن", 108 | "بهترین_واکسن_در_دسترس_ترین_واکسن", 109 | "أوميكرون", 110 | "واکسن_حق_مردم", 111 | "واكسن", 112 | "برکت", 113 | ], 114 | "sport": [ 115 | "استقلال", 116 | "پرسپولیس", 117 | "فوتبال", 118 | "پرسپوليس", 119 | "ورزش", 120 | "HalaMadrid", 121 | "رئال_مادرید", 122 | "ورزش_سیاسی_نیست", 123 | "لیگ_برتر", 124 | "تیم_حکومتی", 125 | "تاج", 126 | "آرسنال", 127 | "پیروزی", 128 | "فرهاد_مجیدی", 129 | "والیبال", 130 | "المپیک", 131 | "حامد_لک", 132 | "فوتبال_پاک", 133 | "دربی", 134 | "فیفا", 135 | "لیورپول", 136 | "پنالتی", 137 | "فنرباغچه", 138 | "تراکتور", 139 | "لیگ", 140 | "فدراسیون_آبی", 141 | "ورزش_سیاسی", 142 | "چلسی", 143 | "RealPSG", 144 | "جام_جهانی", 145 | "مهدی_طارمی", 146 | "تیم", 147 | "تنیس", 148 | "باشگاه", 149 | ], 150 | "art": [ 151 | "شعر", 152 | "کتاب", 153 | "سینما", 154 | "تئاتر", 155 | "فیلم", 156 | "سریال", 157 | "كتاب", 158 | "موسیقی", 159 | "پیشنهاد_فیلم", 160 | "آهنگ", 161 | "حافظ", 162 | "سعدی", 163 | "معرفی_کتاب", 164 | "کارگردان", 165 | "خواننده", 166 | "جشنواره_فیلم_فجر", 167 | "film", 168 | "cinema", 169 | "actor", 170 | "drama", 171 | "moviestar", 172 | "Movietime", 173 | ], 174 | "tech": [ 
175 | "اینترنت", 176 | "اپل", 177 | "سامسونگ", 178 | "بازی", 179 | "گیم", 180 | "گوگل", 181 | "بیت_کوین", 182 | "کریپتو", 183 | "اتریوم", 184 | "ارزدیجیتال", 185 | "BTC", 186 | "همراه_اول", 187 | "Bitcoin", 188 | "ارز_دیجیتال", 189 | "بيتكوين", 190 | "سئو", 191 | "بیتکوین", 192 | "ایرانسل", 193 | "btc", 194 | "کاردانو", 195 | "دیجیکالا", 196 | "هوشمند", 197 | "استارلینک", 198 | ], 199 | "transport": [ 200 | "ترافیک", 201 | "اسنپ", 202 | "تپسی", 203 | "تاکسی", 204 | "هواپیما", 205 | "مترو", 206 | "اتوبوس", 207 | "طرح_ترافیک", 208 | "قطار", 209 | "فرودگاه", 210 | "سفر_استانی", 211 | "فرودگاه_مهرآباد", 212 | "جاده_چالوس", 213 | ], 214 | "education": [ 215 | "معلم", 216 | "آموزش", 217 | "دانشگاه", 218 | "کنکور", 219 | "دانشگاه_آزاد", 220 | "مدرسه", 221 | "دانش_آموز", 222 | "کنکور_سراسری", 223 | "سازمان_سنجش", 224 | "دانشگاه_تهران", 225 | "آموزش_و_پرورش", 226 | "دانشجو", 227 | "معلمان", 228 | "روز_معلم", 229 | "فرهنگیان", 230 | "مدارس", 231 | "دانشگاه_فرهنگیان", 232 | ], 233 | "religion": [ 234 | "یا_سید_الساجدین", 235 | "امام_سجاد", 236 | "اللهم_عجل_لوليك_الفرج", 237 | "امام_حسین", 238 | "خدا", 239 | "امام", 240 | "رمضان", 241 | "قرآن", 242 | "مسلمان", 243 | "اسلام", 244 | "عاشورا", 245 | "شیعه", 246 | "حج", 247 | "MuhammadForAll", 248 | "زين_العابدين", 249 | "امام_رضا", 250 | ], 251 | "lifestyle": [ 252 | "شیک", 253 | "زیبایی", 254 | "تقویم_آشپزی", 255 | "پوست", 256 | "آشپزی", 257 | "غذا", 258 | "قهوه", 259 | "رستوران", 260 | ], 261 | "social": [ 262 | "روز_جهانی_زن", 263 | "زن", 264 | "زنان", 265 | "روز_زن", 266 | "خانواده", 267 | "کشف_حجاب", 268 | "هشتم_مارس", 269 | "باحجاب_باوقار", 270 | "خودکشی", 271 | "ازدواج", 272 | "طلاق", 273 | "فقر", 274 | "مردان", 275 | "کودک_همسری", 276 | "زندانی_سیاسی", 277 | "حقوق_زنان", 278 | "حجاب", 279 | ], 280 | "ecology": [ 281 | "باران", 282 | "هوا", 283 | "آب", 284 | "زلزله", 285 | "کم_آبی", 286 | "آلودگی_هوا", 287 | "آلودگی", 288 | "ریزگرد", 289 | "هوای_تهران", 290 | "کولاک", 291 | "گردوخاک", 292 | "گردوغبار", 293 | "بارش", 294 | "سیلاب", 295 | "بارندگی", 296 | "آلودگی_هوای_تهران", 297 | "مدیریت_بحران", 298 | "برف", 299 | "سیل", 300 | "آتش", 301 | "آتش_سوزی", 302 | "خشکسالی", 303 | "محیط_زیست", 304 | "خاک", 305 | "هواشناسى", 306 | "هواشناسی_توییتر", 307 | ], 308 | } 309 | 310 | 311 | def remover(char): 312 | if char in const.farsi: 313 | return char 314 | if char in const.alef: 315 | return const.alef[0] 316 | if char in const.vav: 317 | return const.vav[0] 318 | if char in const.heh: 319 | return const.heh[0] 320 | if char in const.yah: 321 | return const.yah[0] 322 | if char in const.kaf: 323 | return const.kaf[0] 324 | return '' 325 | 326 | 327 | def pre_process(text): 328 | persian_words = map(remover, text) 329 | sentence = ''.join(persian_words) 330 | if (len(sentence) < 20): 331 | return None 332 | word_tokens = word_tokenize(sentence) 333 | 334 | for w in word_tokens: 335 | if w in swearing_words: 336 | return None 337 | 338 | filtered_stopwords = [w for w in word_tokens if w not in stopwords and len(w) > 1] 339 | 340 | if (len(filtered_stopwords) < 5): 341 | return None 342 | filtered_stopwords = ' '.join(filtered_stopwords) 343 | return filtered_stopwords 344 | 345 | 346 | def main(args): 347 | df = pd.DataFrame([]) 348 | for topic in tqdm(hashtags.keys(), desc='Scraping Topics'): 349 | scraper = TwitterScraper( 350 | max_results=args.max_results, 351 | hashtags=hashtags[topic], 352 | lang=args.lang, 353 | until=args.until, 354 | since=args.since, 355 | with_replies=args.with_replies, 356 | ) 357 | result = scraper.basic_mode() 358 | 
result['topic'] = topic
359 |         df = pd.concat([df, result], axis=0)
360 | 
361 |     # preprocess: keep rows with a username, normalize the text, drop empty results
362 |     df = df[df['username'].notna()]
363 |     tweets = map(pre_process, df.text)
364 |     tweets = list(tweets)
365 |     df['processed_text'] = tweets
366 |     df = df[df['processed_text'].notna()]
367 |     df = df.reset_index(drop=True)
368 | 
369 |     df = df.drop_duplicates(subset='tweet_id')
370 |     print('-- Dataframe shape: {}'.format(df.shape))
371 |     df = df.groupby('topic').apply(lambda x: x.sample(len(x) if len(x) < 10000 else 10000)).reset_index(drop=True)  # cap each topic at 10,000 tweets
372 |     df = df.reset_index(drop=True)
373 | 
374 |     df.to_csv("../datasets/twitter_dataset.tsv", index=False, sep='\t')
375 |     print('[ OK ] Dataset created.')
376 | 
377 | 
378 | if __name__ == "__main__":
379 |     parser = argparse.ArgumentParser()
380 |     parser.add_argument("--max_results", default=(2 * (10 ** 4)), type=int)
381 |     parser.add_argument("--lang", default="fa", type=str)
382 |     parser.add_argument("--until", default="2022-02-10", type=str)
383 |     parser.add_argument("--since", default="2019-06-01", type=str)
384 |     parser.add_argument("--with_replies", action="store_true")  # type=bool would treat any non-empty string (even "False") as True
385 |     args = parser.parse_args()
386 |     print(args)
387 |     main(args)
388 | 
--------------------------------------------------------------------------------
/tools/scraper.py:
--------------------------------------------------------------------------------
1 | # Query syntax reference: https://github.com/JustAnotherArchivist/snscrape/blob/master/snscrape/modules/twitter.py
2 | import snscrape.modules.twitter as sntwitter
3 | import pandas as pd
4 | from random import random
5 | from datetime import date
6 | from multiprocessing.dummy import Pool
7 | import time
8 | 
9 | 
10 | class TwitterScraper:
11 |     def __init__(
12 |         self,
13 |         max_results: int,
14 |         all_words=[],
15 |         exact_pharase=[],
16 |         any_words=[],
17 |         none_words=[],
18 |         hashtags=[],
19 |         mentioned_users=[],
20 |         from_users=[],
21 |         to_users=[],
22 |         with_links=True,
23 |         with_replies=True,
24 |         **kwargs,
25 |     ):
26 |         """
27 |         :param max_results: Maximum number of tweets to collect per query (or per user in user mode).
28 |         :param all_words: ['what’s', 'happening'] · contains both “what’s” and “happening”
29 |         :param exact_pharase: ['happy hour'] · contains the exact phrase “happy hour”
30 |         :param any_words: ['cats', 'dogs'] · contains either “cats” or “dogs” (or both)
31 |         :param none_words: ['cats', 'dogs'] · does not contain “cats” and does not contain “dogs”
32 |         :param hashtags: ['#ThrowbackThursday'] or ['ThrowbackThursday'] · contains the hashtag #ThrowbackThursday
33 |         :param mentioned_users: ['@SFBART', '@Caltrain'] or ['SFBART', 'Caltrain'] · mentions @SFBART or mentions @Caltrain
34 |         :param from_users: ['@Twitter'] or ['Twitter'] · sent from @Twitter
35 |         :param to_users: ['@Twitter'] or ['Twitter'] · sent in reply to @Twitter
36 |         :param with_links: If False, exclude tweets that contain links (adds -filter:links to the query)
37 |         :param with_replies: If False, exclude reply tweets (adds -filter:replies to the query)
38 |         :param kwargs: Extra query operators such as since, until, and lang; each is rendered as (key:value)
39 | 
40 |         """
41 | 
42 |         self.number_of_user = 0
43 |         self.max_results = max_results
44 |         self.all_words = TwitterScraper.all_of_these_words(all_words)
45 |         self.exact_pharase = TwitterScraper.any_of_these_exact_pharase(exact_pharase)
46 |         self.any_words = TwitterScraper.any_of_these_words(any_words)
47 |         self.none_words = TwitterScraper.none_of_these_words(none_words)
48 |         self.these_hashtags = TwitterScraper.any_of_these_hashtags(hashtags)
49 |         self.mentioned_users = TwitterScraper.mentioning_these_users(mentioned_users)
50 |         self.with_links = "-filter:links" if not with_links else ""
51 |         self.with_replies = "-filter:replies" if not with_replies else ""
52 | 
53 |         self.query_dict = {
54 |             "all_words": self.all_words,
55 |             "exact_pharase": self.exact_pharase,
56 |             "any_words": self.any_words,
57 |             "none_words": self.none_words,
58 |             "these_hashtags": self.these_hashtags,
59 |             "mentioned_users": self.mentioned_users,
60 |             "with_links": self.with_links,
61 |             "with_replies": self.with_replies,
62 |         }
63 | 
64 |         self.query_dict["from"] = TwitterScraper.f_or_t_users(from_users, "from")
65 |         self.query_dict["to"] = TwitterScraper.f_or_t_users(to_users, "to")
66 | 
67 |         for key, value in kwargs.items():
68 |             self.query_dict[key] = f"({key}:{value})"
69 | 
70 |     @staticmethod
71 |     def f_or_t_users(users, key):
72 |         if not users:
73 |             return ""
74 |         tmp_list = [f"{key}:{user}" for user in users]
75 |         return "(" + " OR ".join(tmp_list) + ")"
76 | 
77 |     @staticmethod
78 |     def all_of_these_words(all_words):
79 |         if not all_words:
80 |             return ""
81 |         return " ".join(all_words)
82 | 
83 |     @staticmethod
84 |     def any_of_these_words(any_words):
85 |         if not any_words:
86 |             return ""
87 |         return "(" + " OR ".join(any_words) + ")"
88 | 
89 |     @staticmethod
90 |     def any_of_these_exact_pharase(exact_pharase):
91 |         if not exact_pharase:
92 |             return ""
93 |         return '("' + '" OR "'.join(exact_pharase) + '")'
94 | 
95 |     @staticmethod
96 |     def none_of_these_words(none_words):
97 |         if not none_words:
98 |             return ""
99 |         return "-" + " -".join(none_words)
100 | 
101 |     @staticmethod
102 |     def any_of_these_hashtags(hashtags):
103 |         if not hashtags:
104 |             return ""
105 |         tmp_list = ["#" + h.replace("#", "") for h in hashtags]
106 |         return "(" + " OR ".join(tmp_list) + ")"
107 | 
108 |     @staticmethod
109 |     def mentioning_these_users(users):
110 |         if not users:
111 |             return ""
112 |         tmp_list = ["@" + h.replace("@", "") for h in users]
113 |         return "(" + " OR ".join(tmp_list) + ")"
114 | 
115 |     def create_query(self, query_dict):
116 |         """Join the non-empty query clauses into a single Twitter search string."""
117 |         res = {key: val for key, val in query_dict.items() if val}
118 |         del query_dict
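        # The surviving clauses in `res` are joined with single spaces below; an
        # illustrative result (assuming hashtags, reply filtering, and lang/until/since
        # kwargs were set, as create_dataset.py does) would look like:
        # '(#بورس OR #دلار) -filter:replies (lang:fa) (until:2022-02-10) (since:2019-06-01)'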
119 |         query = " ".join(res.values())
120 |         del res
121 | 
122 |         return query
123 | 
124 |     def crawler(self, query, error_counter=0):
125 |         """
126 |         Send the query to Twitter via snscrape and collect the matching tweets into a DataFrame.
127 | 
128 |         :param query: The Twitter search query built by create_query.
129 |         """
130 |         # Collect tweet records in a list before building the DataFrame
131 |         tweets_list = []
132 |         try:
133 |             # Using TwitterSearchScraper to scrape data and append tweets to the list
134 |             scraper = sntwitter.TwitterSearchScraper(query)
135 |             i = 0
136 |             for tweet in scraper.get_items():  # iterate over the matching tweets
137 |                 if i >= self.max_results:  # stop once max_results tweets have been collected
138 |                     break
139 | 
140 |                 tweets_list.append(
141 |                     [
142 |                         tweet.date,
143 |                         tweet.id,
144 |                         tweet.content,
145 |                         tweet.replyCount,
146 |                         tweet.retweetCount,
147 |                         tweet.likeCount,
148 |                         tweet.user.username,
149 |                         tweet.lang,
150 |                         tweet.media,
151 |                         tweet.hashtags,
152 |                     ]
153 |                 )  # the tweet attributes kept for the dataset
154 |                 i += 1
155 |         except Exception as e:
156 |             if "Unable to find guest token" in str(e):
157 |                 error_counter += 1
158 |                 if error_counter > 3:
159 |                     error_counter = 0
160 |                     print("Sleep Time!")
161 |                     time.sleep(30.3 * 60)  # back off for roughly 30 minutes before retrying
162 |                     print("Morning!")
163 | 
164 |                 return self.crawler(query, error_counter)
165 |             print(f"query: {query} , {e}")
166 | 
167 |         # Creating a dataframe from the tweets list above
168 |         tweets_df = pd.DataFrame(
169 |             tweets_list,
170 |             columns=[
171 |                 "datetime",
172 |                 "tweet_id",
173 |                 "text",
174 |                 "reply_count",
175 |                 "retweet_count",
176 |                 "like_count",
177 |                 "username",
178 |                 "lang",
179 |                 "media",
180 |                 "hashtags",
181 |             ],
182 |         )
183 |         return tweets_df
184 | 
185 |     def basic_mode(self):
186 |         query = self.create_query(self.query_dict)
187 |         return self.crawler(query)
188 | 
189 |     def user_crawler(self, user):
190 |         """
191 |         Build and run the query for a specific user.
192 | 
193 |         :param user: Twitter account username.
194 |         """
195 |         tmp_dict = self.query_dict.copy()
196 |         tmp_dict["from"] = f"(from:{user})"
197 |         query = self.create_query(tmp_dict)
198 |         del tmp_dict
199 |         return self.crawler(query)
200 | 
201 |     def user_mode(self, user_list):
202 |         """
203 |         Parallelize scraping across users with a thread pool (one query per user).
204 | 
205 |         :param user_list: List of usernames whose tweets will be collected.
206 |         """
207 |         user_crawler = self.user_crawler
208 |         pool = Pool(22)  # thread pool with 22 workers
209 |         df_list = pool.map(user_crawler, user_list)
210 |         pool.close()
211 |         pool.join()
212 |         result_df = pd.concat(df_list, ignore_index=True)
213 |         return result_df
214 | 
--------------------------------------------------------------------------------
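Usage sketch (not part of the repository): the snippet below shows how TwitterScraper from tools/scraper.py might be driven directly for a single topic, assuming snscrape and pandas are installed and the code is run from the tools/ directory. The hashtags, result cap, date range, and output path are illustrative assumptions, not values taken from the project.

# Illustrative only: collect a small sample of Persian tweets for two example hashtags.
from scraper import TwitterScraper

scraper = TwitterScraper(
    max_results=500,              # assumed cap; create_dataset.py defaults to 20,000
    hashtags=["بورس", "دلار"],    # example hashtags (any list works)
    lang="fa",                    # forwarded via **kwargs and rendered as (lang:fa)
    since="2021-01-01",           # rendered as (since:2021-01-01)
    until="2021-12-31",           # rendered as (until:2021-12-31)
    with_replies=False,           # adds -filter:replies to the query
)

df = scraper.basic_mode()         # pandas DataFrame with datetime, tweet_id, text, ... columns
df.to_csv("sample_economics.tsv", sep="\t", index=False)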
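Similarly, a minimal sketch of how the Dataset helper in tools/Dataset.py appears to be consumed, assuming a directory that contains data.tsv (one text<TAB>split[<TAB>label] row per document, with split in train/test/dev) and vocab.txt (one token per line). The directory path and the probe word are hypothetical.

# Illustrative only: load a prepared corpus with the Dataset helper.
from Dataset import Dataset

ds = Dataset(path="../datasets/my_corpus")   # hypothetical directory holding data.tsv and vocab.txt

print(len(ds.train_corpus), "training documents")
print(len(ds.vocab), "vocabulary entries (index 0 is 'UNK')")
print(ds.wordtoindex.get("اقتصاد", 0))       # unseen words fall back to index 0, i.e. 'UNK'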