├── .gitignore ├── Dockerfile ├── LICENSE ├── README.md ├── build ├── Dockerfile └── requirements.txt ├── data ├── README.md └── test.csv ├── docker_build_run.sh ├── docs ├── images │ ├── BERT │ │ ├── 2D_vis_BERT_2020_02_07_06_36_35.png │ │ ├── Topic0_wordcloud_BERT_2020_02_07_06_36_35.png │ │ ├── Topic1_wordcloud_BERT_2020_02_07_06_36_35.png │ │ ├── Topic2_wordcloud_BERT_2020_02_07_06_36_35.png │ │ ├── Topic3_wordcloud_BERT_2020_02_07_06_36_35.png │ │ ├── Topic4_wordcloud_BERT_2020_02_07_06_36_35.png │ │ ├── Topic5_wordcloud_BERT_2020_02_07_06_36_35.png │ │ ├── Topic6_wordcloud_BERT_2020_02_07_06_36_35.png │ │ ├── Topic7_wordcloud_BERT_2020_02_07_06_36_35.png │ │ ├── Topic8_wordcloud_BERT_2020_02_07_06_36_35.png │ │ └── Topic9_wordcloud_BERT_2020_02_07_06_36_35.png │ ├── LDA_BERT │ │ ├── 2D_vis_LDA_BERT_2020_02_07_06_53_30.png │ │ ├── Topic0_wordcloud_LDA_BERT_2020_02_07_06_53_30.png │ │ ├── Topic1_wordcloud_LDA_BERT_2020_02_07_06_53_30.png │ │ ├── Topic2_wordcloud_LDA_BERT_2020_02_07_06_53_30.png │ │ ├── Topic3_wordcloud_LDA_BERT_2020_02_07_06_53_30.png │ │ ├── Topic4_wordcloud_LDA_BERT_2020_02_07_06_53_30.png │ │ ├── Topic5_wordcloud_LDA_BERT_2020_02_07_06_53_30.png │ │ ├── Topic6_wordcloud_LDA_BERT_2020_02_07_06_53_30.png │ │ ├── Topic7_wordcloud_LDA_BERT_2020_02_07_06_53_30.png │ │ ├── Topic8_wordcloud_LDA_BERT_2020_02_07_06_53_30.png │ │ └── Topic9_wordcloud_LDA_BERT_2020_02_07_06_53_30.png │ ├── TFIDF │ │ ├── 2D_vis_TFIDF_2020_02_07_06_24_55.png │ │ ├── Topic0_wordcloud_TFIDF_2020_02_07_06_24_55.png │ │ ├── Topic1_wordcloud_TFIDF_2020_02_07_06_24_55.png │ │ ├── Topic2_wordcloud_TFIDF_2020_02_07_06_24_55.png │ │ ├── Topic3_wordcloud_TFIDF_2020_02_07_06_24_55.png │ │ ├── Topic4_wordcloud_TFIDF_2020_02_07_06_24_55.png │ │ ├── Topic5_wordcloud_TFIDF_2020_02_07_06_24_55.png │ │ ├── Topic6_wordcloud_TFIDF_2020_02_07_06_24_55.png │ │ ├── Topic7_wordcloud_TFIDF_2020_02_07_06_24_55.png │ │ ├── Topic8_wordcloud_TFIDF_2020_02_07_06_24_55.png │ │ └── Topic9_wordcloud_TFIDF_2020_02_07_06_24_55.png │ ├── bert.png │ ├── lda_bert.png │ ├── model.png │ ├── steam_review.jpeg │ └── tfidf.png └── saved_models │ └── README.md ├── model ├── Autoencoder.py ├── README.md ├── main.py ├── model.py ├── preprocess.py └── utils.py ├── requirements.txt └── test.sh /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | *.i 3 | *.ii 4 | *.gpu 5 | *.ptx 6 | *.cubin 7 | *.fatbin 8 | *.ipynb 9 | model/__pycache__ 10 | *.swp 11 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.7-slim 2 | COPY ./requirements.txt / 3 | RUN pip install -r requirements.txt 4 | RUN python -m nltk.downloader punkt 5 | RUN python -m nltk.downloader wordnet 6 | RUN python -m nltk.downloader averaged_perceptron_tagger 7 | ENTRYPOINT ["python", "-u", "contextual_topic_identification/model/main.py"] 8 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Stve 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies 
of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Contextual Topic Identification for Steam Reviews 2 | 3 | This repository is the implementation of a contextual topic identification model. The model combines _LDA_ probabilistic topic assignments with pre-trained sentence embeddings from _BERT/RoBERTa_. The analysis is conducted on a dataset of game reviews from the Steam platform. 4 | 5 | ## Motivation 6 | 7 | Product reviews strongly influence people's choices, especially in online shopping, and most products accumulate overwhelming numbers of them. However, few platforms offer a satisfying way to categorize reviews by what the reviewers are actually talking about. Steam, for example, has a carefully designed review system that supports many interactions, yet it still provides no way to group reviews by their semantic meaning. 8 | 9 | ![Steam review logo](./docs/images/steam_review.jpeg) 10 | 11 | Therefore, we provide a topic identification procedure that combines both bag-of-words and contextual information to explore potential __semantically meaningful__ categories in the ocean of Steam reviews. 12 | 13 | ## Setup 14 | 15 | Clone the repo 16 | 17 | ``` 18 | git clone https://github.com/Stveshawn/contextual_topic_identification.git 19 | cd contextual_topic_identification 20 | ``` 21 | 22 | and make sure you have the dataset in the `data` folder (you can specify the path in the bash script later). 23 | 24 | 25 | To run the model and get trained model objects and visualizations 26 | 27 | ### With Docker 28 | 29 | run the bash script in your terminal 30 | 31 | ``` 32 | sudo bash docker_build_run.sh 33 | ``` 34 | 35 | The results will be saved in the `docs` folder under the corresponding model id (_Method_Year_Month_Day_Hour_Minute_Second_). 36 | 37 | Four parameters can be specified in the bash script 38 | 39 | + `samp_size`: number of reviews used in the model 40 | + `method={"TFIDF", "LDA", "BERT", "LDA_BERT"}`: method for the topic model 41 | + `ntopic`: number of topics 42 | + `fpath=/contextual_topic_identification/data/steam_reviews.csv`: file path to the CSV data 43 | 44 | To run a test case on the sampled test data, do 45 | ``` 46 | sudo bash test.sh 47 | ``` 48 | 49 | 50 | ## Data 51 | 52 | The dataset ([Steam review dataset](https://www.kaggle.com/luthfim/steam-reviews-dataset)) is published on Kaggle and covers ~480K reviews of 46 best-selling video games on Steam.
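For a quick look at the raw data before training, here is a minimal sketch (not part of the repo) that assumes the file has been downloaded to `data/steam_reviews.csv` and, as in `model/main.py`, that the review text lives in a column named `review`:

```python
import pandas as pd

# Assumed location of the Kaggle download; adjust the path if you keep it elsewhere.
data = pd.read_csv("data/steam_reviews.csv")
data = data.fillna("")  # the review column may contain NaN's (main.py does the same)
print(data.shape)
print(data.review.sample(3, random_state=0).tolist())  # a few raw reviews
```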
53 | 54 | To successfully run the model, you should have this dataset downloaded (Kaggle authentication required) and placed in the `data` folder (or specify your own file path in the bash script). 55 | 56 | ## Model 57 | 58 | To identify the potential topics of the target documents, traditional approaches are 59 | 60 | + Latent Dirichlet Allocation 61 | 62 | + Embedding + Clustering 63 | 64 | Although LDA generally works well for topic modeling tasks, it struggles with short documents, in which there isn’t much text to model, and with documents that don’t coherently discuss a single topic. Relying only on bag-of-words information also makes it quite limited. 65 | 66 | The contextual topic identification model leverages both bag-of-words and contextual information by including both the LDA topic probabilities and the sentence embeddings. The model is as follows 67 | 68 | ![Model](./docs/images/model.png) 69 | 70 | 71 | where we 72 | 73 | + take the information from the LDA probabilistic topic assignment (`v_1`) and the sentence embeddings (`v_2`) 74 | + concatenate `\lambda * v_1` and `v_2` to get `v_{full}` 75 | + learn the latent space representation `v_{latent}` of `v_{full}` with an autoencoder 76 | + implement clustering on the latent space representations. 77 | 78 | 79 | ## Result 80 | 81 | Visualizations (2D UMAP) of clustering results with different vectorization methods with `n_topic=10` 82 | 83 | | TF-IDF | BERT | LDA_BERT | 84 | |---|---|---| 85 | ![Model](./docs/images/tfidf.png) | ![Model](./docs/images/bert.png) | ![Model](./docs/images/lda_bert.png)| 86 | 87 | 88 | Evaluation of different topic identification models with `n_topic=10` 89 | 90 | 91 | 92 | | Metric\Method | TF-IDF + Clustering | LDA | BERT + Clustering | LDA_BERT + Clustering | 93 | |---|---|---|---|---| 94 | |C_Umass|__-2.161__|-5.233|-4.368|-3.394| 95 | |CV|0.538|0.482|0.547|__0.551__| 96 | |Silhouette score|0.025|/|0.063|__0.234__| 97 | 98 | ## Acknowledgements 99 | 100 | ### Libraries 101 | 102 | [Sentence Transformers](https://github.com/UKPLab/sentence-transformers): Sentence Embeddings using BERT / RoBERTa / DistilBERT / ALBERT / XLNet with PyTorch 103 | 104 | [SymSpell](https://github.com/wolfgarbe/SymSpell): 1 million times faster through Symmetric Delete spelling correction algorithm 105 | 106 | [Gensim](https://github.com/RaRe-Technologies/gensim): Topic Modelling in Python 107 | -------------------------------------------------------------------------------- /build/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.7-slim 2 | COPY ./requirements.txt / 3 | RUN pip install -r requirements.txt 4 | RUN python -m nltk.downloader punkt 5 | RUN python -m nltk.downloader wordnet 6 | RUN python -m nltk.downloader averaged_perceptron_tagger 7 | ENTRYPOINT ["python", "-u", "contextual_topic_identification/model/main.py"] 8 | -------------------------------------------------------------------------------- /build/requirements.txt: -------------------------------------------------------------------------------- 1 | matplotlib 2 | numpy>=1.18.1 3 | pandas 4 | stop_words 5 | language_detector 6 | sklearn 7 | symspellpy==6.5.2 8 | gensim==3.8.1 9 | wordcloud==1.6.0 10 | tensorflow==1.14.0 11 | keras==2.3.1 12 | sentence-transformers==0.2.5 13 | umap-learn==0.3.10 14 | nltk==3.4.5 15 | -------------------------------------------------------------------------------- /data/README.md: -------------------------------------------------------------------------------- 1 | To train the model on the Steam review
dataset, you should have `steam_reviews.csv` downloaded and put in this folder. 2 | -------------------------------------------------------------------------------- /docker_build_run.sh: -------------------------------------------------------------------------------- 1 | docker build . -t tm:1.0 2 | docker run --rm -v $(pwd):/contextual_topic_identification tm:1.0 --samp_size=3000 --method=LDA_BERT --ntopic=10 3 | -------------------------------------------------------------------------------- /docs/images/BERT/2D_vis_BERT_2020_02_07_06_36_35.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stveshawn/contextual_topic_identification/4129beb8e3e24af9870b7aeb4c1c9333569e86e5/docs/images/BERT/2D_vis_BERT_2020_02_07_06_36_35.png -------------------------------------------------------------------------------- /docs/images/BERT/Topic0_wordcloud_BERT_2020_02_07_06_36_35.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stveshawn/contextual_topic_identification/4129beb8e3e24af9870b7aeb4c1c9333569e86e5/docs/images/BERT/Topic0_wordcloud_BERT_2020_02_07_06_36_35.png -------------------------------------------------------------------------------- /docs/images/BERT/Topic1_wordcloud_BERT_2020_02_07_06_36_35.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stveshawn/contextual_topic_identification/4129beb8e3e24af9870b7aeb4c1c9333569e86e5/docs/images/BERT/Topic1_wordcloud_BERT_2020_02_07_06_36_35.png -------------------------------------------------------------------------------- /docs/images/BERT/Topic2_wordcloud_BERT_2020_02_07_06_36_35.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stveshawn/contextual_topic_identification/4129beb8e3e24af9870b7aeb4c1c9333569e86e5/docs/images/BERT/Topic2_wordcloud_BERT_2020_02_07_06_36_35.png -------------------------------------------------------------------------------- /docs/images/BERT/Topic3_wordcloud_BERT_2020_02_07_06_36_35.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stveshawn/contextual_topic_identification/4129beb8e3e24af9870b7aeb4c1c9333569e86e5/docs/images/BERT/Topic3_wordcloud_BERT_2020_02_07_06_36_35.png -------------------------------------------------------------------------------- /docs/images/BERT/Topic4_wordcloud_BERT_2020_02_07_06_36_35.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stveshawn/contextual_topic_identification/4129beb8e3e24af9870b7aeb4c1c9333569e86e5/docs/images/BERT/Topic4_wordcloud_BERT_2020_02_07_06_36_35.png -------------------------------------------------------------------------------- /docs/images/BERT/Topic5_wordcloud_BERT_2020_02_07_06_36_35.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stveshawn/contextual_topic_identification/4129beb8e3e24af9870b7aeb4c1c9333569e86e5/docs/images/BERT/Topic5_wordcloud_BERT_2020_02_07_06_36_35.png -------------------------------------------------------------------------------- /docs/images/BERT/Topic6_wordcloud_BERT_2020_02_07_06_36_35.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Stveshawn/contextual_topic_identification/4129beb8e3e24af9870b7aeb4c1c9333569e86e5/docs/images/BERT/Topic6_wordcloud_BERT_2020_02_07_06_36_35.png -------------------------------------------------------------------------------- /docs/images/BERT/Topic7_wordcloud_BERT_2020_02_07_06_36_35.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stveshawn/contextual_topic_identification/4129beb8e3e24af9870b7aeb4c1c9333569e86e5/docs/images/BERT/Topic7_wordcloud_BERT_2020_02_07_06_36_35.png -------------------------------------------------------------------------------- /docs/images/BERT/Topic8_wordcloud_BERT_2020_02_07_06_36_35.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stveshawn/contextual_topic_identification/4129beb8e3e24af9870b7aeb4c1c9333569e86e5/docs/images/BERT/Topic8_wordcloud_BERT_2020_02_07_06_36_35.png -------------------------------------------------------------------------------- /docs/images/BERT/Topic9_wordcloud_BERT_2020_02_07_06_36_35.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stveshawn/contextual_topic_identification/4129beb8e3e24af9870b7aeb4c1c9333569e86e5/docs/images/BERT/Topic9_wordcloud_BERT_2020_02_07_06_36_35.png -------------------------------------------------------------------------------- /docs/images/LDA_BERT/2D_vis_LDA_BERT_2020_02_07_06_53_30.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stveshawn/contextual_topic_identification/4129beb8e3e24af9870b7aeb4c1c9333569e86e5/docs/images/LDA_BERT/2D_vis_LDA_BERT_2020_02_07_06_53_30.png -------------------------------------------------------------------------------- /docs/images/LDA_BERT/Topic0_wordcloud_LDA_BERT_2020_02_07_06_53_30.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stveshawn/contextual_topic_identification/4129beb8e3e24af9870b7aeb4c1c9333569e86e5/docs/images/LDA_BERT/Topic0_wordcloud_LDA_BERT_2020_02_07_06_53_30.png -------------------------------------------------------------------------------- /docs/images/LDA_BERT/Topic1_wordcloud_LDA_BERT_2020_02_07_06_53_30.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stveshawn/contextual_topic_identification/4129beb8e3e24af9870b7aeb4c1c9333569e86e5/docs/images/LDA_BERT/Topic1_wordcloud_LDA_BERT_2020_02_07_06_53_30.png -------------------------------------------------------------------------------- /docs/images/LDA_BERT/Topic2_wordcloud_LDA_BERT_2020_02_07_06_53_30.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stveshawn/contextual_topic_identification/4129beb8e3e24af9870b7aeb4c1c9333569e86e5/docs/images/LDA_BERT/Topic2_wordcloud_LDA_BERT_2020_02_07_06_53_30.png -------------------------------------------------------------------------------- /docs/images/LDA_BERT/Topic3_wordcloud_LDA_BERT_2020_02_07_06_53_30.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stveshawn/contextual_topic_identification/4129beb8e3e24af9870b7aeb4c1c9333569e86e5/docs/images/LDA_BERT/Topic3_wordcloud_LDA_BERT_2020_02_07_06_53_30.png 
-------------------------------------------------------------------------------- /docs/images/LDA_BERT/Topic4_wordcloud_LDA_BERT_2020_02_07_06_53_30.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stveshawn/contextual_topic_identification/4129beb8e3e24af9870b7aeb4c1c9333569e86e5/docs/images/LDA_BERT/Topic4_wordcloud_LDA_BERT_2020_02_07_06_53_30.png -------------------------------------------------------------------------------- /docs/images/LDA_BERT/Topic5_wordcloud_LDA_BERT_2020_02_07_06_53_30.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stveshawn/contextual_topic_identification/4129beb8e3e24af9870b7aeb4c1c9333569e86e5/docs/images/LDA_BERT/Topic5_wordcloud_LDA_BERT_2020_02_07_06_53_30.png -------------------------------------------------------------------------------- /docs/images/LDA_BERT/Topic6_wordcloud_LDA_BERT_2020_02_07_06_53_30.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stveshawn/contextual_topic_identification/4129beb8e3e24af9870b7aeb4c1c9333569e86e5/docs/images/LDA_BERT/Topic6_wordcloud_LDA_BERT_2020_02_07_06_53_30.png -------------------------------------------------------------------------------- /docs/images/LDA_BERT/Topic7_wordcloud_LDA_BERT_2020_02_07_06_53_30.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stveshawn/contextual_topic_identification/4129beb8e3e24af9870b7aeb4c1c9333569e86e5/docs/images/LDA_BERT/Topic7_wordcloud_LDA_BERT_2020_02_07_06_53_30.png -------------------------------------------------------------------------------- /docs/images/LDA_BERT/Topic8_wordcloud_LDA_BERT_2020_02_07_06_53_30.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stveshawn/contextual_topic_identification/4129beb8e3e24af9870b7aeb4c1c9333569e86e5/docs/images/LDA_BERT/Topic8_wordcloud_LDA_BERT_2020_02_07_06_53_30.png -------------------------------------------------------------------------------- /docs/images/LDA_BERT/Topic9_wordcloud_LDA_BERT_2020_02_07_06_53_30.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stveshawn/contextual_topic_identification/4129beb8e3e24af9870b7aeb4c1c9333569e86e5/docs/images/LDA_BERT/Topic9_wordcloud_LDA_BERT_2020_02_07_06_53_30.png -------------------------------------------------------------------------------- /docs/images/TFIDF/2D_vis_TFIDF_2020_02_07_06_24_55.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stveshawn/contextual_topic_identification/4129beb8e3e24af9870b7aeb4c1c9333569e86e5/docs/images/TFIDF/2D_vis_TFIDF_2020_02_07_06_24_55.png -------------------------------------------------------------------------------- /docs/images/TFIDF/Topic0_wordcloud_TFIDF_2020_02_07_06_24_55.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stveshawn/contextual_topic_identification/4129beb8e3e24af9870b7aeb4c1c9333569e86e5/docs/images/TFIDF/Topic0_wordcloud_TFIDF_2020_02_07_06_24_55.png -------------------------------------------------------------------------------- /docs/images/TFIDF/Topic1_wordcloud_TFIDF_2020_02_07_06_24_55.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stveshawn/contextual_topic_identification/4129beb8e3e24af9870b7aeb4c1c9333569e86e5/docs/images/TFIDF/Topic1_wordcloud_TFIDF_2020_02_07_06_24_55.png -------------------------------------------------------------------------------- /docs/images/TFIDF/Topic2_wordcloud_TFIDF_2020_02_07_06_24_55.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stveshawn/contextual_topic_identification/4129beb8e3e24af9870b7aeb4c1c9333569e86e5/docs/images/TFIDF/Topic2_wordcloud_TFIDF_2020_02_07_06_24_55.png -------------------------------------------------------------------------------- /docs/images/TFIDF/Topic3_wordcloud_TFIDF_2020_02_07_06_24_55.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stveshawn/contextual_topic_identification/4129beb8e3e24af9870b7aeb4c1c9333569e86e5/docs/images/TFIDF/Topic3_wordcloud_TFIDF_2020_02_07_06_24_55.png -------------------------------------------------------------------------------- /docs/images/TFIDF/Topic4_wordcloud_TFIDF_2020_02_07_06_24_55.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stveshawn/contextual_topic_identification/4129beb8e3e24af9870b7aeb4c1c9333569e86e5/docs/images/TFIDF/Topic4_wordcloud_TFIDF_2020_02_07_06_24_55.png -------------------------------------------------------------------------------- /docs/images/TFIDF/Topic5_wordcloud_TFIDF_2020_02_07_06_24_55.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stveshawn/contextual_topic_identification/4129beb8e3e24af9870b7aeb4c1c9333569e86e5/docs/images/TFIDF/Topic5_wordcloud_TFIDF_2020_02_07_06_24_55.png -------------------------------------------------------------------------------- /docs/images/TFIDF/Topic6_wordcloud_TFIDF_2020_02_07_06_24_55.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stveshawn/contextual_topic_identification/4129beb8e3e24af9870b7aeb4c1c9333569e86e5/docs/images/TFIDF/Topic6_wordcloud_TFIDF_2020_02_07_06_24_55.png -------------------------------------------------------------------------------- /docs/images/TFIDF/Topic7_wordcloud_TFIDF_2020_02_07_06_24_55.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stveshawn/contextual_topic_identification/4129beb8e3e24af9870b7aeb4c1c9333569e86e5/docs/images/TFIDF/Topic7_wordcloud_TFIDF_2020_02_07_06_24_55.png -------------------------------------------------------------------------------- /docs/images/TFIDF/Topic8_wordcloud_TFIDF_2020_02_07_06_24_55.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stveshawn/contextual_topic_identification/4129beb8e3e24af9870b7aeb4c1c9333569e86e5/docs/images/TFIDF/Topic8_wordcloud_TFIDF_2020_02_07_06_24_55.png -------------------------------------------------------------------------------- /docs/images/TFIDF/Topic9_wordcloud_TFIDF_2020_02_07_06_24_55.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stveshawn/contextual_topic_identification/4129beb8e3e24af9870b7aeb4c1c9333569e86e5/docs/images/TFIDF/Topic9_wordcloud_TFIDF_2020_02_07_06_24_55.png 
-------------------------------------------------------------------------------- /docs/images/bert.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stveshawn/contextual_topic_identification/4129beb8e3e24af9870b7aeb4c1c9333569e86e5/docs/images/bert.png -------------------------------------------------------------------------------- /docs/images/lda_bert.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stveshawn/contextual_topic_identification/4129beb8e3e24af9870b7aeb4c1c9333569e86e5/docs/images/lda_bert.png -------------------------------------------------------------------------------- /docs/images/model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stveshawn/contextual_topic_identification/4129beb8e3e24af9870b7aeb4c1c9333569e86e5/docs/images/model.png -------------------------------------------------------------------------------- /docs/images/steam_review.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stveshawn/contextual_topic_identification/4129beb8e3e24af9870b7aeb4c1c9333569e86e5/docs/images/steam_review.jpeg -------------------------------------------------------------------------------- /docs/images/tfidf.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stveshawn/contextual_topic_identification/4129beb8e3e24af9870b7aeb4c1c9333569e86e5/docs/images/tfidf.png -------------------------------------------------------------------------------- /docs/saved_models/README.md: -------------------------------------------------------------------------------- 1 | Folder for saved model objects after training. 
2 | -------------------------------------------------------------------------------- /model/Autoencoder.py: -------------------------------------------------------------------------------- 1 | import keras 2 | from keras.layers import Input, Dense 3 | from keras.models import Model 4 | from sklearn.model_selection import train_test_split 5 | import warnings 6 | warnings.filterwarnings('ignore') 7 | 8 | 9 | class Autoencoder: 10 | """ 11 | Autoencoder for learning latent space representation 12 | architecture simplified to a single hidden layer 13 | """ 14 | 15 | def __init__(self, latent_dim=32, activation='relu', epochs=200, batch_size=128): 16 | self.latent_dim = latent_dim 17 | self.activation = activation 18 | self.epochs = epochs 19 | self.batch_size = batch_size 20 | self.autoencoder = None 21 | self.encoder = None 22 | self.decoder = None 23 | self.his = None 24 | 25 | def _compile(self, input_dim): 26 | """ 27 | compile the computational graph 28 | """ 29 | input_vec = Input(shape=(input_dim,)) 30 | encoded = Dense(self.latent_dim, activation=self.activation)(input_vec) 31 | decoded = Dense(input_dim, activation=self.activation)(encoded) 32 | self.autoencoder = Model(input_vec, decoded) 33 | self.encoder = Model(input_vec, encoded) 34 | encoded_input = Input(shape=(self.latent_dim,)) 35 | decoder_layer = self.autoencoder.layers[-1]  # reuse the output layer as the decoder 36 | self.decoder = Model(encoded_input, decoder_layer(encoded_input)) 37 | self.autoencoder.compile(optimizer='adam', loss=keras.losses.mean_squared_error) 38 | 39 | def fit(self, X): 40 | if not self.autoencoder: 41 | self._compile(X.shape[1]) 42 | X_train, X_test = train_test_split(X) 43 | self.his = self.autoencoder.fit(X_train, X_train, 44 | epochs=self.epochs, 45 | batch_size=self.batch_size, 46 | shuffle=True, 47 | validation_data=(X_test, X_test), verbose=0) -------------------------------------------------------------------------------- /model/README.md: -------------------------------------------------------------------------------- 1 | ## Model scripts 2 | 3 | All Python scripts go here.
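As an illustration of how the pieces fit together, here is a small sketch (not part of the repo) that exercises the `Autoencoder` above on random vectors shaped like the concatenated LDA_BERT input (10 LDA topic probabilities scaled by gamma plus a 768-dimensional BERT sentence embedding); the exact dimensions are only an assumption for the example:

```python
import numpy as np
from Autoencoder import Autoencoder  # run from inside the model/ directory

# 500 fake documents, each 10 (LDA) + 768 (BERT) = 778 dimensions
X = np.random.rand(500, 778)

ae = Autoencoder(latent_dim=32)
ae.fit(X)                  # trains on an internal train/validation split of X
Z = ae.encoder.predict(X)  # latent representation later fed to KMeans
print(Z.shape)             # (500, 32)
```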
4 | -------------------------------------------------------------------------------- /model/main.py: -------------------------------------------------------------------------------- 1 | from model import * 2 | from utils import * 3 | import pandas as pd 4 | import pickle 5 | import matplotlib.pyplot as plt 6 | 7 | import warnings 8 | warnings.filterwarnings('ignore', category=Warning) 9 | 10 | import argparse 11 | 12 | if __name__ == '__main__': 13 | 14 | parser = argparse.ArgumentParser() 15 | parser.add_argument('--fpath', default='/contextual_topic_identification/data/steam_reviews.csv') 16 | parser.add_argument('--ntopic', default=10) 17 | parser.add_argument('--method', default='TFIDF') 18 | parser.add_argument('--samp_size', default=10000) 19 | args = parser.parse_args() 20 | 21 | data = pd.read_csv(str(args.fpath)) 22 | data = data.fillna('') # only the comments has NaN's 23 | rws = data.review 24 | sentences, token_lists, idx_in = preprocess(rws, samp_size=int(args.samp_size)) 25 | # Define the topic model object 26 | tm = Topic_Model(k = int(args.ntopic), method = str(args.method)) 27 | # Fit the topic model by chosen method 28 | tm.fit(sentences, token_lists) 29 | # Evaluate using metrics 30 | with open("/contextual_topic_identification/docs/saved_models/{}.file".format(tm.id), "wb") as f: 31 | pickle.dump(tm, f, pickle.HIGHEST_PROTOCOL) 32 | 33 | print('Coherence:', get_coherence(tm, token_lists, 'c_v')) 34 | print('Silhouette Score:', get_silhouette(tm)) 35 | # visualize and save img 36 | visualize(tm) 37 | for i in range(tm.k): 38 | get_wordcloud(tm, token_lists, i) 39 | -------------------------------------------------------------------------------- /model/model.py: -------------------------------------------------------------------------------- 1 | from sklearn.feature_extraction.text import TfidfVectorizer 2 | from sklearn.cluster import KMeans 3 | from gensim import corpora 4 | import gensim 5 | import numpy as np 6 | from Autoencoder import * 7 | from preprocess import * 8 | from datetime import datetime 9 | 10 | 11 | def preprocess(docs, samp_size=None): 12 | """ 13 | Preprocess the data 14 | """ 15 | if not samp_size: 16 | samp_size = 100 17 | 18 | print('Preprocessing raw texts ...') 19 | n_docs = len(docs) 20 | sentences = [] # sentence level preprocessed 21 | token_lists = [] # word level preprocessed 22 | idx_in = [] # index of sample selected 23 | # samp = list(range(100)) 24 | samp = np.random.choice(n_docs, samp_size) 25 | for i, idx in enumerate(samp): 26 | sentence = preprocess_sent(docs[idx]) 27 | token_list = preprocess_word(sentence) 28 | if token_list: 29 | idx_in.append(idx) 30 | sentences.append(sentence) 31 | token_lists.append(token_list) 32 | print('{} %'.format(str(np.round((i + 1) / len(samp) * 100, 2))), end='\r') 33 | print('Preprocessing raw texts. 
Done!') 34 | return sentences, token_lists, idx_in 35 | 36 | 37 | # define model object 38 | class Topic_Model: 39 | def __init__(self, k=10, method='TFIDF'): 40 | """ 41 | :param k: number of topics 42 | :param method: method chosen for the topic model 43 | """ 44 | if method not in {'TFIDF', 'LDA', 'BERT', 'LDA_BERT'}: 45 | raise Exception('Invalid method!') 46 | self.k = k 47 | self.dictionary = None 48 | self.corpus = None 49 | # self.stopwords = None 50 | self.cluster_model = None 51 | self.ldamodel = None 52 | self.vec = {} 53 | self.gamma = 15 # parameter for reletive importance of lda 54 | self.method = method 55 | self.AE = None 56 | self.id = method + '_' + datetime.now().strftime("%Y_%m_%d_%H_%M_%S") 57 | 58 | def vectorize(self, sentences, token_lists, method=None): 59 | """ 60 | Get vecotr representations from selected methods 61 | """ 62 | # Default method 63 | if method is None: 64 | method = self.method 65 | 66 | # turn tokenized documents into a id <-> term dictionary 67 | self.dictionary = corpora.Dictionary(token_lists) 68 | # convert tokenized documents into a document-term matrix 69 | self.corpus = [self.dictionary.doc2bow(text) for text in token_lists] 70 | 71 | if method == 'TFIDF': 72 | print('Getting vector representations for TF-IDF ...') 73 | tfidf = TfidfVectorizer() 74 | vec = tfidf.fit_transform(sentences) 75 | print('Getting vector representations for TF-IDF. Done!') 76 | return vec 77 | 78 | elif method == 'LDA': 79 | print('Getting vector representations for LDA ...') 80 | if not self.ldamodel: 81 | self.ldamodel = gensim.models.ldamodel.LdaModel(self.corpus, num_topics=self.k, id2word=self.dictionary, 82 | passes=20) 83 | 84 | def get_vec_lda(model, corpus, k): 85 | """ 86 | Get the LDA vector representation (probabilistic topic assignments for all documents) 87 | :return: vec_lda with dimension: (n_doc * n_topic) 88 | """ 89 | n_doc = len(corpus) 90 | vec_lda = np.zeros((n_doc, k)) 91 | for i in range(n_doc): 92 | # get the distribution for the i-th document in corpus 93 | for topic, prob in model.get_document_topics(corpus[i]): 94 | vec_lda[i, topic] = prob 95 | 96 | return vec_lda 97 | 98 | vec = get_vec_lda(self.ldamodel, self.corpus, self.k) 99 | print('Getting vector representations for LDA. Done!') 100 | return vec 101 | 102 | elif method == 'BERT': 103 | 104 | print('Getting vector representations for BERT ...') 105 | from sentence_transformers import SentenceTransformer 106 | model = SentenceTransformer('bert-base-nli-max-tokens') 107 | vec = np.array(model.encode(sentences, show_progress_bar=True)) 108 | print('Getting vector representations for BERT. 
Done!') 109 | return vec 110 | 111 | # elif method == 'LDA_BERT': 112 | else: 113 | vec_lda = self.vectorize(sentences, token_lists, method='LDA') 114 | vec_bert = self.vectorize(sentences, token_lists, method='BERT') 115 | vec_ldabert = np.c_[vec_lda * self.gamma, vec_bert] 116 | self.vec['LDA_BERT_FULL'] = vec_ldabert 117 | if not self.AE: 118 | self.AE = Autoencoder() 119 | print('Fitting Autoencoder ...') 120 | self.AE.fit(vec_ldabert) 121 | print('Fitting Autoencoder Done!') 122 | vec = self.AE.encoder.predict(vec_ldabert) 123 | return vec 124 | 125 | def fit(self, sentences, token_lists, method=None, m_clustering=None): 126 | """ 127 | Fit the topic model for selected method given the preprocessed data 128 | :docs: list of documents, each doc is preprocessed as tokens 129 | :return: 130 | """ 131 | # Default method 132 | if method is None: 133 | method = self.method 134 | # Default clustering method 135 | if m_clustering is None: 136 | m_clustering = KMeans 137 | 138 | # turn tokenized documents into a id <-> term dictionary 139 | if not self.dictionary: 140 | self.dictionary = corpora.Dictionary(token_lists) 141 | # convert tokenized documents into a document-term matrix 142 | self.corpus = [self.dictionary.doc2bow(text) for text in token_lists] 143 | 144 | #################################################### 145 | #### Getting ldamodel or vector representations #### 146 | #################################################### 147 | 148 | if method == 'LDA': 149 | if not self.ldamodel: 150 | print('Fitting LDA ...') 151 | self.ldamodel = gensim.models.ldamodel.LdaModel(self.corpus, num_topics=self.k, id2word=self.dictionary, 152 | passes=20) 153 | print('Fitting LDA Done!') 154 | else: 155 | print('Clustering embeddings ...') 156 | self.cluster_model = m_clustering(self.k) 157 | self.vec[method] = self.vectorize(sentences, token_lists, method) 158 | self.cluster_model.fit(self.vec[method]) 159 | print('Clustering embeddings. 
Done!') 160 | 161 | def predict(self, sentences, token_lists, out_of_sample=None): 162 | """ 163 | Predict topics for new_documents 164 | """ 165 | # Default as False 166 | out_of_sample = out_of_sample is not None 167 | 168 | if out_of_sample: 169 | corpus = [self.dictionary.doc2bow(text) for text in token_lists] 170 | if self.method != 'LDA': 171 | vec = self.vectorize(sentences, token_lists) 172 | print(vec) 173 | else: 174 | corpus = self.corpus 175 | vec = self.vec.get(self.method, None) 176 | 177 | if self.method == "LDA": 178 | lbs = np.array(list(map(lambda x: sorted(self.ldamodel.get_document_topics(x), 179 | key=lambda x: x[1], reverse=True)[0][0], 180 | corpus))) 181 | else: 182 | lbs = self.cluster_model.predict(vec) 183 | return lbs 184 | -------------------------------------------------------------------------------- /model/preprocess.py: -------------------------------------------------------------------------------- 1 | from stop_words import get_stop_words 2 | from nltk.stem.porter import PorterStemmer 3 | import re 4 | import nltk 5 | from nltk.tokenize import word_tokenize 6 | from language_detector import detect_language 7 | 8 | import pkg_resources 9 | from symspellpy import SymSpell, Verbosity 10 | 11 | sym_spell = SymSpell(max_dictionary_edit_distance=3, prefix_length=7) 12 | dictionary_path = pkg_resources.resource_filename( 13 | "symspellpy", "frequency_dictionary_en_82_765.txt") 14 | if sym_spell.word_count: 15 | pass 16 | else: 17 | sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1) 18 | 19 | 20 | ################################### 21 | #### sentence level preprocess #### 22 | ################################### 23 | 24 | # lowercase + base filter 25 | # some basic normalization 26 | def f_base(s): 27 | """ 28 | :param s: string to be processed 29 | :return: processed string: see comments in the source code for more info 30 | """ 31 | # normalization 1: xxxThis is a --> xxx. This is a (missing delimiter) 32 | s = re.sub(r'([a-z])([A-Z])', r'\1\. \2', s) # before lower case 33 | # normalization 2: lower case 34 | s = s.lower() 35 | # normalization 3: ">", "<" 36 | s = re.sub(r'>|<', ' ', s) 37 | # normalization 4: letter repetition (if more than 2) 38 | s = re.sub(r'([a-z])\1{2,}', r'\1', s) 39 | # normalization 5: non-word repetition (if more than 1) 40 | s = re.sub(r'([\W+])\1{1,}', r'\1', s) 41 | # normalization 6: string * as delimiter 42 | s = re.sub(r'\*|\W\*|\*\W', '. ', s) 43 | # normalization 7: stuff in parenthesis, assumed to be less informal 44 | s = re.sub(r'\(.*?\)', '. ', s) 45 | # normalization 8: xxx[?!]. -- > xxx. 46 | s = re.sub(r'\W+?\.', '.', s) 47 | # normalization 9: [.?!] --> [.?!] 
xxx 48 | s = re.sub(r'(\.|\?|!)(\w)', r'\1 \2', s) 49 | # normalization 10: ' ing ', noise text 50 | s = re.sub(r' ing ', ' ', s) 51 | # normalization 11: noise text 52 | s = re.sub(r'product received for free[.| ]', ' ', s) 53 | # normalization 12: phrase repetition 54 | s = re.sub(r'(.{2,}?)\1{1,}', r'\1', s) 55 | 56 | return s.strip() 57 | 58 | 59 | # language detection 60 | def f_lan(s): 61 | """ 62 | :param s: string to be processed 63 | :return: boolean (s is English) 64 | """ 65 | 66 | # some reviews are actually english but biased toward french 67 | return detect_language(s) in {'English', 'French'} 68 | 69 | 70 | ############################### 71 | #### word level preprocess #### 72 | ############################### 73 | 74 | # filtering out punctuations and numbers 75 | def f_punct(w_list): 76 | """ 77 | :param w_list: word list to be processed 78 | :return: w_list with punct and number filter out 79 | """ 80 | return [word for word in w_list if word.isalpha()] 81 | 82 | 83 | # selecting nouns 84 | def f_noun(w_list): 85 | """ 86 | :param w_list: word list to be processed 87 | :return: w_list with only nouns selected 88 | """ 89 | return [word for (word, pos) in nltk.pos_tag(w_list) if pos[:2] == 'NN'] 90 | 91 | 92 | # typo correction 93 | def f_typo(w_list): 94 | """ 95 | :param w_list: word list to be processed 96 | :return: w_list with typo fixed by symspell. words with no match up will be dropped 97 | """ 98 | w_list_fixed = [] 99 | for word in w_list: 100 | suggestions = sym_spell.lookup(word, Verbosity.CLOSEST, max_edit_distance=3) 101 | if suggestions: 102 | w_list_fixed.append(suggestions[0].term) 103 | else: 104 | pass 105 | # do word segmentation, deprecated for inefficiency 106 | # w_seg = sym_spell.word_segmentation(phrase=word) 107 | # w_list_fixed.extend(w_seg.corrected_string.split()) 108 | return w_list_fixed 109 | 110 | 111 | # stemming if doing word-wise 112 | p_stemmer = PorterStemmer() 113 | 114 | 115 | def f_stem(w_list): 116 | """ 117 | :param w_list: word list to be processed 118 | :return: w_list with stemming 119 | """ 120 | return [p_stemmer.stem(word) for word in w_list] 121 | 122 | 123 | # filtering out stop words 124 | # create English stop words list 125 | en_stop = get_stop_words('en') 126 | en_stop.append('game') 127 | en_stop.append('play') 128 | en_stop.append('player') 129 | en_stop.append('time') 130 | 131 | 132 | def f_stopw(w_list): 133 | """ 134 | filtering out stop words 135 | """ 136 | return [word for word in w_list if word not in en_stop] 137 | 138 | 139 | def preprocess_sent(rw): 140 | """ 141 | Get sentence level preprocessed data from raw review texts 142 | :param rw: review to be processed 143 | :return: sentence level pre-processed review 144 | """ 145 | s = f_base(rw) 146 | if not f_lan(s): 147 | return None 148 | return s 149 | 150 | 151 | def preprocess_word(s): 152 | """ 153 | Get word level preprocessed data from preprocessed sentences 154 | including: remove punctuation, select noun, fix typo, stem, stop_words 155 | :param s: sentence to be processed 156 | :return: word level pre-processed review 157 | """ 158 | if not s: 159 | return None 160 | w_list = word_tokenize(s) 161 | w_list = f_punct(w_list) 162 | w_list = f_noun(w_list) 163 | w_list = f_typo(w_list) 164 | w_list = f_stem(w_list) 165 | w_list = f_stopw(w_list) 166 | 167 | return w_list -------------------------------------------------------------------------------- /model/utils.py: -------------------------------------------------------------------------------- 1 | 
from collections import Counter 2 | from sklearn.metrics import silhouette_score 3 | import umap 4 | import matplotlib.pyplot as plt 5 | from wordcloud import WordCloud 6 | from gensim.models.coherencemodel import CoherenceModel 7 | import numpy as np 8 | import os 9 | 10 | 11 | def get_topic_words(token_lists, labels, k=None): 12 | """ 13 | get top words within each topic from clustering results 14 | """ 15 | if k is None: 16 | k = len(np.unique(labels)) 17 | topics = ['' for _ in range(k)] 18 | for i, c in enumerate(token_lists): 19 | topics[labels[i]] += (' ' + ' '.join(c)) 20 | word_counts = list(map(lambda x: Counter(x.split()).items(), topics)) 21 | # get sorted word counts 22 | word_counts = list(map(lambda x: sorted(x, key=lambda x: x[1], reverse=True), word_counts)) 23 | # get topics 24 | topics = list(map(lambda x: list(map(lambda x: x[0], x[:10])), word_counts)) 25 | 26 | return topics 27 | 28 | def get_coherence(model, token_lists, measure='c_v'): 29 | """ 30 | Get model coherence from gensim.models.coherencemodel 31 | :param model: Topic_Model object 32 | :param token_lists: token lists of docs 33 | :param topics: topics as top words 34 | :param measure: coherence metrics 35 | :return: coherence score 36 | """ 37 | if model.method == 'LDA': 38 | cm = CoherenceModel(model=model.ldamodel, texts=token_lists, corpus=model.corpus, dictionary=model.dictionary, 39 | coherence=measure) 40 | else: 41 | topics = get_topic_words(token_lists, model.cluster_model.labels_) 42 | cm = CoherenceModel(topics=topics, texts=token_lists, corpus=model.corpus, dictionary=model.dictionary, 43 | coherence=measure) 44 | return cm.get_coherence() 45 | 46 | def get_silhouette(model): 47 | """ 48 | Get silhouette score from model 49 | :param model: Topic_Model object 50 | :return: silhouette score 51 | """ 52 | if model.method == 'LDA': 53 | return 54 | lbs = model.cluster_model.labels_ 55 | vec = model.vec[model.method] 56 | return silhouette_score(vec, lbs) 57 | 58 | def plot_proj(embedding, lbs): 59 | """ 60 | Plot UMAP embeddings 61 | :param embedding: UMAP (or other) embeddings 62 | :param lbs: labels 63 | """ 64 | n = len(embedding) 65 | counter = Counter(lbs) 66 | for i in range(len(np.unique(lbs))): 67 | plt.plot(embedding[:, 0][lbs == i], embedding[:, 1][lbs == i], '.', alpha=0.5, 68 | label='cluster {}: {:.2f}%'.format(i, counter[i] / n * 100)) 69 | plt.legend() 70 | 71 | 72 | def visualize(model): 73 | """ 74 | Visualize the result for the topic model by 2D embedding (UMAP) 75 | :param model: Topic_Model object 76 | """ 77 | if model.method == 'LDA': 78 | return 79 | reducer = umap.UMAP() 80 | print('Calculating UMAP projection ...') 81 | vec_umap = reducer.fit_transform(model.vec[model.method]) 82 | print('Calculating UMAP projection. 
Done!') 83 | plot_proj(vec_umap, model.cluster_model.labels_) 84 | dr = '/contextual_topic_identification/docs/images/{}/{}'.format(model.method, model.id) 85 | if not os.path.exists(dr): 86 | os.makedirs(dr) 87 | plt.savefig(dr + '/2D_vis') 88 | 89 | def get_wordcloud(model, token_lists, topic): 90 | """ 91 | Get word cloud of each topic from fitted model 92 | :param model: Topic_Model object 93 | :param sentences: preprocessed sentences from docs 94 | """ 95 | if model.method == 'LDA': 96 | return 97 | print('Getting wordcloud for topic {} ...'.format(topic)) 98 | lbs = model.cluster_model.labels_ 99 | tokens = ' '.join([' '.join(_) for _ in np.array(token_lists)[lbs == topic]]) 100 | 101 | wordcloud = WordCloud(width=800, height=560, 102 | background_color='white', collocations=False, 103 | min_font_size=10).generate(tokens) 104 | 105 | # plot the WordCloud image 106 | plt.figure(figsize=(8, 5.6), facecolor=None) 107 | plt.imshow(wordcloud) 108 | plt.axis("off") 109 | plt.tight_layout(pad=0) 110 | dr = '/contextual_topic_identification/docs/images/{}/{}'.format(model.method, model.id) 111 | if not os.path.exists(dr): 112 | os.makedirs(dr) 113 | plt.savefig(dr + '/Topic' + str(topic) + '_wordcloud') 114 | print('Getting wordcloud for topic {}. Done!'.format(topic)) 115 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | matplotlib 2 | numpy>=1.18.1 3 | pandas 4 | stop_words 5 | language_detector 6 | sklearn 7 | symspellpy==6.5.2 8 | gensim==3.8.1 9 | wordcloud==1.6.0 10 | tensorflow==1.14.0 11 | keras==2.3.1 12 | sentence-transformers==0.2.5 13 | umap-learn==0.3.10 14 | nltk==3.4.5 15 | -------------------------------------------------------------------------------- /test.sh: -------------------------------------------------------------------------------- 1 | docker build . -t tm_test:1.0 2 | docker run --rm -v $(pwd):/contextual_topic_identification tm_test:1.0 --samp_size=1000 --method=TFIDF --ntopic=10 --fpath=/contextual_topic_identification/data/test.csv 3 | # docker run --rm -v $(pwd):/contextual_topic_identification tm_test:1.0 --samp_size=1000 --method=LDA --ntopic=10 --fpath=/contextual_topic_identification/data/test.csv 4 | # docker run --rm -v $(pwd):/contextual_topic_identification tm_test:1.0 --samp_size=1000 --# method=BERT --ntopic=10 --fpath=/contextual_topic_identification/data/test.csv 5 | # docker run --rm -v $(pwd):/contextual_topic_identification tm_test:1.0 --samp_size=1000 --method=LDA_BERT --ntopic=10 --fpath=/contextual_topic_identification/data/test.csv 6 | --------------------------------------------------------------------------------