├── .gitignore ├── LICENSE ├── README.md ├── REQUIREMENTS.txt ├── credentials.py ├── data └── pycon_dict.pkl.zip ├── images ├── blei_news.png ├── gensim.png ├── inference.jpg ├── lda_dim_red.png ├── pipeline.png └── word_avg.png ├── ipynb_with_output ├── credentials.py ├── data │ └── pycon_dict.pkl.zip ├── images │ ├── blei_news.png │ ├── gensim.png │ ├── inference.jpg │ ├── lda_dim_red.png │ ├── pipeline.png │ └── word_avg.png └── pycon_twitter_with_output.ipynb ├── pycon_fr_slides.pdf └── pycon_twitter.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | 56 | # Flask stuff: 57 | instance/ 58 | .webassets-cache 59 | 60 | # Scrapy stuff: 61 | .scrapy 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | 69 | # IPython Notebook 70 | .ipynb_checkpoints 71 | 72 | # pyenv 73 | .python-version 74 | 75 | # celery beat schedule file 76 | celerybeat-schedule 77 | 78 | # dotenv 79 | .env 80 | 81 | # virtualenv 82 | venv/ 83 | ENV/ 84 | 85 | # Spyder project settings 86 | .spyderproject 87 | 88 | # Rope project settings 89 | .ropeproject 90 | 91 | # Pickle files 92 | *.pkl 93 | 94 | # Don't need your credentials 95 | credentials.py -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2016 Devashish Deshpande 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # twitter-user-classification 2 | Twitter user classification tutorial at PyCon France 2016 3 | 4 | __Setting up__: 5 | - Clone this repo using 6 | 7 | `git clone https://github.com/dsquareindia/twitter-user-classification.git` 8 | 9 | - Install `virtualenv` using 10 | 11 | `pip install virtualenv` 12 | 13 | - Install all requirements in the virtualenv 14 | 15 | ``` 16 | cd twitter-user-classification 17 | virtualenv gensim 18 | source gensim/bin/activate 19 | pip install -r REQUIREMENTS.txt 20 | ``` 21 | 22 | - Download pre-trained Stanford GloVe vectors from [here](http://nlp.stanford.edu/data/glove.twitter.27B.zip) 23 | - Download the nltk stopwords data using 24 | `python -m nltk.downloader stopwords` 25 | - Unzip `pycon_dict.pkl.zip` to extract `pycon_dict.pkl`, which is our toy dataset. 26 | - Twitter app authentication is only needed for the final function, which does real-time classification 27 | -------------------------------------------------------------------------------- /REQUIREMENTS.txt: -------------------------------------------------------------------------------- 1 | Cython==0.24.1 2 | numpy==1.11.1 3 | scipy==0.18.1 4 | matplotlib==1.5.3 5 | python-twitter==3.1 6 | scikit-learn==0.18 7 | nltk==3.4.5 8 | pattern 9 | gensim==0.13.2 -------------------------------------------------------------------------------- /credentials.py: -------------------------------------------------------------------------------- 1 | creds_dict = { 2 | 'consumer_key': 'ENTER CONSUMER KEY', 3 | 'consumer_secret': 'ENTER CONSUMER SECRET KEY', 4 | 'access_token_key': 'ENTER ACCESS TOKEN KEY', 5 | 'access_token_secret': 'ENTER SECRET ACCESS TOKEN KEY' 6 | } 7 | -------------------------------------------------------------------------------- /data/pycon_dict.pkl.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devashishd12/twitter-user-classification/788087e23b73715c5fd9aad72d79f9c10d6e5ea1/data/pycon_dict.pkl.zip -------------------------------------------------------------------------------- /images/blei_news.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devashishd12/twitter-user-classification/788087e23b73715c5fd9aad72d79f9c10d6e5ea1/images/blei_news.png -------------------------------------------------------------------------------- /images/gensim.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devashishd12/twitter-user-classification/788087e23b73715c5fd9aad72d79f9c10d6e5ea1/images/gensim.png -------------------------------------------------------------------------------- /images/inference.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devashishd12/twitter-user-classification/788087e23b73715c5fd9aad72d79f9c10d6e5ea1/images/inference.jpg -------------------------------------------------------------------------------- /images/lda_dim_red.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devashishd12/twitter-user-classification/788087e23b73715c5fd9aad72d79f9c10d6e5ea1/images/lda_dim_red.png -------------------------------------------------------------------------------- /images/pipeline.png:
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/devashishd12/twitter-user-classification/788087e23b73715c5fd9aad72d79f9c10d6e5ea1/images/pipeline.png -------------------------------------------------------------------------------- /images/word_avg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devashishd12/twitter-user-classification/788087e23b73715c5fd9aad72d79f9c10d6e5ea1/images/word_avg.png -------------------------------------------------------------------------------- /ipynb_with_output/credentials.py: -------------------------------------------------------------------------------- 1 | creds_dict = { 2 | 'consumer_key': 'ENTER CONSUMER KEY', 3 | 'consumer_secret': 'ENTER CONSUMER SECRET KEY', 4 | 'access_token_key': 'ENTER ACCESS TOKEN KEY', 5 | 'access_token_secret': 'ENTER SECRET ACCESS TOKEN KEY' 6 | } 7 | -------------------------------------------------------------------------------- /ipynb_with_output/data/pycon_dict.pkl.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devashishd12/twitter-user-classification/788087e23b73715c5fd9aad72d79f9c10d6e5ea1/ipynb_with_output/data/pycon_dict.pkl.zip -------------------------------------------------------------------------------- /ipynb_with_output/images/blei_news.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devashishd12/twitter-user-classification/788087e23b73715c5fd9aad72d79f9c10d6e5ea1/ipynb_with_output/images/blei_news.png -------------------------------------------------------------------------------- /ipynb_with_output/images/gensim.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devashishd12/twitter-user-classification/788087e23b73715c5fd9aad72d79f9c10d6e5ea1/ipynb_with_output/images/gensim.png -------------------------------------------------------------------------------- /ipynb_with_output/images/inference.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devashishd12/twitter-user-classification/788087e23b73715c5fd9aad72d79f9c10d6e5ea1/ipynb_with_output/images/inference.jpg -------------------------------------------------------------------------------- /ipynb_with_output/images/lda_dim_red.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devashishd12/twitter-user-classification/788087e23b73715c5fd9aad72d79f9c10d6e5ea1/ipynb_with_output/images/lda_dim_red.png -------------------------------------------------------------------------------- /ipynb_with_output/images/pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devashishd12/twitter-user-classification/788087e23b73715c5fd9aad72d79f9c10d6e5ea1/ipynb_with_output/images/pipeline.png -------------------------------------------------------------------------------- /ipynb_with_output/images/word_avg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devashishd12/twitter-user-classification/788087e23b73715c5fd9aad72d79f9c10d6e5ea1/ipynb_with_output/images/word_avg.png -------------------------------------------------------------------------------- 
/pycon_fr_slides.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devashishd12/twitter-user-classification/788087e23b73715c5fd9aad72d79f9c10d6e5ea1/pycon_fr_slides.pdf -------------------------------------------------------------------------------- /pycon_twitter.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Twitter user classification\n", 8 | "# Topic modeling and GloVe vectors in gensim\n", 9 | "\n", 10 | "\n", 11 | "We will be developing a toy machine learning pipeline which can classify a twitter user into one of 8 categories based on the user's tweets.\n", 12 | "Following are the concepts we'll be exploring:\n", 13 | "- Bag of words\n", 14 | "- TF-IDF\n", 15 | "- Visualizing our data with topic models.\n", 16 | "- LDA model tuning with topic coherence\n", 17 | "- Dimensionality reduction with topic models\n", 18 | "- GloVe vectors for classification\n", 19 | "\n", 20 | "Through this tutorial, you will become comfortable with the gensim API and also learn how to do topic modeling effectively using gensim. There could be better ways of tuning certain topic models using other techniques however we will be using the newly released topic coherence pipeline implemented in gensim inspired from [this](http://svn.aksw.org/papers/2015/WSDM_Topic_Evaluation/public.pdf) paper by Roeder et al.\n", 21 | "### Requirements\n", 22 | "- [OPTIONAL] [python-twitter](https://python-twitter.readthedocs.io/en/latest/) (only for final function)\n", 23 | "- [Pre-trained stanford GloVe vectors for twitter](http://nlp.stanford.edu/data/glove.twitter.27B.zip)\n", 24 | "- gensim (latest development version preferred)\n", 25 | "- [OPTIONAL] [twitter app authentication](http://iag.me/socialmedia/how-to-create-a-twitter-app-in-8-easy-steps/) and hence a twitter account (only for final function where we'll do real-time classification) \n", 26 | "- scikit-learn\n", 27 | "- pandas\n", 28 | "- numpy\n", 29 | "- nltk with english stopwords\n", 30 | "- At least 4 GB RAM preferred. Can use the smaller GloVe vector files if RAM is an issue as there are multiple GloVe vector files available.\n", 31 | "\n", 32 | "### Dataset\n", 33 | "I've already built a mini-dataset consisting of a `category->handle->tweets` mapping however this is just a toy dataset. A production level twitter classifier will most likely have many more categories with a lot more handles." 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": { 40 | "collapsed": false 41 | }, 42 | "outputs": [], 43 | "source": [ 44 | "try:\n", 45 | " import twitter\n", 46 | "except ImportError:\n", 47 | " print(\"python-twitter not installed. 
Will not be able to do real-time classification\")\n", 48 | "import pickle\n", 49 | "import credentials # You'll have to fill in the credentials in the credentials file here\n", 50 | "\n", 51 | "from pprint import pprint" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": { 58 | "collapsed": false 59 | }, 60 | "outputs": [], 61 | "source": [ 62 | "api = twitter.Api(**credentials.creds_dict)" 63 | ] 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "metadata": {}, 68 | "source": [ 69 | "### Creating and exploring our dataset" 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": {}, 75 | "source": [ 76 | "### Creating the dataset" 77 | ] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "metadata": {}, 82 | "source": [ 83 | "I had created a function to get tweets category-wise to avoid rate limiting problems." 84 | ] 85 | }, 86 | { 87 | "cell_type": "raw", 88 | "metadata": { 89 | "collapsed": false 90 | }, 91 | "source": [ 92 | "def getTweets(category_dict, category):\n", 93 | " \"\"\"\n", 94 | " Function to get the tweets for each handle in the dictionary in the particular category.\n", 95 | " \n", 96 | " Parameters:\n", 97 | " ----------\n", 98 | " category_dict: User category dictionary consisting of categories and user handles.\n", 99 | " category: String. Name of the category.\n", 100 | " \n", 101 | " Returns:\n", 102 | " -------\n", 103 | " category_dict: Dictionary with the most recent 200 tweets of all user handles.\n", 104 | " \"\"\"\n", 105 | " for handle in category_dict[category]:\n", 106 | " category_dict[category][handle] = api.GetUserTimeline(screen_name=handle, count=200)\n", 107 | " return category_dict" 108 | ] 109 | }, 110 | { 111 | "cell_type": "markdown", 112 | "metadata": {}, 113 | "source": [ 114 | "I had added all the handles I could think of in one category and then gathered the tweets. Not the best way to go about it! 
To add new handles to a category, simply use `api.GetUserTimeline(screen_name=handle)` and add that under the relevant category with the handle" 115 | ] 116 | }, 117 | { 118 | "cell_type": "raw", 119 | "metadata": { 120 | "collapsed": false 121 | }, 122 | "source": [ 123 | "pycon_dict = getTweets(pycon_dict, 'Business & CEOs')" 124 | ] 125 | }, 126 | { 127 | "cell_type": "raw", 128 | "metadata": { 129 | "collapsed": true 130 | }, 131 | "source": [ 132 | "pickle.dump(pycon_dict, open('pycon_dict.pkl', 'wb'))" 133 | ] 134 | }, 135 | { 136 | "cell_type": "markdown", 137 | "metadata": {}, 138 | "source": [ 139 | "We can just load the pre-created pickle for now" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": null, 145 | "metadata": { 146 | "collapsed": true 147 | }, 148 | "outputs": [], 149 | "source": [ 150 | "pycon_dict = pickle.load(open(\"data/pycon_dict.pkl\", \"rb\"))" 151 | ] 152 | }, 153 | { 154 | "cell_type": "markdown", 155 | "metadata": {}, 156 | "source": [ 157 | "### Exploring the dataset" 158 | ] 159 | }, 160 | { 161 | "cell_type": "markdown", 162 | "metadata": {}, 163 | "source": [ 164 | "Let's see which are the categories in the dictionary" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": null, 170 | "metadata": { 171 | "collapsed": false 172 | }, 173 | "outputs": [], 174 | "source": [ 175 | "pycon_dict.keys()" 176 | ] 177 | }, 178 | { 179 | "cell_type": "markdown", 180 | "metadata": {}, 181 | "source": [ 182 | "Each category contains a mix of benchmark twitter handles which can be magazines/channels and personalities tweeting about that category" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": null, 188 | "metadata": { 189 | "collapsed": false 190 | }, 191 | "outputs": [], 192 | "source": [ 193 | "pycon_dict['Music'].keys()" 194 | ] 195 | }, 196 | { 197 | "cell_type": "markdown", 198 | "metadata": {}, 199 | "source": [ 200 | "All categories have almost the same number of handles" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": null, 206 | "metadata": { 207 | "collapsed": false, 208 | "scrolled": true 209 | }, 210 | "outputs": [], 211 | "source": [ 212 | "print(['{}-{}'.format(cat, len(pycon_dict[cat].keys())) for cat in pycon_dict.keys()])" 213 | ] 214 | }, 215 | { 216 | "cell_type": "markdown", 217 | "metadata": {}, 218 | "source": [ 219 | "### Time for manual inspection\n", 220 | "This is an important step while dealing with textual data\n", 221 | "\n", 222 | "What do tweets from different categories look like? Can we spot any patterns or signals through manual inspection?" 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": null, 228 | "metadata": { 229 | "collapsed": false 230 | }, 231 | "outputs": [], 232 | "source": [ 233 | "pycon_dict['Business & CEOs']['XinfuCEOs'][:5]" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": null, 239 | "metadata": { 240 | "collapsed": false 241 | }, 242 | "outputs": [], 243 | "source": [ 244 | "pycon_dict['Music']['mtvmusic'][:5]" 245 | ] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": null, 250 | "metadata": { 251 | "collapsed": false 252 | }, 253 | "outputs": [], 254 | "source": [ 255 | "pycon_dict['Science']['NASA'][:5]" 256 | ] 257 | }, 258 | { 259 | "cell_type": "markdown", 260 | "metadata": {}, 261 | "source": [ 262 | "Manual inspection surely reveals some signal. 
We can see a marked difference between how typical tweets in the business, music and science category look like. Given some tweets from a category we should be able to classify accurately. The human mind is a pretty good classifier :)" 263 | ] 264 | }, 265 | { 266 | "cell_type": "markdown", 267 | "metadata": {}, 268 | "source": [ 269 | "### Creating training and test sets" 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": null, 275 | "metadata": { 276 | "collapsed": false 277 | }, 278 | "outputs": [], 279 | "source": [ 280 | "import numpy as np\n", 281 | "import pandas as pd" 282 | ] 283 | }, 284 | { 285 | "cell_type": "markdown", 286 | "metadata": {}, 287 | "source": [ 288 | "We'll be using 90% of the tweets (for each profile) for training and 10% for testing, i.e. a 90-10 split as mentioned in [this](http://snap.stanford.edu/soma2010/papers/soma2010_12.pdf) paper by Davison et al. Cross validation can and should be done however since our dataset is small we'll be skipping it for now." 289 | ] 290 | }, 291 | { 292 | "cell_type": "code", 293 | "execution_count": null, 294 | "metadata": { 295 | "collapsed": true 296 | }, 297 | "outputs": [], 298 | "source": [ 299 | "columns = ['message', 'category']" 300 | ] 301 | }, 302 | { 303 | "cell_type": "markdown", 304 | "metadata": {}, 305 | "source": [ 306 | "A [two-way](http://stackoverflow.com/questions/1456373/two-way-reverse-map) dictionary will help in creating a category mapping. I've already done this and we'll be using that mapping so that all of us have a uniform mapping" 307 | ] 308 | }, 309 | { 310 | "cell_type": "code", 311 | "execution_count": null, 312 | "metadata": { 313 | "collapsed": true 314 | }, 315 | "outputs": [], 316 | "source": [ 317 | "categories_map = {0: u'Business & CEOs',\n", 318 | " 1: u'Music',\n", 319 | " 2: u'Entertainment',\n", 320 | " 3: u'Fashion, Travel & Lifestyle',\n", 321 | " 4: u'Sports',\n", 322 | " 5: u'Tech',\n", 323 | " 6: u'Politics',\n", 324 | " 7: u'Science',\n", 325 | " u'Business & CEOs': 0,\n", 326 | " u'Entertainment': 2,\n", 327 | " u'Fashion, Travel & Lifestyle': 3,\n", 328 | " u'Music': 1,\n", 329 | " u'Politics': 6,\n", 330 | " u'Science': 7,\n", 331 | " u'Sports': 4,\n", 332 | " u'Tech': 5}" 333 | ] 334 | }, 335 | { 336 | "cell_type": "code", 337 | "execution_count": null, 338 | "metadata": { 339 | "collapsed": true 340 | }, 341 | "outputs": [], 342 | "source": [ 343 | "def get_dataframes(pycon_dict):\n", 344 | " \"\"\"\n", 345 | " Function to get train and test dataframes (without any preprocessing).\n", 346 | " \n", 347 | " Parameters:\n", 348 | " ----------\n", 349 | " pycon_dict: The twitter user dictionary being used.\n", 350 | " \n", 351 | " Returns:\n", 352 | " -------\n", 353 | " train, test: Train and test dataframes.\n", 354 | " \"\"\"\n", 355 | " train = pd.DataFrame(columns=columns)\n", 356 | " test = pd.DataFrame(columns=columns)\n", 357 | " \n", 358 | " for category in pycon_dict:\n", 359 | " for entity in pycon_dict[category]:\n", 360 | " train_texts = []\n", 361 | " test_texts = []\n", 362 | " num_texts = len(pycon_dict[category][entity]) # To get number of tweets\n", 363 | " train_indices = np.random.choice(num_texts, int(0.9 * num_texts), replace=False) # Random selection\n", 364 | " test_indices = [i for i in range(num_texts) if i not in train_indices] # Rest go into test set\n", 365 | " train_texts.extend(pycon_dict[category][entity][i].text for i in train_indices) # Add to train texts\n", 366 | " 
test_texts.extend(pycon_dict[category][entity][i].text for i in test_indices) # Add to test texts\n", 367 | " #### Create train dataframe ####\n", 368 | " train_texts = ' '.join(train_texts)\n", 369 | " df_train = pd.DataFrame([[train_texts, categories_map[category]]], columns=columns)\n", 370 | " train = train.append(df_train, ignore_index=True)\n", 371 | " #### Create test dataframe ####\n", 372 | " test_texts = ' '.join(test_texts)\n", 373 | " df_test = pd.DataFrame([[test_texts, categories_map[category]]], columns=columns)\n", 374 | " test = test.append(df_test, ignore_index=True)\n", 375 | " \n", 376 | " return train, test" 377 | ] 378 | }, 379 | { 380 | "cell_type": "markdown", 381 | "metadata": {}, 382 | "source": [ 383 | "### Preprocessing our data. Remember: Garbage in, garbage out\n", 384 | " NLP is 80% preprocessing\n", 385 | " - Lev Konstantinovskiy\n", 386 | "Preprocessing is probably the single most important step in producing good topic models which give human interpretable topics. Do take some time to study the preprocessor well because we'll be using the same preprocessor for the tasks later on for classification using Bag of words and TF-IDF" 387 | ] 388 | }, 389 | { 390 | "cell_type": "code", 391 | "execution_count": null, 392 | "metadata": { 393 | "collapsed": false 394 | }, 395 | "outputs": [], 396 | "source": [ 397 | "import re\n", 398 | "from nltk.corpus import stopwords # for using english stopwords\n", 399 | "from gensim.models.phrases import Phrases\n", 400 | "from gensim.utils import deaccent, decode_htmlentities, lemmatize" 401 | ] 402 | }, 403 | { 404 | "cell_type": "code", 405 | "execution_count": null, 406 | "metadata": { 407 | "collapsed": true 408 | }, 409 | "outputs": [], 410 | "source": [ 411 | "stops = stopwords.words('english')" 412 | ] 413 | }, 414 | { 415 | "cell_type": "markdown", 416 | "metadata": {}, 417 | "source": [ 418 | "Following are the steps we'll be taking to preprocess a user profile, i.e. a space seperated collection of at most 200 most recent tweets of the profile:\n", 419 | "1. Decode html entities. eg. \"AT`&`amp;T\" will become \"AT&T\"\n", 420 | "2. Deaccent. Eg:\n", 421 | " ```\n", 422 | " >>> deaccent(\"Šéf chomutovských komunistů dostal poštou bílý prášek\")\n", 423 | " u'Sef chomutovskych komunistu dostal postou bily prasek'\n", 424 | " ```\n", 425 | "3. Remove links.\n", 426 | "4. Remove any user mentions (__@name__). This could maybe be skipped as user mentions can also provide signal. However we'll be removing them for this tutorial.\n", 427 | "5. Lemmatize and remove stopwords. Lemmatization is preferred over stemming here because lemmatization retains readability of the words which is important in our case.\n", 428 | "6. We will only pick nouns here since they contribute the most to the signal" 429 | ] 430 | }, 431 | { 432 | "cell_type": "code", 433 | "execution_count": null, 434 | "metadata": { 435 | "collapsed": false 436 | }, 437 | "outputs": [], 438 | "source": [ 439 | "def preprocess_text(tweet):\n", 440 | " \"\"\"\n", 441 | " Function to process an aggregated user profile. This does the following:\n", 442 | " 1. Decode html entities. eg. \"AT&T\" will become \"AT&T\"\n", 443 | " 2. Deaccent\n", 444 | " 3. Remove links.\n", 445 | " 4. Remove any user mentions (@name).\n", 446 | " 5. Lemmatize and remove stopwords.\n", 447 | " \n", 448 | " Parameters:\n", 449 | " ----------\n", 450 | " text : String. 
If train_texts is a list of tweets, ' '.join and pass\n", 451 | " \n", 452 | " Returns:\n", 453 | " -------\n", 454 | " text : preprocessed (tokenized) tweet.\n", 455 | " \"\"\"\n", 456 | " tweet = decode_htmlentities(tweet)\n", 457 | " tweet = deaccent(tweet)\n", 458 | " tweet = tweet.encode('ascii', 'ignore') # To prevent UnicodeDecodeErrors later on\n", 459 | " tweet = re.sub(r'http\\S+', '', tweet) # Step 3\n", 460 | " tweet = re.sub(r'@\\w+', '', tweet) # Step 4\n", 461 | " tweet = tweet.split()\n", 462 | " tweet = lemmatize(' '.join(tweet), re.compile('(NN)'), stopwords=stops, min_length=3, max_length=15)\n", 463 | " tweet = [word.split('/')[0] for word in tweet]\n", 464 | " return tweet" 465 | ] 466 | }, 467 | { 468 | "cell_type": "code", 469 | "execution_count": null, 470 | "metadata": { 471 | "collapsed": false 472 | }, 473 | "outputs": [], 474 | "source": [ 475 | "preprocess_text('Tropical House Lives On At Electric Zoo :) https://t.co/lUpLIpWg3H by @_davidturner_: https://t.co/NI3bq2k8am')" 476 | ] 477 | }, 478 | { 479 | "cell_type": "markdown", 480 | "metadata": { 481 | "collapsed": true 482 | }, 483 | "source": [ 484 | "### Preprocessing our train, test dataframes" 485 | ] 486 | }, 487 | { 488 | "cell_type": "code", 489 | "execution_count": null, 490 | "metadata": { 491 | "collapsed": true 492 | }, 493 | "outputs": [], 494 | "source": [ 495 | "train, test = get_dataframes(pycon_dict)" 496 | ] 497 | }, 498 | { 499 | "cell_type": "markdown", 500 | "metadata": {}, 501 | "source": [ 502 | "Let's see what our function does" 503 | ] 504 | }, 505 | { 506 | "cell_type": "code", 507 | "execution_count": null, 508 | "metadata": { 509 | "collapsed": false 510 | }, 511 | "outputs": [], 512 | "source": [ 513 | "preprocess_text(train['message'][0])[100:120]" 514 | ] 515 | }, 516 | { 517 | "cell_type": "markdown", 518 | "metadata": {}, 519 | "source": [ 520 | "Apply function to all tweets in our training dataframe" 521 | ] 522 | }, 523 | { 524 | "cell_type": "code", 525 | "execution_count": null, 526 | "metadata": { 527 | "collapsed": true 528 | }, 529 | "outputs": [], 530 | "source": [ 531 | "train_texts = train['message'].apply(preprocess_text)" 532 | ] 533 | }, 534 | { 535 | "cell_type": "markdown", 536 | "metadata": {}, 537 | "source": [ 538 | "Bigram collocation can detect words which frequently occur together and know whether they are in fact the same entity" 539 | ] 540 | }, 541 | { 542 | "cell_type": "code", 543 | "execution_count": null, 544 | "metadata": { 545 | "collapsed": true 546 | }, 547 | "outputs": [], 548 | "source": [ 549 | "bigram = Phrases(train_texts) # For collocation detection" 550 | ] 551 | }, 552 | { 553 | "cell_type": "code", 554 | "execution_count": null, 555 | "metadata": { 556 | "collapsed": false 557 | }, 558 | "outputs": [], 559 | "source": [ 560 | "bigram['hillary', 'clinton']" 561 | ] 562 | }, 563 | { 564 | "cell_type": "markdown", 565 | "metadata": {}, 566 | "source": [ 567 | "As we can see, the bigram collocation has figured out that the words 'hillary' and 'clinton' refer to the same entity that is 'Hillary Clinton' the politician." 
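To make the collocation step concrete, here is a minimal, self-contained sketch of how `Phrases` learns such bigrams. The toy sentences and the `min_count`/`threshold` values below are illustrative assumptions, not part of the tutorial's dataset, and exactly which pairs get merged depends on those settings.

```python
from gensim.models.phrases import Phrases

# Toy corpus: each document is a list of already-tokenized words (illustrative only).
toy_texts = [
    ['hillary', 'clinton', 'campaign', 'rally'],
    ['hillary', 'clinton', 'speech', 'policy'],
    ['music', 'album', 'release'],
] * 10  # repeat so the pair clears min_count

toy_bigram = Phrases(toy_texts, min_count=5, threshold=0.1)

# Frequently co-occurring tokens get joined with an underscore when the model is applied.
print(toy_bigram[['hillary', 'clinton', 'rally']])  # likely ['hillary_clinton', 'rally']
```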
568 | ] 569 | }, 570 | { 571 | "cell_type": "code", 572 | "execution_count": null, 573 | "metadata": { 574 | "collapsed": false 575 | }, 576 | "outputs": [], 577 | "source": [ 578 | "train_texts = [bigram[profile] for profile in train_texts]" 579 | ] 580 | }, 581 | { 582 | "cell_type": "markdown", 583 | "metadata": {}, 584 | "source": [ 585 | "You can notice the other bigram collocations below for the sports category" 586 | ] 587 | }, 588 | { 589 | "cell_type": "code", 590 | "execution_count": null, 591 | "metadata": { 592 | "collapsed": false 593 | }, 594 | "outputs": [], 595 | "source": [ 596 | "train_texts[100][:20]" 597 | ] 598 | }, 599 | { 600 | "cell_type": "code", 601 | "execution_count": null, 602 | "metadata": { 603 | "collapsed": false 604 | }, 605 | "outputs": [], 606 | "source": [ 607 | "len(train_texts)" 608 | ] 609 | }, 610 | { 611 | "cell_type": "code", 612 | "execution_count": null, 613 | "metadata": { 614 | "collapsed": false 615 | }, 616 | "outputs": [], 617 | "source": [ 618 | "test_texts = test['message'].apply(preprocess_text)\n", 619 | "test_texts = [bigram[message] for message in test_texts]" 620 | ] 621 | }, 622 | { 623 | "cell_type": "markdown", 624 | "metadata": {}, 625 | "source": [ 626 | "# Set up helper functions for classification evaluation\n", 627 | "It is important to choose an evaluation metric to compare our models. We will choose accuracy here however other metrics such as F-Measure can also be used depending on the use-case of the model." 628 | ] 629 | }, 630 | { 631 | "cell_type": "code", 632 | "execution_count": null, 633 | "metadata": { 634 | "collapsed": true 635 | }, 636 | "outputs": [], 637 | "source": [ 638 | "from sklearn.metrics import accuracy_score, confusion_matrix\n", 639 | "from matplotlib import pyplot as plt\n", 640 | "\n", 641 | "%matplotlib inline" 642 | ] 643 | }, 644 | { 645 | "cell_type": "code", 646 | "execution_count": null, 647 | "metadata": { 648 | "collapsed": false 649 | }, 650 | "outputs": [], 651 | "source": [ 652 | "my_tags = pycon_dict.keys()" 653 | ] 654 | }, 655 | { 656 | "cell_type": "code", 657 | "execution_count": null, 658 | "metadata": { 659 | "collapsed": true 660 | }, 661 | "outputs": [], 662 | "source": [ 663 | "categories = train['category']" 664 | ] 665 | }, 666 | { 667 | "cell_type": "code", 668 | "execution_count": null, 669 | "metadata": { 670 | "collapsed": false 671 | }, 672 | "outputs": [], 673 | "source": [ 674 | "def plot_confusion_matrix(cm, title='Confusion matrix', cmap=plt.cm.Blues):\n", 675 | " plt.imshow(cm, interpolation='nearest', cmap=cmap)\n", 676 | " plt.title(title)\n", 677 | " plt.colorbar()\n", 678 | " tick_marks = np.arange(len(my_tags))\n", 679 | " target_names = my_tags\n", 680 | " plt.xticks(tick_marks, target_names, rotation=90)\n", 681 | " plt.yticks(tick_marks, target_names)\n", 682 | " plt.ylabel('True label')\n", 683 | " plt.xlabel('Predicted label')" 684 | ] 685 | }, 686 | { 687 | "cell_type": "code", 688 | "execution_count": null, 689 | "metadata": { 690 | "collapsed": true 691 | }, 692 | "outputs": [], 693 | "source": [ 694 | "def evaluate_prediction(predictions, target, title=\"Confusion matrix\"):\n", 695 | " print('accuracy %s' % accuracy_score(target, predictions))\n", 696 | " cm = confusion_matrix(target, predictions)\n", 697 | " print('confusion matrix\\n %s' % cm)\n", 698 | " print('(row=expected, col=predicted)')\n", 699 | " \n", 700 | " cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]\n", 701 | " plot_confusion_matrix(cm_normalized, title + ' 
Normalized')" 702 | ] 703 | }, 704 | { 705 | "cell_type": "code", 706 | "execution_count": null, 707 | "metadata": { 708 | "collapsed": true 709 | }, 710 | "outputs": [], 711 | "source": [ 712 | "def most_influential_words(clf, vectorizer, category_index=0, num_words=10):\n", 713 | " features = vectorizer.get_feature_names()\n", 714 | " max_coef = sorted(enumerate(clf.coef_[category_index]), key=lambda x:x[1], reverse=True)\n", 715 | " return [features[x[0]] for x in max_coef[:num_words]] " 716 | ] 717 | }, 718 | { 719 | "cell_type": "markdown", 720 | "metadata": {}, 721 | "source": [ 722 | "# Let's build our first classification model\n", 723 | "__Note__: We will be using `LogisticRegression` throughout this tutorial. I would also encourage you to try out the new `MLPClassifier` released as part of `scikit-learn 0.18`. This uses a neural network for classification and is a very powerful tool for learning complex non-linear functions.\n", 724 | "\n", 725 | "# Bag of words with LogisticRegression\n", 726 | "It sometimes important in any machine learning task to set up a baseline model in as little time as possible. This can help us gauge how much better our ideal model should be. It can also help us find out the areas where we need to work on to make our model better. Would bringing in more data help? Would making our preprocessing better help? Such questions can be answered more accurately. There are many ways to find out the answers to these questions however they lie of of scope for this tutorial\n", 727 | "\n", 728 | "We will be using bag of words first as it is the simplest document feature which is available. It is simply a word occurence matrix across all the words and all the documents." 729 | ] 730 | }, 731 | { 732 | "cell_type": "code", 733 | "execution_count": null, 734 | "metadata": { 735 | "collapsed": false 736 | }, 737 | "outputs": [], 738 | "source": [ 739 | "from sklearn.feature_extraction.text import CountVectorizer\n", 740 | "from sklearn.linear_model import LogisticRegression" 741 | ] 742 | }, 743 | { 744 | "cell_type": "markdown", 745 | "metadata": {}, 746 | "source": [ 747 | "We'll only take the most frequent 5000 features." 
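Before fitting the vectorizer on our tweets, here is a minimal sketch of what the bag-of-words representation itself looks like; the toy documents are illustrative assumptions, not drawn from the dataset.

```python
from sklearn.feature_extraction.text import CountVectorizer

toy_docs = ['nasa launches mars mission', 'new album drops friday', 'nasa mission update']

toy_vec = CountVectorizer()
X = toy_vec.fit_transform(toy_docs)   # sparse (n_documents x n_vocabulary) matrix of raw counts

print(toy_vec.get_feature_names())    # the learned vocabulary, one column per word
print(X.toarray())                    # each row counts how often each word appears in a document
```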
748 | ] 749 | }, 750 | { 751 | "cell_type": "code", 752 | "execution_count": null, 753 | "metadata": { 754 | "collapsed": true 755 | }, 756 | "outputs": [], 757 | "source": [ 758 | "count_vectorizer = CountVectorizer(max_features=5000)" 759 | ] 760 | }, 761 | { 762 | "cell_type": "code", 763 | "execution_count": null, 764 | "metadata": { 765 | "collapsed": true 766 | }, 767 | "outputs": [], 768 | "source": [ 769 | "train_count_features = count_vectorizer.fit_transform(' '.join(text) for text in train_texts)" 770 | ] 771 | }, 772 | { 773 | "cell_type": "code", 774 | "execution_count": null, 775 | "metadata": { 776 | "collapsed": true 777 | }, 778 | "outputs": [], 779 | "source": [ 780 | "clf_count = LogisticRegression()" 781 | ] 782 | }, 783 | { 784 | "cell_type": "code", 785 | "execution_count": null, 786 | "metadata": { 787 | "collapsed": false 788 | }, 789 | "outputs": [], 790 | "source": [ 791 | "clf_count = clf_count.fit(train_count_features, categories)" 792 | ] 793 | }, 794 | { 795 | "cell_type": "code", 796 | "execution_count": null, 797 | "metadata": { 798 | "collapsed": false, 799 | "scrolled": true 800 | }, 801 | "outputs": [], 802 | "source": [ 803 | "test_count_features = count_vectorizer.transform(' '.join(text) for text in test_texts)" 804 | ] 805 | }, 806 | { 807 | "cell_type": "code", 808 | "execution_count": null, 809 | "metadata": { 810 | "collapsed": true 811 | }, 812 | "outputs": [], 813 | "source": [ 814 | "predictions = clf_count.predict(test_count_features)" 815 | ] 816 | }, 817 | { 818 | "cell_type": "code", 819 | "execution_count": null, 820 | "metadata": { 821 | "collapsed": false 822 | }, 823 | "outputs": [], 824 | "source": [ 825 | "evaluate_prediction(predictions, test['category'])" 826 | ] 827 | }, 828 | { 829 | "cell_type": "markdown", 830 | "metadata": {}, 831 | "source": [ 832 | "Greater than 90% accuracy is great for our first model!\n", 833 | "\n", 834 | "__Why did it work__? Twitter benchmark users of any particular category tweet in a very distinct way. As we saw above, users in the \"Music\" category are more likely to use words like \"performance\" or \"album\" more. Similarly users in the \"Politics\" category tend to use words like \"policy\" more. Bag of words can catch this quite brilliantly! We can see evidence of this below." 835 | ] 836 | }, 837 | { 838 | "cell_type": "code", 839 | "execution_count": null, 840 | "metadata": { 841 | "collapsed": false 842 | }, 843 | "outputs": [], 844 | "source": [ 845 | "most_influential_words(clf_count, count_vectorizer, category_index=7) # Top words for science category" 846 | ] 847 | }, 848 | { 849 | "cell_type": "markdown", 850 | "metadata": {}, 851 | "source": [ 852 | "# [TF-IDF](https://en.wikipedia.org/wiki/Tf%E2%80%93idf) with LogisticRegression\n", 853 | "This is a slightly more advanced method than bag of words. Bag of words needs good preprocessing more than TF-IDF as it is a raw count. However TF-IDF can reduce the weight for highly occuring terms such as 'the' by dividing by normalizing it with the number of documents it occurs in." 
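To make the weighting explicit: a term t in document d gets roughly the score tf(t, d) * idf(t), where idf shrinks towards zero for terms that appear in almost every document. The sketch below uses toy documents (illustrative assumptions) and the plain textbook formula; `TfidfVectorizer` additionally applies smoothing and length normalization.

```python
import math

toy_docs = [['the', 'album', 'drops', 'friday'],
            ['the', 'mission', 'launches', 'friday'],
            ['the', 'album', 'tour']]

def tf_idf(term, doc, docs):
    tf = doc.count(term) / float(len(doc))    # how frequent the term is in this document
    df = sum(1 for d in docs if term in d)    # how many documents contain the term
    idf = math.log(len(docs) / float(df))     # rarer terms get a larger weight
    return tf * idf

print(tf_idf('the', toy_docs[0], toy_docs))    # 0.0 -> appears everywhere, weight vanishes
print(tf_idf('album', toy_docs[0], toy_docs))  # > 0 -> a more discriminative term
```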
854 | ] 855 | }, 856 | { 857 | "cell_type": "code", 858 | "execution_count": null, 859 | "metadata": { 860 | "collapsed": false 861 | }, 862 | "outputs": [], 863 | "source": [ 864 | "from sklearn.feature_extraction.text import TfidfVectorizer" 865 | ] 866 | }, 867 | { 868 | "cell_type": "code", 869 | "execution_count": null, 870 | "metadata": { 871 | "collapsed": true 872 | }, 873 | "outputs": [], 874 | "source": [ 875 | "tfidf_vectorizer = TfidfVectorizer(max_features=5000)" 876 | ] 877 | }, 878 | { 879 | "cell_type": "code", 880 | "execution_count": null, 881 | "metadata": { 882 | "collapsed": false 883 | }, 884 | "outputs": [], 885 | "source": [ 886 | "train_tfidf_features = tfidf_vectorizer.fit_transform(' '.join(text) for text in train_texts)" 887 | ] 888 | }, 889 | { 890 | "cell_type": "code", 891 | "execution_count": null, 892 | "metadata": { 893 | "collapsed": true 894 | }, 895 | "outputs": [], 896 | "source": [ 897 | "clf_tfidf = LogisticRegression()" 898 | ] 899 | }, 900 | { 901 | "cell_type": "code", 902 | "execution_count": null, 903 | "metadata": { 904 | "collapsed": true 905 | }, 906 | "outputs": [], 907 | "source": [ 908 | "clf_tfidf = clf_tfidf.fit(train_tfidf_features, categories)" 909 | ] 910 | }, 911 | { 912 | "cell_type": "code", 913 | "execution_count": null, 914 | "metadata": { 915 | "collapsed": false 916 | }, 917 | "outputs": [], 918 | "source": [ 919 | "test_tfidf_features = tfidf_vectorizer.transform(' '.join(text) for text in test_texts)" 920 | ] 921 | }, 922 | { 923 | "cell_type": "code", 924 | "execution_count": null, 925 | "metadata": { 926 | "collapsed": true 927 | }, 928 | "outputs": [], 929 | "source": [ 930 | "predictions = clf_tfidf.predict(test_tfidf_features)" 931 | ] 932 | }, 933 | { 934 | "cell_type": "code", 935 | "execution_count": null, 936 | "metadata": { 937 | "collapsed": false 938 | }, 939 | "outputs": [], 940 | "source": [ 941 | "evaluate_prediction(predictions, test['category'])" 942 | ] 943 | }, 944 | { 945 | "cell_type": "markdown", 946 | "metadata": {}, 947 | "source": [ 948 | "__Exercise__: What are the most influential words from this model in the politics category?" 949 | ] 950 | }, 951 | { 952 | "cell_type": "markdown", 953 | "metadata": {}, 954 | "source": [ 955 | "Even this model gives us a wonderful accuracy!\n", 956 | "\n", 957 | "__However let's try some more methods__. First we will do some topic modeling. After all the package we will be using is called \"Gensim-Topic Modeling for Humans\"! Topic modeling can help find hidden structure in our data and thus is very useful in visualizing our dataset. It can in turn also be used for classification and dimensionality reduction among many of it's other uses." 958 | ] 959 | }, 960 | { 961 | "cell_type": "markdown", 962 | "metadata": {}, 963 | "source": [ 964 | "# Let's do some topic modelling\n", 965 | "Essentially we're trying to find out the topics of the tweets which the user is tweeting about. Since topic models come up with topic distributions, they can probably help us in this task.\n", 966 | "\n", 967 | "Let's see if we can use topic models to find some hidden structure in our data first!\n", 968 | "\n", 969 | "### Setting up our standard gensim corpus\n", 970 | "You can refer to the short tutorial [here](https://radimrehurek.com/gensim/tut1.html) for reference as to how a standard gensim corpus is set. 
" 971 | ] 972 | }, 973 | { 974 | "cell_type": "code", 975 | "execution_count": null, 976 | "metadata": { 977 | "collapsed": false 978 | }, 979 | "outputs": [], 980 | "source": [ 981 | "from gensim.corpora import Dictionary\n", 982 | "from gensim.models import LdaModel" 983 | ] 984 | }, 985 | { 986 | "cell_type": "code", 987 | "execution_count": null, 988 | "metadata": { 989 | "collapsed": true 990 | }, 991 | "outputs": [], 992 | "source": [ 993 | "dictionary = Dictionary(train_texts)" 994 | ] 995 | }, 996 | { 997 | "cell_type": "code", 998 | "execution_count": null, 999 | "metadata": { 1000 | "collapsed": false 1001 | }, 1002 | "outputs": [], 1003 | "source": [ 1004 | "print(dictionary)" 1005 | ] 1006 | }, 1007 | { 1008 | "cell_type": "code", 1009 | "execution_count": null, 1010 | "metadata": { 1011 | "collapsed": true 1012 | }, 1013 | "outputs": [], 1014 | "source": [ 1015 | "corpus = [dictionary.doc2bow(text) for text in train_texts]" 1016 | ] 1017 | }, 1018 | { 1019 | "cell_type": "markdown", 1020 | "metadata": {}, 1021 | "source": [ 1022 | "### [LSI](https://radimrehurek.com/gensim/models/lsimodel.html) (Latent Semantic Indexing)\n", 1023 | "LSI uses Singular Value Decomposition (SVD) for topic modeling. It can rank topics automatically but needs the `num_topics` parameter to be entered for the number of latent dimensions requested. There is currently an open [issue](https://github.com/RaRe-Technologies/gensim/issues/28) in gensim where an automatic number of topics \"chooser\" has been proposed." 1024 | ] 1025 | }, 1026 | { 1027 | "cell_type": "code", 1028 | "execution_count": null, 1029 | "metadata": { 1030 | "collapsed": true 1031 | }, 1032 | "outputs": [], 1033 | "source": [ 1034 | "from gensim.models import LsiModel" 1035 | ] 1036 | }, 1037 | { 1038 | "cell_type": "code", 1039 | "execution_count": null, 1040 | "metadata": { 1041 | "collapsed": true 1042 | }, 1043 | "outputs": [], 1044 | "source": [ 1045 | "lsim = LsiModel(corpus=corpus, num_topics=100, id2word=dictionary)" 1046 | ] 1047 | }, 1048 | { 1049 | "cell_type": "code", 1050 | "execution_count": null, 1051 | "metadata": { 1052 | "collapsed": false 1053 | }, 1054 | "outputs": [], 1055 | "source": [ 1056 | "lsim.show_topics(num_topics=10)" 1057 | ] 1058 | }, 1059 | { 1060 | "cell_type": "code", 1061 | "execution_count": null, 1062 | "metadata": { 1063 | "collapsed": true 1064 | }, 1065 | "outputs": [], 1066 | "source": [ 1067 | "lsimtopics = lsim.show_topics(formatted=False)" 1068 | ] 1069 | }, 1070 | { 1071 | "cell_type": "markdown", 1072 | "metadata": {}, 1073 | "source": [ 1074 | "### [HDP](https://radimrehurek.com/gensim/models/hdpmodel.html) (Hierarchical Dirichlet Process)\n", 1075 | "An HDP model is fully unsupervised. It can also determine the ideal number of topics through posterior inference. This property of HDP is particularly useful when in the \"online\" mode where the number of documents keep streaming and hence the number of topics becomes hard to determine before-hand as in LDA." 
1076 | ] 1077 | }, 1078 | { 1079 | "cell_type": "code", 1080 | "execution_count": null, 1081 | "metadata": { 1082 | "collapsed": true 1083 | }, 1084 | "outputs": [], 1085 | "source": [ 1086 | "from gensim.models import HdpModel" 1087 | ] 1088 | }, 1089 | { 1090 | "cell_type": "code", 1091 | "execution_count": null, 1092 | "metadata": { 1093 | "collapsed": true 1094 | }, 1095 | "outputs": [], 1096 | "source": [ 1097 | "hdpm = HdpModel(corpus=corpus, id2word=dictionary)" 1098 | ] 1099 | }, 1100 | { 1101 | "cell_type": "code", 1102 | "execution_count": null, 1103 | "metadata": { 1104 | "collapsed": false 1105 | }, 1106 | "outputs": [], 1107 | "source": [ 1108 | "hdpm.show_topics()" 1109 | ] 1110 | }, 1111 | { 1112 | "cell_type": "markdown", 1113 | "metadata": {}, 1114 | "source": [ 1115 | "HDP surely comes up with better, more human interpretable topics as we can see. There are topics which clearly talk about sports, others which only have words related to politics or some which relate to travel. As we saw above in LSI, the topics were a bit mixed up. HDP doesn't seem to mix it up as much." 1116 | ] 1117 | }, 1118 | { 1119 | "cell_type": "code", 1120 | "execution_count": null, 1121 | "metadata": { 1122 | "collapsed": true 1123 | }, 1124 | "outputs": [], 1125 | "source": [ 1126 | "hdpmtopics = hdpm.show_topics(num_topics=-1, num_words=10, formatted=False)" 1127 | ] 1128 | }, 1129 | { 1130 | "cell_type": "markdown", 1131 | "metadata": {}, 1132 | "source": [ 1133 | "### [LDA](https://radimrehurek.com/gensim/models/ldamodel.html) (Latent Dirichlet Allocation)\n", 1134 | "This is one the most popular topic modeling algorithms today. It is a generative model in that it assumes each document is a mixture of topics and in turn, each topic is a mixture of words. To understand it better you can watch [this](https://www.youtube.com/watch?v=DDq3OVp9dNA) lecture by David Blei.\n", 1135 | "" 1136 | ] 1137 | }, 1138 | { 1139 | "cell_type": "code", 1140 | "execution_count": null, 1141 | "metadata": { 1142 | "collapsed": true 1143 | }, 1144 | "outputs": [], 1145 | "source": [ 1146 | "ldam = LdaModel(corpus=corpus, num_topics=100, id2word=dictionary)" 1147 | ] 1148 | }, 1149 | { 1150 | "cell_type": "code", 1151 | "execution_count": null, 1152 | "metadata": { 1153 | "collapsed": false 1154 | }, 1155 | "outputs": [], 1156 | "source": [ 1157 | "ldam.show_topics()" 1158 | ] 1159 | }, 1160 | { 1161 | "cell_type": "markdown", 1162 | "metadata": {}, 1163 | "source": [ 1164 | "`LdaModel` also comes up with decent topics. We can decipher some topics by the word distributions." 1165 | ] 1166 | }, 1167 | { 1168 | "cell_type": "code", 1169 | "execution_count": null, 1170 | "metadata": { 1171 | "collapsed": true 1172 | }, 1173 | "outputs": [], 1174 | "source": [ 1175 | "ldamtopics = ldam.show_topics(num_topics=-1, formatted=False)" 1176 | ] 1177 | }, 1178 | { 1179 | "cell_type": "markdown", 1180 | "metadata": {}, 1181 | "source": [ 1182 | "# Topic Coherence\n", 1183 | "\n", 1184 | "Topic coherence is a way to quantify the human interpretability of the topics. Simply enter the __topics 't'__ and you get __coherence value 'c'__ as the output! The topic coherence pipeline recently got merged into gensim which was based on [this](http://svn.aksw.org/papers/2015/WSDM_Topic_Evaluation/public.pdf) paper by Roeder et al.\n", 1185 | "\n", 1186 | "This is a great way to compare individual topics and can be extended to comparing different topic models as well. 
We will be doing this later on in this tutorial.\n", 1187 | "\n", 1188 | "To know more about topic coherence you can read my blog [here](https://rare-technologies.com/what-is-topic-coherence/) where I have to tried to give an introductory explanation of how topic coherence works.\n", 1189 | "\n", 1190 | "### Finding optimal number of topics\n", 1191 | "As we saw above, `LdaModel` requires the number of topics to be entered for fitting it. This is one of the problem of LDA as knowing the number of topics beforehand can be a challenging task especially in an online setting. Traditionally, perplexity has been used to find out the optimum number of topics. We will be using topic coherence instead. The logic is pretty simple in that: \"Let's select the number of topics which we can understand easily\"." 1192 | ] 1193 | }, 1194 | { 1195 | "cell_type": "code", 1196 | "execution_count": null, 1197 | "metadata": { 1198 | "collapsed": false 1199 | }, 1200 | "outputs": [], 1201 | "source": [ 1202 | "import operator\n", 1203 | "from gensim.models import CoherenceModel\n", 1204 | "\n", 1205 | "%matplotlib inline" 1206 | ] 1207 | }, 1208 | { 1209 | "cell_type": "code", 1210 | "execution_count": null, 1211 | "metadata": { 1212 | "collapsed": true 1213 | }, 1214 | "outputs": [], 1215 | "source": [ 1216 | "BASE = 6 # base number of topics for graph evaluation" 1217 | ] 1218 | }, 1219 | { 1220 | "cell_type": "code", 1221 | "execution_count": null, 1222 | "metadata": { 1223 | "collapsed": false 1224 | }, 1225 | "outputs": [], 1226 | "source": [ 1227 | "def evaluate_graph(dictionary, corpus, texts, limit):\n", 1228 | " \"\"\"\n", 1229 | " Function to display num_topics - LDA graph using c_v coherence\n", 1230 | " \n", 1231 | " Parameters:\n", 1232 | " ----------\n", 1233 | " dictionary : Gensim dictionary\n", 1234 | " corpus : Gensim corpus\n", 1235 | " limit : topic limit\n", 1236 | " \n", 1237 | " Returns:\n", 1238 | " -------\n", 1239 | " lm_list : List of LDA topic models\n", 1240 | " \"\"\"\n", 1241 | " if limit < BASE:\n", 1242 | " raise ValueError(\"Please enter limit > %d. You entered %d\" % (BASE, limit))\n", 1243 | " c_v = []\n", 1244 | " lm_list = []\n", 1245 | " for num_topics in range(BASE, limit):\n", 1246 | " lm = LdaModel(corpus=corpus, num_topics=num_topics, id2word=dictionary)\n", 1247 | " lm_list.append(lm)\n", 1248 | " cm = CoherenceModel(model=lm, texts=texts, dictionary=dictionary, coherence='c_v')\n", 1249 | " c_v.append(cm.get_coherence())\n", 1250 | " \n", 1251 | " # Show graph\n", 1252 | " x = range(6, limit)\n", 1253 | " plt.plot(x, c_v)\n", 1254 | " plt.xlabel(\"num_topics\")\n", 1255 | " plt.ylabel(\"Coherence score\")\n", 1256 | " plt.legend((\"c_v\"), loc='best')\n", 1257 | " plt.show()\n", 1258 | " \n", 1259 | " return lm_list, c_v" 1260 | ] 1261 | }, 1262 | { 1263 | "cell_type": "markdown", 1264 | "metadata": {}, 1265 | "source": [ 1266 | "We iteratively train LdaModels for different number of topics ranging from 6 to 15. Topics lesser than 6 might not make sense however we should be getting the ideal value around 8 as the number of categories we have are 8.\n", 1267 | "\n", 1268 | "__Warning__: Can take quite long to compute. If you want to reduce the amount of time taken here, you can reduce the limit or better, change the coherence algorithm to `u_mass` instead. However it has been experimentally proven that `c_v` correlates best with human interpretation." 
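If the run below is too slow on your machine, the coherence measure can be swapped for `u_mass` as suggested above. A minimal sketch, assuming `lm` stands for one of the `LdaModel`s built inside `evaluate_graph` and reusing the `corpus` and `dictionary` defined earlier:

```python
# 'u_mass' estimates coherence from document co-occurrence counts in the corpus itself,
# so it avoids the sliding-window probability estimation that makes 'c_v' slow,
# at the cost of correlating less well with human judgement.
cm_umass = CoherenceModel(model=lm, corpus=corpus, dictionary=dictionary, coherence='u_mass')
print(cm_umass.get_coherence())
```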
1269 | ] 1270 | }, 1271 | { 1272 | "cell_type": "code", 1273 | "execution_count": null, 1274 | "metadata": { 1275 | "collapsed": false, 1276 | "scrolled": false 1277 | }, 1278 | "outputs": [], 1279 | "source": [ 1280 | "%%time\n", 1281 | "lm_list, c_v = evaluate_graph(dictionary, corpus, train_texts, 15)" 1282 | ] 1283 | }, 1284 | { 1285 | "cell_type": "code", 1286 | "execution_count": null, 1287 | "metadata": { 1288 | "collapsed": false 1289 | }, 1290 | "outputs": [], 1291 | "source": [ 1292 | "ldam_tc = lm_list[np.argmax(c_v)] # Select the LdaModel corresponding to the best coherence value" 1293 | ] 1294 | }, 1295 | { 1296 | "cell_type": "code", 1297 | "execution_count": null, 1298 | "metadata": { 1299 | "collapsed": false 1300 | }, 1301 | "outputs": [], 1302 | "source": [ 1303 | "ldam_tc.show_topics(num_topics=-1)" 1304 | ] 1305 | }, 1306 | { 1307 | "cell_type": "code", 1308 | "execution_count": null, 1309 | "metadata": { 1310 | "collapsed": false 1311 | }, 1312 | "outputs": [], 1313 | "source": [ 1314 | "ldam_tctopics = ldam_tc.show_topics(num_topics=-1, formatted=False)" 1315 | ] 1316 | }, 1317 | { 1318 | "cell_type": "markdown", 1319 | "metadata": {}, 1320 | "source": [ 1321 | "### LDA as LSI\n", 1322 | "As we saw above, LSI can automatically rank topics. We can actually do the same with LDA also by making it work with the topic coherence pipeline. We rank individual topics based on their human interpretability.\n", 1323 | "\n", 1324 | "We will be going one step further here and making a new LDA model by using only the best topics from the first LDA model. An \"LDA migration\" of sorts! This can be done by selecting the `alphas` and `betas` corresponding to only the best topics from the first LDA model and transferring them onto the new model." 1325 | ] 1326 | }, 1327 | { 1328 | "cell_type": "code", 1329 | "execution_count": null, 1330 | "metadata": { 1331 | "collapsed": true 1332 | }, 1333 | "outputs": [], 1334 | "source": [ 1335 | "def ret_top_model(num_topics):\n", 1336 | " \"\"\"\n", 1337 | " Since LDAmodel is a probabilistic model, it comes up different topics each time we run it. To control the\n", 1338 | " quality of the topic model we produce, we can create another topic model from only the most coherent n topics\n", 1339 | " from the LDAmodel.\n", 1340 | " \n", 1341 | " Parameters:\n", 1342 | " ----------\n", 1343 | " num_topics: Number of top topics for new LdaModel.\n", 1344 | " \n", 1345 | " Returns:\n", 1346 | " -------\n", 1347 | " refined_lm: Final evaluated topic model\n", 1348 | " top_topics: ranked topics in decreasing order. 
List of tuples\n", 1349 | " \"\"\"\n", 1350 | " lm = LdaModel(corpus=corpus, id2word=dictionary)\n", 1351 | " coherence_values = {}\n", 1352 | " doc = []\n", 1353 | " for n, topic in lm.show_topics(num_topics=-1, formatted=False):\n", 1354 | " topic = [word for word, _ in topic]\n", 1355 | " cm = CoherenceModel(topics=[topic], texts=train_texts, dictionary=dictionary)\n", 1356 | " coherence_values[n] = cm.get_coherence()\n", 1357 | " top_topics = sorted(coherence_values.items(), key=operator.itemgetter(1), reverse=True)[:num_topics]\n", 1358 | " top_topics = [t for t, c in top_topics]\n", 1359 | " alpha = lm.alpha[top_topics]\n", 1360 | " beta = lm.expElogbeta[top_topics, :]\n", 1361 | " refined_lm = LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics, alpha=alpha)\n", 1362 | " refined_lm.expElogbeta[:] = beta\n", 1363 | " return refined_lm" 1364 | ] 1365 | }, 1366 | { 1367 | "cell_type": "markdown", 1368 | "metadata": {}, 1369 | "source": [ 1370 | "__Warning__: Can take some time to run" 1371 | ] 1372 | }, 1373 | { 1374 | "cell_type": "code", 1375 | "execution_count": null, 1376 | "metadata": { 1377 | "collapsed": false 1378 | }, 1379 | "outputs": [], 1380 | "source": [ 1381 | "%%time\n", 1382 | "lda_lsim = ret_top_model(BASE + np.argmax(c_v)) # Using the value we found above" 1383 | ] 1384 | }, 1385 | { 1386 | "cell_type": "code", 1387 | "execution_count": null, 1388 | "metadata": { 1389 | "collapsed": false 1390 | }, 1391 | "outputs": [], 1392 | "source": [ 1393 | "lda_lsim.show_topics(-1)" 1394 | ] 1395 | }, 1396 | { 1397 | "cell_type": "markdown", 1398 | "metadata": {}, 1399 | "source": [ 1400 | "HDP model still seems like the best one yet...." 1401 | ] 1402 | }, 1403 | { 1404 | "cell_type": "code", 1405 | "execution_count": null, 1406 | "metadata": { 1407 | "collapsed": true 1408 | }, 1409 | "outputs": [], 1410 | "source": [ 1411 | "lda_lsimtopics = lda_lsim.show_topics(num_topics=-1, formatted=False)" 1412 | ] 1413 | }, 1414 | { 1415 | "cell_type": "markdown", 1416 | "metadata": {}, 1417 | "source": [ 1418 | "Let's transfer out HDP model into an LDA model. We can do these by copying over the corresponding alphas and betas into the LDA model. This will also help us calculate topic inferences easily if this turns out to be the best LDA model so far." 1419 | ] 1420 | }, 1421 | { 1422 | "cell_type": "code", 1423 | "execution_count": null, 1424 | "metadata": { 1425 | "collapsed": true 1426 | }, 1427 | "outputs": [], 1428 | "source": [ 1429 | "alpha, beta = hdpm.hdp_to_lda()\n", 1430 | "num_topics = len(hdpmtopics)" 1431 | ] 1432 | }, 1433 | { 1434 | "cell_type": "code", 1435 | "execution_count": null, 1436 | "metadata": { 1437 | "collapsed": false 1438 | }, 1439 | "outputs": [], 1440 | "source": [ 1441 | "%%time\n", 1442 | "lda_hdp = LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics, alpha=alpha)\n", 1443 | "lda_hdp.expElogbeta[:] = beta" 1444 | ] 1445 | }, 1446 | { 1447 | "cell_type": "markdown", 1448 | "metadata": {}, 1449 | "source": [ 1450 | "### Evaluating all topic models\n", 1451 | "Finally, we will be using the topic coherence pipeline to compare all the topic models we have created so far. Till now we have only been qualitatively comparing the models. The topic coherence pipeline allows us to quantitatively compare the different topic models. 
" 1452 | ] 1453 | }, 1454 | { 1455 | "cell_type": "code", 1456 | "execution_count": null, 1457 | "metadata": { 1458 | "collapsed": true 1459 | }, 1460 | "outputs": [], 1461 | "source": [ 1462 | "lsimtopics = [[word for word, prob in topic] for topicid, topic in lsimtopics]\n", 1463 | "hdpmtopics = [[word for word, prob in topic] for topicid, topic in hdpmtopics]\n", 1464 | "ldamtopics = [[word for word, prob in topic] for topicid, topic in ldamtopics]\n", 1465 | "ldam_tctopics = [[word for word, prob in topic] for topicid, topic in ldam_tctopics]\n", 1466 | "lda_lsimtopics = [[word for word, prob in topic] for topicid, topic in lda_lsimtopics]" 1467 | ] 1468 | }, 1469 | { 1470 | "cell_type": "code", 1471 | "execution_count": null, 1472 | "metadata": { 1473 | "collapsed": false 1474 | }, 1475 | "outputs": [], 1476 | "source": [ 1477 | "%%time\n", 1478 | "lsi_coherence = CoherenceModel(topics=lsimtopics, texts=train_texts, dictionary=dictionary, window_size=110).get_coherence()\n", 1479 | "hdp_coherence = CoherenceModel(topics=hdpmtopics, texts=train_texts, dictionary=dictionary, window_size=110).get_coherence()\n", 1480 | "lda_coherence = CoherenceModel(topics=ldamtopics, texts=train_texts, dictionary=dictionary, window_size=110).get_coherence()\n", 1481 | "lm_coherence = CoherenceModel(topics=ldam_tctopics, texts=train_texts, dictionary=dictionary, window_size=110).get_coherence()\n", 1482 | "lda_lsi_coherence = CoherenceModel(topics=lda_lsimtopics, texts=train_texts, dictionary=dictionary, window_size=110).get_coherence()\n", 1483 | "lda_hdp_coherence = CoherenceModel(model=lda_hdp, texts=train_texts, dictionary=dictionary).get_coherence()" 1484 | ] 1485 | }, 1486 | { 1487 | "cell_type": "code", 1488 | "execution_count": null, 1489 | "metadata": { 1490 | "collapsed": true 1491 | }, 1492 | "outputs": [], 1493 | "source": [ 1494 | "def evaluate_bar_graph(coherences, indices):\n", 1495 | " \"\"\"\n", 1496 | " Function to plot bar graph.\n", 1497 | " \n", 1498 | " coherences: list of coherence values\n", 1499 | " indices: Indices to be used to mark bars. Length of this and coherences should be equal.\n", 1500 | " \"\"\"\n", 1501 | " assert len(coherences) == len(indices)\n", 1502 | " n = len(coherences)\n", 1503 | " x = np.arange(n)\n", 1504 | " plt.bar(x, coherences, width=0.2, tick_label=indices, align='center')\n", 1505 | " plt.xlabel('Models')\n", 1506 | " plt.ylabel('Coherence Value')" 1507 | ] 1508 | }, 1509 | { 1510 | "cell_type": "code", 1511 | "execution_count": null, 1512 | "metadata": { 1513 | "collapsed": false 1514 | }, 1515 | "outputs": [], 1516 | "source": [ 1517 | "evaluate_bar_graph([lsi_coherence, hdp_coherence, lda_coherence, lm_coherence, lda_lsi_coherence, lda_hdp_coherence],\n", 1518 | " ['LSI', 'HDP', 'LDA', 'LDA_TC', 'LDA_LSI', 'LDA_HDP'])" 1519 | ] 1520 | }, 1521 | { 1522 | "cell_type": "markdown", 1523 | "metadata": {}, 1524 | "source": [ 1525 | "Our intuition was correct! HDP model turned out to be the best one and the transferred HDP model turned out to be the best LDA model. I would have liked our optimal topics and lda_lsi to perform better but we can be happy that it at least performs better than vanilla LDA!\n", 1526 | "\n", 1527 | "__DO REMEMBER__ that these values and topic models only correspond to this dataset and can vary accross different datasets. In fact since LDA models are probabilistic, they can vary a lot across runs too! 
We could have done better in LDA tuning however this was a tutorial on how topic coherence in particular can be used for LDA tuning. \n", 1528 | "I have observed that HDP models generally perform better when each `text` within `texts` has a lot of tokens. You will notice that the comparison turned out to be very different in my [news classification notebook](https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/gensim_news_classification.ipynb)." 1529 | ] 1530 | }, 1531 | { 1532 | "cell_type": "markdown", 1533 | "metadata": {}, 1534 | "source": [ 1535 | "We can infer topic distributions from individual word tokens" 1536 | ] 1537 | }, 1538 | { 1539 | "cell_type": "code", 1540 | "execution_count": null, 1541 | "metadata": { 1542 | "collapsed": false 1543 | }, 1544 | "outputs": [], 1545 | "source": [ 1546 | "lda_hdp[dictionary.doc2bow(['jimmy', 'fallon', 'entertainment'])]" 1547 | ] 1548 | }, 1549 | { 1550 | "cell_type": "code", 1551 | "execution_count": null, 1552 | "metadata": { 1553 | "collapsed": false 1554 | }, 1555 | "outputs": [], 1556 | "source": [ 1557 | "lda_hdp.show_topic(60, topn=10)" 1558 | ] 1559 | }, 1560 | { 1561 | "cell_type": "markdown", 1562 | "metadata": {}, 1563 | "source": [ 1564 | "# LDA topic inference for classification\n", 1565 | "In this tutorial we will be using our best LDA model from above for inference and classification. When we use LDA for inference it gives large probabilities for some topics but negligible for the others. This is how it looks like:\n", 1566 | "\n", 1567 | "
_Image taken from Blei's video on LDA_
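In gensim terms, that inference is just a query against the trained model: it returns a sparse list of `(topic_id, probability)` pairs, and any topic whose weight falls below a small probability threshold (`minimum_probability` in recent gensim versions) is simply omitted. A small illustrative sketch, with purely hypothetical values:

```python
# Illustration only -- actual topic ids and weights depend on the trained model.
bow = dictionary.doc2bow(['jimmy', 'fallon', 'entertainment'])
print(lda_hdp[bow])
# e.g. [(4, 0.31), (27, 0.55)]   <- sparse: low-probability topics are dropped
```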
\n", 1568 | "\n", 1569 | "__This is the pipeline we're following for LDA inference classification__ (wanted to see how well I can use the pencil in gimp):\n", 1570 | "\n", 1571 | "P, B = Preprocessing, bigram collocation\n", 1572 | "\n", 1573 | "LDA = LDA model for inference\n", 1574 | "\n", 1575 | "CLF = Classifier" 1576 | ] 1577 | }, 1578 | { 1579 | "cell_type": "markdown", 1580 | "metadata": {}, 1581 | "source": [ 1582 | "As we can see above in the graph, LDA inference yields positive values for only some topics and negligible values for the others. We can assume them to be 0 to create our dataframe." 1583 | ] 1584 | }, 1585 | { 1586 | "cell_type": "code", 1587 | "execution_count": null, 1588 | "metadata": { 1589 | "collapsed": false 1590 | }, 1591 | "outputs": [], 1592 | "source": [ 1593 | "def ret_lda_features(ldamodel, texts, num_topics):\n", 1594 | " \"\"\"\n", 1595 | " Function to return LDA inference features for texts.\n", 1596 | " \n", 1597 | " Parameters:\n", 1598 | " ----------\n", 1599 | " ldamodel: LDA model to infer docs\n", 1600 | " texts: Texts to be inferred\n", 1601 | " num_topics: Number of topics. Will determine columns in dataframe\n", 1602 | " \n", 1603 | " Returns:\n", 1604 | " -------\n", 1605 | " lda_features: LDA features dataframe\n", 1606 | " \"\"\"\n", 1607 | " lda_features = pd.DataFrame()\n", 1608 | " for message in texts:\n", 1609 | " features = np.zeros(shape=(1, num_topics))\n", 1610 | " inference = ldamodel[dictionary.doc2bow(message)]\n", 1611 | " for tid, val in inference:\n", 1612 | " features[:, tid] = val\n", 1613 | " features = pd.DataFrame(features)\n", 1614 | " lda_features = lda_features.append(features, ignore_index=True)\n", 1615 | " return lda_features" 1616 | ] 1617 | }, 1618 | { 1619 | "cell_type": "code", 1620 | "execution_count": null, 1621 | "metadata": { 1622 | "collapsed": true 1623 | }, 1624 | "outputs": [], 1625 | "source": [ 1626 | "train_lda_features = ret_lda_features(lda_hdp, train_texts, num_topics)" 1627 | ] 1628 | }, 1629 | { 1630 | "cell_type": "code", 1631 | "execution_count": null, 1632 | "metadata": { 1633 | "collapsed": false 1634 | }, 1635 | "outputs": [], 1636 | "source": [ 1637 | "train_lda_features.shape" 1638 | ] 1639 | }, 1640 | { 1641 | "cell_type": "code", 1642 | "execution_count": null, 1643 | "metadata": { 1644 | "collapsed": false 1645 | }, 1646 | "outputs": [], 1647 | "source": [ 1648 | "train_lda_features.head()" 1649 | ] 1650 | }, 1651 | { 1652 | "cell_type": "code", 1653 | "execution_count": null, 1654 | "metadata": { 1655 | "collapsed": true 1656 | }, 1657 | "outputs": [], 1658 | "source": [ 1659 | "clf_lda = LogisticRegression()" 1660 | ] 1661 | }, 1662 | { 1663 | "cell_type": "code", 1664 | "execution_count": null, 1665 | "metadata": { 1666 | "collapsed": false 1667 | }, 1668 | "outputs": [], 1669 | "source": [ 1670 | "clf_lda = clf_lda.fit(train_lda_features, categories)" 1671 | ] 1672 | }, 1673 | { 1674 | "cell_type": "code", 1675 | "execution_count": null, 1676 | "metadata": { 1677 | "collapsed": false 1678 | }, 1679 | "outputs": [], 1680 | "source": [ 1681 | "test_lda_features = ret_lda_features(lda_hdp, test_texts, num_topics)" 1682 | ] 1683 | }, 1684 | { 1685 | "cell_type": "code", 1686 | "execution_count": null, 1687 | "metadata": { 1688 | "collapsed": false 1689 | }, 1690 | "outputs": [], 1691 | "source": [ 1692 | "test_lda_features.head()" 1693 | ] 1694 | }, 1695 | { 1696 | "cell_type": "code", 1697 | "execution_count": null, 1698 | "metadata": { 1699 | "collapsed": false 1700 | }, 1701 | 
"outputs": [], 1702 | "source": [ 1703 | "predictions = clf_lda.predict(test_lda_features)" 1704 | ] 1705 | }, 1706 | { 1707 | "cell_type": "code", 1708 | "execution_count": null, 1709 | "metadata": { 1710 | "collapsed": false 1711 | }, 1712 | "outputs": [], 1713 | "source": [ 1714 | "evaluate_prediction(predictions, test['category'])" 1715 | ] 1716 | }, 1717 | { 1718 | "cell_type": "markdown", 1719 | "metadata": {}, 1720 | "source": [ 1721 | "__Exercise__: Plug in our HDP model above instead of LDA model and see what's the difference" 1722 | ] 1723 | }, 1724 | { 1725 | "cell_type": "markdown", 1726 | "metadata": {}, 1727 | "source": [ 1728 | "Turns out, our TF-IDF and bag of words based models outperform this method. However no gensim tutorial is complete without word embeddings. Let's try word embeddings now!" 1729 | ] 1730 | }, 1731 | { 1732 | "cell_type": "markdown", 1733 | "metadata": {}, 1734 | "source": [ 1735 | "# Word2Vec can make your heart skip a gram\n", 1736 | "Well this is not really word2vec. We're just loading pre-trained twitter GloVe vectors. GloVe produces word embeddings in a different way than word2vec. GloVe uses a count-based model to learn word embeddings whereas word2vec tries to learn a \"predictive\" model by using different algorithms such as skip-gram or CBOW.\n", 1737 | "\n", 1738 | "We will be using twitter pre-trained GloVe vectors on twitter. We can convert them into word2vec vectors by just running [this](https://github.com/RaRe-Technologies/gensim/blob/develop/gensim/scripts/glove2word2vec.py) script present in gensim. I have used the 50d vectors however you are free to use the larger embeddings depending on the RAM you have." 1739 | ] 1740 | }, 1741 | { 1742 | "cell_type": "code", 1743 | "execution_count": null, 1744 | "metadata": { 1745 | "collapsed": true 1746 | }, 1747 | "outputs": [], 1748 | "source": [ 1749 | "import gensim\n", 1750 | "import logging\n", 1751 | "\n", 1752 | "from gensim.models import Word2Vec" 1753 | ] 1754 | }, 1755 | { 1756 | "cell_type": "code", 1757 | "execution_count": null, 1758 | "metadata": { 1759 | "collapsed": false 1760 | }, 1761 | "outputs": [], 1762 | "source": [ 1763 | "wv = Word2Vec.load_word2vec_format(\"/home/devashish/Downloads/GloVes/wv.twitter.27B.50d.txt\",\n", 1764 | " binary=False) # To use C text format\n", 1765 | "wv.init_sims(replace=True) # To decrease RAM usage" 1766 | ] 1767 | }, 1768 | { 1769 | "cell_type": "code", 1770 | "execution_count": null, 1771 | "metadata": { 1772 | "collapsed": false 1773 | }, 1774 | "outputs": [], 1775 | "source": [ 1776 | "wv.most_similar(positive=['arsenal'])" 1777 | ] 1778 | }, 1779 | { 1780 | "cell_type": "markdown", 1781 | "metadata": {}, 1782 | "source": [ 1783 | "### For this task our preprocessing will be different\n", 1784 | "Preprocessing taken from [stanford twitter-preprocessor](http://nlp.stanford.edu/projects/glove/preprocess-twitter.rb)" 1785 | ] 1786 | }, 1787 | { 1788 | "cell_type": "code", 1789 | "execution_count": null, 1790 | "metadata": { 1791 | "collapsed": true 1792 | }, 1793 | "outputs": [], 1794 | "source": [ 1795 | "import string\n", 1796 | "\n", 1797 | "exclude = string.punctuation\n", 1798 | "exclude = re.sub('<|>', '', exclude) # To keep our intact\n", 1799 | "exclude = set(exclude)" 1800 | ] 1801 | }, 1802 | { 1803 | "cell_type": "code", 1804 | "execution_count": null, 1805 | "metadata": { 1806 | "collapsed": true 1807 | }, 1808 | "outputs": [], 1809 | "source": [ 1810 | "\"\"\"\n", 1811 | "preprocess-twitter.py\n", 1812 | "python 
preprocess-twitter.py \"Some random text with #hashtags, @mentions and http://t.co/kdjfkdjf (links). :)\"\n", 1813 | "Script for preprocessing tweets by Romain Paulus\n", 1814 | "with small modifications by Jeffrey Pennington\n", 1815 | "with translation to Python by Motoki Wu\n", 1816 | "Translation of Ruby script to create features for GloVe vectors for Twitter data.\n", 1817 | "http://nlp.stanford.edu/projects/glove/preprocess-twitter.rb\n", 1818 | "\"\"\"\n", 1819 | "\n", 1820 | "FLAGS = re.MULTILINE | re.DOTALL\n", 1821 | "\n", 1822 | "def hashtag(text):\n", 1823 | " text = text.group()\n", 1824 | " hashtag_body = text[1:]\n", 1825 | " if hashtag_body.isupper():\n", 1826 | " result = \" {} \".format(hashtag_body)\n", 1827 | " else:\n", 1828 | " result = \" \".join([\"\"] + re.split(r\"(?=[A-Z])\", hashtag_body, flags=FLAGS))\n", 1829 | " return result\n", 1830 | "\n", 1831 | "def allcaps(text):\n", 1832 | " text = text.group()\n", 1833 | " return text.lower() + \" \"\n", 1834 | "\n", 1835 | "\n", 1836 | "def preprocess_tweet(text):\n", 1837 | " # Different regex parts for smiley faces\n", 1838 | " eyes = r\"[8:=;]\"\n", 1839 | " nose = r\"['`\\-]?\"\n", 1840 | "\n", 1841 | " # function so code less repetitive\n", 1842 | " def re_sub(pattern, repl):\n", 1843 | " return re.sub(pattern, repl, text, flags=FLAGS)\n", 1844 | "\n", 1845 | " text = re_sub(r\"https?:\\/\\/\\S+\\b|www\\.(\\w+\\.)+\\S*\", \"\")\n", 1846 | " text = re_sub(r\"/\",\" / \")\n", 1847 | " text = re_sub(r\"@\\w+\", \"\")\n", 1848 | " text = re_sub(r\"{}{}[)dD]+|[)dD]+{}{}\".format(eyes, nose, nose, eyes), \"\")\n", 1849 | " text = re_sub(r\"{}{}p+\".format(eyes, nose), \"\")\n", 1850 | " text = re_sub(r\"{}{}\\(+|\\)+{}{}\".format(eyes, nose, nose, eyes), \"\")\n", 1851 | " text = re_sub(r\"{}{}[\\/|l*]\".format(eyes, nose), \"\")\n", 1852 | " text = re_sub(r\"<3\",\"\")\n", 1853 | " text = re_sub(r\"[-+]?[.\\d]*[\\d]+[:,.\\d]*\", \" \")\n", 1854 | " text = re_sub(r\"#\\S+\", hashtag)\n", 1855 | " text = re_sub(r\"([!?.]){2,}\", r\"\\1 \")\n", 1856 | " text = re_sub(r\"\\b(\\S*?)(.)\\2{2,}\\b\", r\"\\1\\2 \")\n", 1857 | "\n", 1858 | " ## -- I just don't understand why the Ruby script adds to everything so I limited the selection.\n", 1859 | " # text = re_sub(r\"([^a-z0-9()<>'`\\-]){2,}\", allcaps)\n", 1860 | " text = re_sub(r\"([A-Z]){2,}\", allcaps)\n", 1861 | " \n", 1862 | " text = ''.join(ch for ch in text if ch not in exclude) # Remove punctuation\n", 1863 | "\n", 1864 | " return text.lower()" 1865 | ] 1866 | }, 1867 | { 1868 | "cell_type": "code", 1869 | "execution_count": null, 1870 | "metadata": { 1871 | "collapsed": true 1872 | }, 1873 | "outputs": [], 1874 | "source": [ 1875 | "def g2v_tokenize_tweet(text):\n", 1876 | " text = text.encode('ascii', 'ignore') # Deal with UnicodeDecodeErrors\n", 1877 | " text = preprocess_tweet(text)\n", 1878 | " return text.split()" 1879 | ] 1880 | }, 1881 | { 1882 | "cell_type": "markdown", 1883 | "metadata": {}, 1884 | "source": [ 1885 | "# Word Vector Averaging\n", 1886 | "To get a vector for a document, we simply average it's word vectors. As suggested in [this](https://www.youtube.com/watch?v=7gTjYwiaJiU) video by Mike Tamir, the word averaging results to some kind of a unique word summarizing the whole document in one word. 
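In symbols, with $W$ the document's words that are found in the embedding vocabulary and $\mathbf{v}_w$ the vector for word $w$, the representation built by the code below is

$$\mathbf{d} = \frac{1}{|W|}\sum_{w \in W} \mathbf{v}_w, \qquad \hat{\mathbf{d}} = \frac{\mathbf{d}}{\lVert \mathbf{d} \rVert},$$

i.e. the mean word vector re-normalised to unit length (`gensim.matutils.unitvec` in the snippet that follows).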
Eg all words in a book 'A tale of two cities' would probably add up to 'class-struggle'.\n", 1887 | "" 1888 | ] 1889 | }, 1890 | { 1891 | "cell_type": "code", 1892 | "execution_count": null, 1893 | "metadata": { 1894 | "collapsed": true 1895 | }, 1896 | "outputs": [], 1897 | "source": [ 1898 | "# code from \"Movie plots by genre\" by Lev Konstantinovskiy:\n", 1899 | "# https://github.com/RaRe-Technologies/movie-plots-by-genre\n", 1900 | "def word_averaging(wv, words):\n", 1901 | " all_words, mean = set(), []\n", 1902 | " \n", 1903 | " for word in words:\n", 1904 | " if isinstance(word, np.ndarray):\n", 1905 | " mean.append(word)\n", 1906 | " elif word in wv.vocab:\n", 1907 | " mean.append(wv.syn0norm[wv.vocab[word].index])\n", 1908 | " all_words.add(wv.vocab[word].index)\n", 1909 | "\n", 1910 | " if not mean:\n", 1911 | " logging.warning(\"cannot compute similarity with no input %s\", words)\n", 1912 | " # FIXME: remove these examples in pre-processing\n", 1913 | " return np.zeros(wv.layer_size,)\n", 1914 | "\n", 1915 | " mean = gensim.matutils.unitvec(np.array(mean).mean(axis=0)).astype(np.float32)\n", 1916 | " return mean\n", 1917 | "\n", 1918 | "def word_averaging_list(wv, text_list):\n", 1919 | " return np.vstack([word_averaging(wv, review) for review in text_list ])" 1920 | ] 1921 | }, 1922 | { 1923 | "cell_type": "code", 1924 | "execution_count": null, 1925 | "metadata": { 1926 | "collapsed": false, 1927 | "scrolled": true 1928 | }, 1929 | "outputs": [], 1930 | "source": [ 1931 | "train_tokenized = train.apply(lambda t: g2v_tokenize_tweet(t['message']), axis=1).values\n", 1932 | "test_tokenized = test.apply(lambda t: g2v_tokenize_tweet(t['message']), axis=1).values" 1933 | ] 1934 | }, 1935 | { 1936 | "cell_type": "code", 1937 | "execution_count": null, 1938 | "metadata": { 1939 | "collapsed": true 1940 | }, 1941 | "outputs": [], 1942 | "source": [ 1943 | "X_train_word_average = word_averaging_list(wv,train_tokenized)\n", 1944 | "X_test_word_average = word_averaging_list(wv,test_tokenized)" 1945 | ] 1946 | }, 1947 | { 1948 | "cell_type": "code", 1949 | "execution_count": null, 1950 | "metadata": { 1951 | "collapsed": true 1952 | }, 1953 | "outputs": [], 1954 | "source": [ 1955 | "clf_g2v = LogisticRegression()\n", 1956 | "clf_g2v = clf_g2v.fit(X_train_word_average, train['category'])" 1957 | ] 1958 | }, 1959 | { 1960 | "cell_type": "code", 1961 | "execution_count": null, 1962 | "metadata": { 1963 | "collapsed": true 1964 | }, 1965 | "outputs": [], 1966 | "source": [ 1967 | "predictions = clf_g2v.predict(X_test_word_average)" 1968 | ] 1969 | }, 1970 | { 1971 | "cell_type": "code", 1972 | "execution_count": null, 1973 | "metadata": { 1974 | "collapsed": false 1975 | }, 1976 | "outputs": [], 1977 | "source": [ 1978 | "evaluate_prediction(predictions, test['category'])" 1979 | ] 1980 | }, 1981 | { 1982 | "cell_type": "markdown", 1983 | "metadata": {}, 1984 | "source": [ 1985 | "Turned out to be pretty disappointing :( Probably word averaging doesn't work too well for tweets. This classifies most categories as \"Business and CEOs\" for some reason.\n", 1986 | "\n", 1987 | "__Things to try__: We can tweak our preprocessing here. Probably use our previous preprocessor. Can also use a neural network based MLPClassifier instead of logistic regression." 1988 | ] 1989 | }, 1990 | { 1991 | "cell_type": "markdown", 1992 | "metadata": {}, 1993 | "source": [ 1994 | "# You can use the function below for user classification!" 
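Before calling it you need an authenticated python-twitter client, since `get_tweets` below relies on an `api` object. If one was not created earlier in the notebook, a minimal sketch would look like this (it assumes the `credentials.py` template shipped with this repo has been filled in with real keys):

```python
# Hypothetical setup -- only needed if `api` has not been created already.
import twitter                      # python-twitter
from credentials import creds_dict  # repo template holding your Twitter app keys

api = twitter.Api(consumer_key=creds_dict['consumer_key'],
                  consumer_secret=creds_dict['consumer_secret'],
                  access_token_key=creds_dict['access_token_key'],
                  access_token_secret=creds_dict['access_token_secret'])
```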
1995 | ] 1996 | }, 1997 | { 1998 | "cell_type": "code", 1999 | "execution_count": null, 2000 | "metadata": { 2001 | "collapsed": true 2002 | }, 2003 | "outputs": [], 2004 | "source": [ 2005 | "def get_tweets(handle):\n", 2006 | " \"\"\"\n", 2007 | " Function to return user profile.\n", 2008 | " \n", 2009 | " Parameters:\n", 2010 | " ----------\n", 2011 | " handle: Twitter handle\n", 2012 | " \n", 2013 | " Returns:\n", 2014 | " -------\n", 2015 | " tweets: space seperated list of tweets\n", 2016 | " \"\"\"\n", 2017 | " tweets = api.GetUserTimeline(screen_name=handle, count=200)\n", 2018 | " tweets = [tweet.text for tweet in tweets]\n", 2019 | " return ' '.join(tweets)" 2020 | ] 2021 | }, 2022 | { 2023 | "cell_type": "code", 2024 | "execution_count": null, 2025 | "metadata": { 2026 | "collapsed": true 2027 | }, 2028 | "outputs": [], 2029 | "source": [ 2030 | "def pred_handle(handle, vectorizer=None, clf=None, g2v_or_lda=None):\n", 2031 | " \"\"\"\n", 2032 | " Function to classify handle. Vectorizer need not be\n", 2033 | " provided if g2v_or_lda is being used. g2v_or_lda can be:\n", 2034 | " 'g2v' for classification with glove word averaging\n", 2035 | " 'lda' for classification with LDA inference\n", 2036 | " 'None' for classification with standard preprocessor and vectorizer\n", 2037 | " \n", 2038 | " Parameters:\n", 2039 | " ----------\n", 2040 | " handle: Twitter handle of user to classify\n", 2041 | " vectorizer: Vectorizer (pre-trained) to be used\n", 2042 | " clf: Classifier (pre-trained) to be used\n", 2043 | " g2v_or_lda: 'g2v', 'lda' or 'None' depending on which \"mode\" you want to use\n", 2044 | " \n", 2045 | " Returns:\n", 2046 | " -------\n", 2047 | " category: Classified category\n", 2048 | " \"\"\"\n", 2049 | " if clf is None:\n", 2050 | " raise Exception('Classifier has to be provided')\n", 2051 | " if vectorizer is None and g2v_or_lda is None:\n", 2052 | " raise Exception('Vectorizer should be provided if glove'\n", 2053 | " ' or LDA classification is not being used')\n", 2054 | " profile = get_tweets(handle)\n", 2055 | " if g2v_or_lda is None:\n", 2056 | " profile = preprocess_text(profile)\n", 2057 | " profile = bigram[profile]\n", 2058 | " features = vectorizer.transform([' '.join(profile)])\n", 2059 | " else:\n", 2060 | " if g2v_or_lda == 'g2v':\n", 2061 | " profile = g2v_tokenize_tweet(profile)\n", 2062 | " features = word_averaging_list(wv, profile)\n", 2063 | " elif g2v_or_lda == 'lda':\n", 2064 | " features = np.zeros(shape=(1, num_topics))\n", 2065 | " inference = lda_hdp[dictionary.doc2bow(message)]\n", 2066 | " for tid, val in inference:\n", 2067 | " features[:, tid] = val\n", 2068 | " category = clf.predict(features)\n", 2069 | " return category" 2070 | ] 2071 | }, 2072 | { 2073 | "cell_type": "code", 2074 | "execution_count": null, 2075 | "metadata": { 2076 | "collapsed": false 2077 | }, 2078 | "outputs": [], 2079 | "source": [ 2080 | "try:\n", 2081 | " pred = pred_handle('chelseafc', vectorizer=count_vectorizer, clf=clf_count, g2v_or_lda=None)[0]\n", 2082 | "except:\n", 2083 | " print(\"Please check whether python-twitter is installed. If yes, please check\"\n", 2084 | " \" whether credentials are stored correctly.\")\n", 2085 | "print categories_map[pred]" 2086 | ] 2087 | }, 2088 | { 2089 | "cell_type": "markdown", 2090 | "metadata": {}, 2091 | "source": [ 2092 | "__Please note that the results for these techniques apply to this particular dataset only__\n", 2093 | "\n", 2094 | "GloVe and word embeddings are a very powerful technique to do complex NLP tasks. 
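As a tiny illustration of that expressiveness, analogy arithmetic can be run directly on the vectors loaded above; what comes back depends on the 50d Twitter GloVe vocabulary, so treat the query below as a sketch rather than a guaranteed result:

```python
# "man is to king as woman is to ?" -- results vary with the embedding used.
wv.most_similar(positive=['king', 'woman'], negative=['man'], topn=3)
```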
Do watch [this](https://www.youtube.com/watch?v=vkfXBGnDplQ) video by Chris Moody to see some of the applications of word2vec outside of text processing.\n", 2095 | "\n", 2096 | "You can try other techniques too such as Doc2Vec, word mover's distance (WMD) etc. for exploring more techniques to working with text data. Do check out [this](https://github.com/RaRe-Technologies/movie-plots-by-genre/blob/master/ipynb_with_output/Document%20classification%20with%20word%20embeddings%20tutorial%20-%20with%20output.ipynb) notebook by Lev Konstantinovskiy." 2097 | ] 2098 | }, 2099 | { 2100 | "cell_type": "markdown", 2101 | "metadata": {}, 2102 | "source": [ 2103 | "# Bibliography\n", 2104 | "Do have a look at the following resources I used for making this tutorial:\n", 2105 | "- [Movie plots by genre by Lev Konstantinovskiy](https://github.com/RaRe-Technologies/movie-plots-by-genre/blob/master/ipynb_with_output/Document%20classification%20with%20word%20embeddings%20tutorial%20-%20with%20output.ipynb)\n", 2106 | "- [Topic coherence pipeline paper by Roeder et al](svn.aksw.org/papers/2015/WSDM_Topic_Evaluation/public.pdf)\n", 2107 | "- [Original LDA paper by Blei et al](https://www.cs.princeton.edu/~blei/papers/BleiNgJordan2003.pdf)\n", 2108 | "- [Online LDA paper by Blei et al](https://www.cs.princeton.edu/~blei/papers/HoffmanBleiBach2010b.pdf)\n", 2109 | "- [Empirical study of topic modeling in Twitter by Davison et al](http://snap.stanford.edu/soma2010/papers/soma2010_12.pdf)\n", 2110 | "- [Machine learning approach to twitter user classification by Pennacchiotti et al](https://www.google.co.in/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&ved=0ahUKEwjHiN7E8svPAhUKPo8KHaY3CsUQFggiMAA&url=https%3A%2F%2Fwww.aaai.org%2Focs%2Findex.php%2FICWSM%2FICWSM11%2Fpaper%2Fdownload%2F2886%2F3262&usg=AFQjCNE7NxTUl11QpN6GA7qj6_NNK0tjqw&sig2=LXSszb8tQN00sej6yr-I5w)\n", 2111 | "- [Stanford GloVe project](http://nlp.stanford.edu/projects/glove/)" 2112 | ] 2113 | } 2114 | ], 2115 | "metadata": { 2116 | "anaconda-cloud": {}, 2117 | "kernelspec": { 2118 | "display_name": "Python 2", 2119 | "language": "python", 2120 | "name": "python2" 2121 | }, 2122 | "language_info": { 2123 | "codemirror_mode": { 2124 | "name": "ipython", 2125 | "version": 2 2126 | }, 2127 | "file_extension": ".py", 2128 | "mimetype": "text/x-python", 2129 | "name": "python", 2130 | "nbconvert_exporter": "python", 2131 | "pygments_lexer": "ipython2", 2132 | "version": "2.7.12" 2133 | } 2134 | }, 2135 | "nbformat": 4, 2136 | "nbformat_minor": 0 2137 | } 2138 | --------------------------------------------------------------------------------