├── .gitignore
├── LICENSE.txt
├── README.md
├── build-sentiment-classifier.ipynb
├── params.py
└── twitter_sentiment_model.pkl

/.gitignore:
--------------------------------------------------------------------------------
*.pyc
stanford*
test*
ven*
.ipynb_checkpoints
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
The MIT License (MIT)
Copyright (c) 2016 Chris Rawles

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
## Sentiment Classifier in PL/Python
The `build-sentiment-classifier.ipynb` Jupyter notebook builds and exports a serialized Twitter sentiment classifier, `twitter_sentiment_model.pkl`, using PL/Python on PostgreSQL, Greenplum Database, or Apache HAWQ. The classifier follows the approach of [Go et al.](http://cs.stanford.edu/people/alecmgo/papers/TwitterDistantSupervision09.pdf) and uses the [Sentiment140 data](http://help.sentiment140.com/for-students/), which can be downloaded from the Sentiment140 website.

The classifier reaches 80% accuracy on the test dataset, which consists of several hundred manually annotated tweets. The training set consists of 1.6 million tweets labeled automatically: tweets containing positive emoticons, such as :), were labeled positive, and tweets containing negative emoticons, such as :(, were labeled negative. This technique, known as distant supervision, uses emoticons as noisy labels.

## Additional Resources
* [Deploying the model as a service](https://github.com/crawles/text-analytics-service-example)
* [Sentiment analysis on Wikipedia](https://en.wikipedia.org/wiki/Sentiment_analysis)

## Author
Chris Rawles
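
## Using the exported model
The exported pipeline can also be scored outside the database. The snippet below is a minimal sketch, not part of the notebook: it assumes Python 2 (the pickle was written with `cPickle`) and a scikit-learn version compatible with the one that trained the model. For best results, apply the notebook's `regex_preprocess` to tweets before scoring.

```python
import cPickle

# Load the serialized scikit-learn pipeline (CountVectorizer -> LogisticRegression).
with open('twitter_sentiment_model.pkl', 'rb') as f:
    model = cPickle.load(f)

# predict_proba(...)[:, 1] is the probability of positive sentiment.
tweets = ['i am so ridiculously happy!!', 'i am very very mad and angry']
print model.predict_proba(tweets)[:, 1]
```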
--------------------------------------------------------------------------------
/build-sentiment-classifier.ipynb:
--------------------------------------------------------------------------------
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This notebook implements an English-language tweet sentiment classifier based on the approach of [Go et al.](http://cs.stanford.edu/people/alecmgo/papers/TwitterDistantSupervision09.pdf) Accuracy on the test data containing positive and negative sentiment tweets is 80%.\n",
    "Training and test data were downloaded [here](http://help.sentiment140.com/for-students/)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2016-09-16T09:55:42.980391",
     "start_time": "2016-09-16T09:55:42.703406"
    },
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import cPickle\n",
    "\n",
    "from IPython.core.magic import (register_line_magic, register_cell_magic,\n",
    "                                register_line_cell_magic)\n",
    "from IPython.display import display\n",
    "from IPython.display import HTML\n",
    "import pandas as pd\n",
    "import pandas.io.sql as psql\n",
    "import psycopg2\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.metrics import classification_report, roc_auc_score, roc_curve\n",
    "from sklearn.pipeline import Pipeline\n",
    "\n",
    "import params"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2016-09-16T09:55:43.033269",
     "start_time": "2016-09-16T09:55:42.982300"
    },
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# connect to the database\n",
    "conn = psycopg2.connect(database=params.database,\n",
    "                        host=params.host,\n",
    "                        port=params.port,\n",
    "                        user=params.username,\n",
    "                        password=params.password)\n",
    "\n",
    "conn.autocommit = True"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2016-09-15T16:52:12.343466",
     "start_time": "2016-09-15T16:52:12.311246"
    },
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# magic functions to aid interaction with PostgreSQL/GPDB/HAWQ\n",
    "_df = None\n",
    "\n",
    "@register_cell_magic\n",
    "def showsql(line, cell):\n",
    "    \"\"\"\n",
    "    Execute the SQL in the cell against the backend database and render\n",
    "    the resulting pandas DataFrame inline below the cell.\n",
    "    Use this for SELECT statements.\n",
    "    \"\"\"\n",
    "    # Use the global connection object defined above.\n",
    "    global conn\n",
    "    global _df\n",
    "    _df = psql.read_sql(cell, conn)\n",
    "    conn.commit()\n",
    "    display(_df)\n",
    "    return\n",
    "\n",
    "@register_cell_magic\n",
    "def execsql(line, cell):\n",
    "    \"\"\"\n",
    "    Execute the SQL in the cell against the backend database.\n",
    "    Use this for CREATE/UPDATE/DELETE statements.\n",
    "    \"\"\"\n",
    "    # Use the global connection object defined above.\n",
    "    global conn\n",
    "    global _df\n",
    "    _df = psql.execute(cell, conn)\n",
    "    conn.commit()\n",
    "    return\n",
    "\n",
    "# Delete the functions so automagic can find the cell magics without name conflicts\n",
    "del execsql, showsql"
   ]
  },
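  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The cells below read from and write into an `mdl` schema. If it does not exist yet in your database, the illustrative setup cell below creates it (on databases without `IF NOT EXISTS` support, such as older Greenplum releases, use plain `CREATE SCHEMA mdl;`):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "%%execsql\n",
    "-- PL/Python must also be installed in the database, e.g. CREATE LANGUAGE plpythonu;\n",
    "CREATE SCHEMA IF NOT EXISTS mdl;"
   ]
  },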
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Build PL/Python function and model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "start_time": "2016-09-15T20:37:28.216Z"
    },
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "%%execsql\n",
    "\n",
    "DROP FUNCTION IF EXISTS mdl.train_sentiment_model(tweets text[], polarities bigint[]);\n",
    "CREATE FUNCTION mdl.train_sentiment_model(tweets text[], polarities bigint[])\n",
    "RETURNS bytea AS $$\n",
    "import cPickle\n",
    "import re\n",
    "\n",
    "import pandas as pd\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.pipeline import Pipeline\n",
    "\n",
    "def regex_preprocess(raw_tweets):\n",
    "    pp_text = pd.Series(raw_tweets)\n",
    "\n",
    "    user_pat = '(?<=^|(?<=[^a-zA-Z0-9-_\\\\.]))@([A-Za-z]+[A-Za-z0-9]+)'\n",
    "    http_pat = '(https?:\\\\/\\\\/(?:www\\\\.|(?!www))[^\\\\s\\\\.]+\\\\.[^\\\\s]{2,}|www\\\\.[^\\\\s]+\\\\.[^\\\\s]{2,})'\n",
    "    repeat_pat, repeat_repl = \"(.)\\\\\\\\1\\\\\\\\1+\", '\\\\\\\\1\\\\\\\\1'\n",
    "\n",
    "    # replace @mentions and URLs with placeholder tokens\n",
    "    pp_text = pp_text.str.replace(pat=user_pat, repl='USERNAME')\n",
    "    pp_text = pp_text.str.replace(pat=http_pat, repl='URL')\n",
    "    # collapse runs of 3+ repeated characters to 2 (fix: assign the result)\n",
    "    pp_text = pp_text.str.replace(pat=repeat_pat, repl=repeat_repl)\n",
    "    return pp_text\n",
    "\n",
    "sentiment_lr = Pipeline([('count_vect', CountVectorizer(min_df=100,\n",
    "                                                        ngram_range=(1, 1),\n",
    "                                                        stop_words='english')),\n",
    "                         ('lr', LogisticRegression())])\n",
    "\n",
    "sentiment_lr.fit(regex_preprocess(tweets), polarities)\n",
    "return cPickle.dumps(sentiment_lr)\n",
    "$$ LANGUAGE plpythonu;\n",
    "\n",
    "DROP TABLE IF EXISTS mdl.sentiment_model;\n",
    "CREATE TABLE mdl.sentiment_model AS\n",
    "SELECT mdl.train_sentiment_model(array_agg(text), array_agg(polarity)) model\n",
    "FROM mdl.tweets_train;"
   ]
  },
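  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A quick, illustrative sanity check that the serialized model landed in the table; `octet_length` reports the size of the pickled pipeline in bytes:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "%%showsql\n",
    "SELECT octet_length(model) model_size_bytes\n",
    "FROM mdl.sentiment_model;"
   ]
  },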
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Apply function to test set"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2016-09-16T09:55:56.464333",
     "start_time": "2016-09-16T09:55:56.337734"
    },
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "%%execsql\n",
    "DROP FUNCTION IF EXISTS mdl.apply_sentiment_model(model bytea, tweets text[]);\n",
    "CREATE FUNCTION mdl.apply_sentiment_model(model bytea, tweets text[])\n",
    "RETURNS float8[] AS $$\n",
    "import cPickle\n",
    "import re\n",
    "\n",
    "import pandas as pd\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.pipeline import Pipeline\n",
    "\n",
    "def regex_preprocess(raw_tweets):\n",
    "    pp_text = pd.Series(raw_tweets)\n",
    "\n",
    "    user_pat = '(?<=^|(?<=[^a-zA-Z0-9-_\\\\.]))@([A-Za-z]+[A-Za-z0-9]+)'\n",
    "    http_pat = '(https?:\\\\/\\\\/(?:www\\\\.|(?!www))[^\\\\s\\\\.]+\\\\.[^\\\\s]{2,}|www\\\\.[^\\\\s]+\\\\.[^\\\\s]{2,})'\n",
    "    repeat_pat, repeat_repl = \"(.)\\\\\\\\1\\\\\\\\1+\", '\\\\\\\\1\\\\\\\\1'\n",
    "\n",
    "    # replace @mentions and URLs with placeholder tokens\n",
    "    pp_text = pp_text.str.replace(pat=user_pat, repl='USERNAME')\n",
    "    pp_text = pp_text.str.replace(pat=http_pat, repl='URL')\n",
    "    # collapse runs of 3+ repeated characters to 2 (fix: assign the result)\n",
    "    pp_text = pp_text.str.replace(pat=repeat_pat, repl=repeat_repl)\n",
    "    return pp_text\n",
    "\n",
    "cl = cPickle.loads(model)\n",
    "X = regex_preprocess(tweets)\n",
    "return cl.predict_proba(X)[:,1]\n",
    "$$ LANGUAGE plpythonu;"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2016-09-16T11:27:14.958327",
     "start_time": "2016-09-16T11:27:14.344450"
    },
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "%%showsql\n",
    "SELECT unnest(tweets) tweet, unnest(mdl.apply_sentiment_model(model, tweets)) polarity\n",
    "FROM\n",
    "mdl.sentiment_model,\n",
    "(SELECT array['i am so ridiculously happy!!',\n",
    "              'i am very very mad and angry',\n",
    "              'steph curry is a basketball player'] tweets) f"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2016-09-15T18:00:19.206611",
     "start_time": "2016-09-15T18:00:18.402493"
    },
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "%%showsql\n",
    "\n",
    "-- build the results table; polarity 4 maps to 1, polarity 0 stays 0\n",
    "DROP TABLE IF EXISTS mdl.tweets_test_results;\n",
    "CREATE TABLE mdl.tweets_test_results\n",
    "AS\n",
    "SELECT unnest(tweets) tweet,\n",
    "       round(unnest(mdl.apply_sentiment_model(model, tweets))) prediction,\n",
    "       unnest(polarities) polarity\n",
    "FROM\n",
    "mdl.sentiment_model,\n",
    "(SELECT array_agg(text) tweets, array_agg(greatest(polarity - 3, 0)) polarities\n",
    "FROM mdl.tweets_test\n",
    "WHERE polarity != 2 -- exclude neutral tweets\n",
    ") f1;\n",
    "\n",
    "-- check the accuracy of the model\n",
    "SELECT 1 - AVG(ABS(prediction - polarity)) accuracy\n",
    "FROM mdl.tweets_test_results;"
   ]
  },
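  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As a further illustrative check, break the test-set results down by true label and prediction, confusion-matrix style:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "%%showsql\n",
    "SELECT polarity, prediction, count(*) n\n",
    "FROM mdl.tweets_test_results\n",
    "GROUP BY polarity, prediction\n",
    "ORDER BY polarity, prediction;"
   ]
  },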
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Appendix"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2016-09-15T17:37:37.273781",
     "start_time": "2016-09-15T17:37:37.242115"
    },
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "%%showsql\n",
    "-- sanity check of greatest(), used for the label mapping above\n",
    "SELECT greatest(4-1, 5)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2016-09-15T17:23:26.639435",
     "start_time": "2016-09-15T17:23:26.611570"
    }
   },
   "source": [
    "### Load Twitter data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2016-09-15T17:24:00.978493",
     "start_time": "2016-09-15T17:24:00.937202"
    },
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# data downloaded from http://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip\n",
    "columns = ['polarity', 'tweetid', 'date', 'query_name', 'user', 'text']\n",
    "dftrain = pd.read_csv('stanford-sentiment-twitter-data/training.1600000.processed.noemoticon.csv',\n",
    "                      header=None,\n",
    "                      encoding='ISO-8859-1')\n",
    "dftest = pd.read_csv('stanford-sentiment-twitter-data/testdata.manual.2009.06.14.csv',\n",
    "                     header=None,\n",
    "                     encoding='ISO-8859-1')\n",
    "dftrain.columns = columns\n",
    "dftest.columns = columns"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2016-09-15T17:23:44.979551",
     "start_time": "2016-09-15T17:23:44.953881"
    }
   },
   "source": [
    "### Upload data to db"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sqlalchemy import create_engine\n",
    "\n",
    "# SQLAlchemy engine for DataFrame.to_sql, built from the same connection params\n",
    "engine = create_engine('postgresql://{}:{}@{}:{}/{}'.format(\n",
    "    params.username, params.password, params.host, params.port, params.database))\n",
    "\n",
    "def df_add_id_train(df, is_train):\n",
    "    df.insert(0, 'id', df.index.tolist())\n",
    "    df.insert(1, 'is_train', [is_train]*df.shape[0])\n",
    "    return df\n",
    "\n",
    "# train data\n",
    "dftrain_export = dftrain.copy()\n",
    "dftrain_export = dftrain_export[['polarity', 'text']]\n",
    "dftrain_export = df_add_id_train(dftrain_export, 1)\n",
    "dftrain_export.to_sql('tweets_train', engine, schema='mdl', index=False, if_exists='replace', chunksize=10000)\n",
    "\n",
    "# test data\n",
    "dftest_export = dftest.copy()\n",
    "dftest_export = dftest_export[['polarity', 'text']]\n",
    "dftest_export = df_add_id_train(dftest_export, 0)  # fix: flag test rows with is_train = 0\n",
    "dftest_export.to_sql('tweets_test', engine, schema='mdl', index=False, if_exists='replace', chunksize=10000)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
--------------------------------------------------------------------------------
/params.py:
--------------------------------------------------------------------------------
# Connection parameters for the backend database; fill in before running.
database = ''   # database name
host = ''       # database host
port = 5432     # database port (5432 is the PostgreSQL default)
username = ''   # database user
password = ''   # database password
--------------------------------------------------------------------------------
/twitter_sentiment_model.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crawles/gpdb_sentiment_analysis_twitter_model/5d9f0dd38556501840492b6a4037a95c5da2d23b/twitter_sentiment_model.pkl
--------------------------------------------------------------------------------