├── README.md ├── LICENSE ├── .gitignore └── Stock_Market.ipynb /README.md: -------------------------------------------------------------------------------- 1 | # Daily-News-for-Stock-Market-Prediction 2 | Stock market prediction with Reddit's top news 3 | 4 | Here I used some machine learning algorithms to predict the DJIA (Dow Jones index) close price, using Reddit's top news headlines as features of the classifier. 5 | 6 | This project is based on the Kaggle dataset available at https://www.kaggle.com/aaron7sun/stocknews. 7 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Igor Santos 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | # Created by https://www.gitignore.io/api/python 3 | 4 | ### Python ### 5 | # Byte-compiled / optimized / DLL files 6 | __pycache__/ 7 | *.py[cod] 8 | *$py.class 9 | 10 | # C extensions 11 | *.so 12 | 13 | # Distribution / packaging 14 | .Python 15 | env/ 16 | build/ 17 | develop-eggs/ 18 | dist/ 19 | downloads/ 20 | eggs/ 21 | .eggs/ 22 | lib/ 23 | lib64/ 24 | parts/ 25 | sdist/ 26 | var/ 27 | wheels/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | 32 | # PyInstaller 33 | # Usually these files are written by a python script from a template 34 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 35 | *.manifest 36 | *.spec 37 | 38 | # Installer logs 39 | pip-log.txt 40 | pip-delete-this-directory.txt 41 | 42 | # Unit test / coverage reports 43 | htmlcov/ 44 | .tox/ 45 | .coverage 46 | .coverage.* 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | *,cover 51 | .hypothesis/ 52 | 53 | # Translations 54 | *.mo 55 | *.pot 56 | 57 | # Django stuff: 58 | *.log 59 | local_settings.py 60 | 61 | # Flask stuff: 62 | instance/ 63 | .webassets-cache 64 | 65 | # Scrapy stuff: 66 | .scrapy 67 | 68 | # Sphinx documentation 69 | docs/_build/ 70 | 71 | # PyBuilder 72 | target/ 73 | 74 | # Jupyter Notebook 75 | .ipynb_checkpoints 76 | 77 | # pyenv 78 | .python-version 79 | 80 | # celery beat schedule file 81 | celerybeat-schedule 82 | 83 | # dotenv 84 | .env 85 | 86 | # virtualenv 87 | .venv/ 88 | venv/ 89 | ENV/ 90 | 91 | # Spyder project settings 92 | .spyderproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # End of https://www.gitignore.io/api/python -------------------------------------------------------------------------------- /Stock_Market.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | 
"cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Stock Market Prediction" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 186, 13 | "metadata": { 14 | "collapsed": true 15 | }, 16 | "outputs": [], 17 | "source": [ 18 | "# Import dependencies\n", 19 | "import seaborn as sns\n", 20 | "import matplotlib.pyplot as plt\n", 21 | "import numpy as np\n", 22 | "import pandas as pd\n", 23 | "%matplotlib inline" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 187, 29 | "metadata": { 30 | "collapsed": true 31 | }, 32 | "outputs": [], 33 | "source": [ 34 | "# Read data\n", 35 | "df = pd.read_csv('data/Combined_News_DJIA.csv', parse_dates=True, index_col=0)" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 188, 41 | "metadata": { 42 | "collapsed": false 43 | }, 44 | "outputs": [ 45 | { 46 | "data": { 47 | "text/plain": [ 48 | "" 49 | ] 50 | }, 51 | "execution_count": 188, 52 | "metadata": {}, 53 | "output_type": "execute_result" 54 | }, 55 | { 56 | "data": { 57 | "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAiAAAAF6CAYAAAAgdOMQAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAAPYQAAD2EBqD+naQAAG05JREFUeJzt3X+U3XV95/Hn/KhuMmEgwdQKtBVE3jYBW4qV3ZWWHBds\nqXvsKigrawEN/gDK4u5STsWoFbEV648Uj6TLsYGtRhaKxNPNwuq2/BA5UE9Bu9DAm+4WKkWByCQO\nTmI1c2f/+H5HL9eZZHIz93O/mXk+zpkz934+n+/3+/565iuvfL6f770DU1NTSJIklTTY7wIkSdLi\nYwCRJEnFGUAkSVJxBhBJklScAUSSJBVnAJEkScUZQCRJUnEGEEmSVJwBRJIkFTfc7wIAImIlcA+w\nNjPvrNtOB94HHAmMAdcCH8rMqbr/HGAdcBiwFbgoM++t+4aAK4G3AEuB24B3ZeaTJc9LkiTNrO8z\nIBHxKqrwcRQwHS5OAP4MuCwzDwZOA84F/lPdvwa4CjgbGAU2AVsi4pB6t+uAU4ATgMOBXcDni5yQ\nJEnaq74GkIg4lyo8XNbR9fPAhsy8BSAzHwa+CLy67j8PuD4z78nMycxcDzwNvL7ufzvwkcx8IjOf\nBS4G1kTEkT09IUmSNCf9ngG5FTgqM29sb8zMmzPzkun3EbEEeC1wf920CnigY19bgeMi4mCq2zI/\n6s/Mp6lu4xw372cgSZL2WV/XgGTmU3sbExHLgC8AE1TrOgAOqt+32wksq3/YQ78kSeqzRixCnU1E\nBFX4+DawJjOnQ8UE1eLSdiPANn4cPDr7lwLPzuW4U1NTUwMDA13VLEnSIjen/4A2NoBExG9SLRy9\nBvi9zGy1dT8IHNuxySpgS2buiIhv1f1b6339DLCi3m6vxsYmGBw0gEiStK+WLx+Z07hGBpCI+JfA\nZuCdmXndDEM2Apsj4kbgbuBCYGW9zXT/uoj4GvAMsB64IzMfncvxW60pWq2p/TsJSZI0q0YGEOA9\nwBDwqYj4VFv7VzLztZl5W0RcAGwAjqCa2TgtM3fU4y4Hfgq4i2q9yG3Am4pVL0mS9mhgasp/6Xfa\ntu1Z/0eRJKkLK1ceNKc1DP1+DFeSJC1CBhBJklScAUSSJBVnAJEkScUZQCRJUnEGEEmSVJwBRJIk\nFWcAkSRJxRlAJElScQYQSZJUnAFEkiQVZwCRJEnFGUAkSVJxBhBJklScAUSSJBVnAJEkScUZQCRJ\nUnEGEEmSVJwBRJIkFWcAkSRJxRlAJElScQYQSZJUnAFEkiQVZwCRJEnFGUAkSVJxBhBJklScAUSS\nJBVnAJEkScUZQCRJUnHD/S5AkppiYmKCRx55uN9lSD13zDEvY2RkpK81GEAkqfbIIw9z+Q0f5pDD\nD+13KVLP7HjiGd5/5ns5/vgT+lqHAUSS2hxy+KG84MgX9rsMacFzDYgkSSrOACJJkoozgEiSpOIM\nIJIkqTgDiCRJKs4AIkmSijOASJKk4gwgkiSpOAOIJEkqzgAiSZKKa8RHsUfESuAeYG1m3lm3nQhc\nBawGngauyMyNbducA6wDDgO2Ahdl5r113xBwJfAWYClwG/CuzHyy2ElJkqRZ9X0GJCJeRRU+jgKm\n6rblwC3AdcAosBb4ZEScXPevoQonZ9f9m4AtEXFIvdt1wCnACcDhwC7g80VOSJIk7VVfA0hEnEsV\nHi7r6Dod2JaZGzKzlZm31+PW1v3nAddn5j2ZOZmZ66lmSV5f978d+EhmPpGZzwIXA2si4sgen5Ik\nSZqDfs+A3AoclZk3drSvBh7oaHsIOG4P/VuB4yLiYKrbMj/qz8yngbG27SVJUh/1dQ1IZj41S9dB\nwM6Otp3Asvr1MmBilv7pMbP179Xg4ACDgwNzGSppARka6ve/yaQyhoYGGR7u7997IxahzuB7VGs3\n2i0FxuvXE/X7diPANn4cPDr7lwLPzuXgK1aMMDBgAJEWm9HRJ
f0uQSpidHQJy5eP9LWGpgaQB4HX\ndLStqtun+4+doX9LZu6IiG/V/VsBIuJngBVt2+/R2NiEMyDSIjQ+vqvfJUhFjI/vYvv2zhsF82Ou\nwaapAeRm4KMRcTFwNXAScBbwurp/I7A5Im4E7gYuBFYCm9v610XE14BngPXAHZn56FwO3mpN0WpN\nzde5SDpATE62+l2CVMTkZIvdu/v7997IG56ZOQacCrwR+A5wDdXnfNxZ998GXABsoFpceiZwWmbu\nqHdxOfA/gbuAx4HnAW8qeQ6SJGl2jZkByczBjvf3Uc18zDZ+E9WjuTP17QbeU/9IkqSGaeQMiCRJ\nWtgMIJIkqTgDiCRJKs4AIkmSijOASJKk4gwgkiSpOAOIJEkqzgAiSZKKM4BIkqTiDCCSJKk4A4gk\nSSrOACJJkoozgEiSpOIMIJIkqbjhfhew2ExMTPDIIw/3uwyp54455mWMjIz0uwxJDWUAKeyRRx7m\n/euv5+BDD+93KVLPfPeZJ7j83W/m+ONP6HcpkhrKANIHBx96OIe+6CX9LkOSpL5xDYgkSSrOACJJ\nkoozgEiSpOIMIJIkqTgDiCRJKs4AIkmSijOASJKk4gwgkiSpOAOIJEkqzgAiSZKKM4BIkqTiDCCS\nJKk4A4gkSSrOACJJkoozgEiSpOIMIJIkqTgDiCRJKs4AIkmSijOASJKk4gwgkiSpOAOIJEkqzgAi\nSZKKM4BIkqTiDCCSJKm44X4XsCcRcQrwh8AxwARwI3BpZv4gIk4ErgJWA08DV2TmxrZtzwHWAYcB\nW4GLMvPewqcgSZJm0NgZkIhYAvwF8N8y82DgV4BfBy6NiOXALcB1wCiwFvhkRJxcb7uGKpycXfdv\nArZExCGFT0OSJM2gsQEEmAKeBYYiYggYAFpUMyGnA9/JzA2Z2crM26lCxtp62/OA6zPznsyczMz1\nVLMkbyh+FpIk6Sc0NoBk5veBM4EPAt8HvgkksJ7qtsv/6djkIeC4+vVq4IGO/q1t/ZIkqY8aG0Ai\n4meBzcCHgKXAscAqqkCyDNjZscnOup3698Qe+iVJUh81eRHqG4BvZ+bH6/dbI+JyqrUdnwM613Ms\nBcbr1xP1+3YjwLa5HHhwcIDBwYGuit6boaHGZj5pXg0NDTI8fGD9vXt9arFowvXZ5ACya4a23cAP\ngAeB13T0rarbqX8fO0P/lrkceMWKEQYGehNARkeX9GS/UtOMji5h+fKRfpexT7w+tVg04fpscgD5\nH8BHIuI9wJXAi6keq/0scDPw0Yi4GLgaOAk4C3hdve1GYHNE3AjcDVwIrKS6pbNXY2MTPZsBGR+f\nKVdJC8/4+C62b++8E9psXp9aLHp5fc412DQ2gGTmtyPiNcDHgN+lur3yWeCDmbk7Ik4F/hi4nOoJ\nl4sy885629si4gJgA3AE1YzIaZm5Yy7HbrWmaLWm5v2cACYnWz3Zr9Q0k5Mtdu8+sP7evT61WDTh\n+mxsAAHIzL8B1szSdx/VzMds226iejRXkiQ1jCuuJElScQYQSZJUnAFEkiQVZwCRJEnFGUAkSVJx\nBhBJklScAUSSJBVnAJEkScUZQCRJUnEGEEmSVJwBRJIkFWcAkSRJxRlAJElScQYQSZJUnAFEkiQV\nZwCRJEnFGUAkSVJxBhBJklScAUSSJBVnAJEkScUZQCRJUnEGEEmSVJwBRJIkFWcAkSRJxRlAJElS\ncQYQSZJUnAFEkiQVZwCRJEnFGUAkSVJxBhBJklTcPgeQiPi5iPiJ7SJiKCJOmJ+yJEnSQtbNDMhj\nwAtmaD8KuGu/qpEkSYvC8FwGRcSFwCVtTX8TEZMdw5YD/zhfhUmSpIVrTgEEuI5q1mMAeD9wAzDR\n1j8FfA+4aT6LkyRJC9OcAkhmTgAfBIgIgD+q2yRJkvbZXGdAfiQzfz8ilkbEicDzqGZF2vu/Ml/F\nSZKkhWmfA0hEvBbYBIzO0
D0FDO1vUZIkaWHb5wACXAl8CfgwMD6/5UiSpMWgmwByNHBmZv7dfBcj\nSZIWh24+B+T/AivnuxBJkrR4dDMDcglwVUSsAx4C/rm9MzO/OR+FSZKkhaubAHJL/fuLM/S5CFWS\nJO1VNwHk1fNexSwiYgWwHvhNqttFdwDnZ+ZT9WPAVwGrgaeBKzJzY9u25wDrgMOArcBFmXlvqdol\nSdLsuvkckDt6UMdsvgB8h+p7ZlpUn8j6mYj4baqZmHXAfwVOBr4YEf8vM++MiDVU4eQ3gK8BFwFb\nIuLozNxRsH5JkjSDbj4H5FqqWy0zysy37VdFPz7OCcCJwE9n5vfqtrdTzWicAWzLzA318NsjYhOw\nFrgTOA+4PjPvqfvXR8Q7gDcAG5EkSX3VzS2YI3luAHke1QzFKPDf56Oo2iupbp28IyLOB0aA/wX8\nF6rbLg90jH8ImA4/q4HPdPRvBY6bx/okSVKXurkFs6azLSIGgQ3AtnmoadoK4OVUt1B+iSqAfBb4\nM+BJYGfH+J3Asvr1Mp77ZXmd/Xs0ODjA4ODA3gd2YWiomyefpQPP0NAgw8MH1t+716cWiyZcn93M\ngPyEzGxFxMeBu6jWZcyH6cd7352ZPwAmIuK9wF8D1wJLO8Yv5cefzDoxQ/8IcwxIK1aMMDDQmwAy\nOrqkJ/uVmmZ0dAnLl4/0u4x94vWpxaIJ1+e8BJDaC6n+Iz9ftlJ90d3zgR/UbdP1fgO4oGP8KuDB\n+vWDwLEz9G+Zy4HHxiZ6NgMyPr6rJ/uVmmZ8fBfbtx9YX5rt9anFopfX51yDTTeLUD/Ac9eADACH\nAP8e+N/7ur89+DLwKLAxIs6lmtH4MLAZ+DxweURcDFwNnAScBbyu3nYjsDkibgTuBi6k+vTWzXM5\ncKs1Ras16zrb/TI52erJfqWmmZxssXv3gfX37vWpxaIJ12c3N4De2vFzLtXndNwKnD9fhWXmbqrH\na3cDfw8k8E3gbZk5BpwKvJHqMd1rqD7n485629uoZkg2AGPAmcBpPoIrSVIzdLMI9cU9qGO2Y30b\nePMsffdRzXzMtu0mYFOPSpMkSfuhqzUgETEAvIbqKZUfUq3X+KvMnJzH2iRJ0gLVzRqQQ6jWZ7wC\n2EF1G2cUuD8iTvE2hyRJ2ptu1oB8DFgCvDwzV2TmIcDxwL8APjKfxUmSpIWpmwDyW8DvZOb0I69k\n5t9SPWny+vkqTJIkLVzdBJCfAp6aof1pqlsxkiRJe9RNALmfn/wQMOq2b+xfOZIkaTHo5imY9wJ3\nRMSJVB/yBfCrVN/Xctp8FSZJkhaufZ4Bqb/i/iTgW8CvU4WOE4A1mfmX81ueJElaiPY5gETErwBf\nAv4hM1dn5i8A/wjcFBGd378iSZL0E7p9DPcm4D1tbS8FbgE+MR9FSZKkha2bAPLLwJWZOf0NtdPf\n23Il8K/mqzBJkrRwdRNAxoGXzNB+OOB3WUuSpL3q5imYm4CrI+J84F5gCngl8Gng5nmsTZIkLVDd\nBJDLqGZAvtTRfjPwu/tdkSRJWvD2OYBk5gTwbyPiGH78bbgPZeYj812cJElamLqZAQGgDhyGDkmS\ntM+6WYQqSZK0XwwgkiSpOAOIJEkqzgAiSZKKM4BIkqTiDCCSJKk4A4gkSSrOACJJkoozgEiSpOIM\nIJIkqTgDiCRJKs4AIkmSijOASJKk4gwgkiSpOAOIJEkqzgAiSZKKM4BIkqTiDCCSJKk4A4gkSSrO\nACJJkoozgEiSpOIMIJIkqTgDiCRJKs4AIkmSijOASJKk4gwgkiSpuOF+F7A3ETEE/BXwaGa+tW47\nEbgKWA08DVyRmRvbtjkHWAccBmwFLsrMe0vXLkmSZnYgzIB8ADgJmAKIiOXALcB1wCiwFvhkRJxc\n96+hCidn1/2bgC0RcUjpwiVJ0swaHUAi4tXA64EvAAN18+nAtszckJmtzLydKmSsrfvPA67
PzHsy\nczIz11PNkryhcPmSJGkWjQ0gEfFC4DPAWcAu6hkQqtsuD3QMfwg4bg/9W9v6JUlSnzVyDUhEDAKf\nBT6emQ9ExFRb90HAzo5NdgLL6tfLgIk99O/V4OAAg4MDex/YhaGhxmY+aV4NDQ0yPHxg/b17fWqx\naML12cgAArwH2JmZn67ft6eBCarFpe2WAuNt/Us7+keAbXM9+IoVIwwM9CaAjI4u6cl+paYZHV3C\n8uUj/S5jn3h9arFowvXZ1ADyFuCwiNhev18KEBG/BVwKnNoxfhXwYP36QeDYGfq3zPXgY2MTPZsB\nGR/f1ZP9Sk0zPr6L7ds7JyObzetTi0Uvr8+5BptGBpDM/IX29xFxLTCVmW+LiEOBj0bExcDVVE/I\nnAW8rh6+EdgcETcCdwMXAiuBzXM9fqs1Ras1tfeBXZicbPVkv1LTTE622L37wPp79/rUYtGE6/OA\nu+GZmc9QzYC8EfgOcA3V53zcWfffBlwAbADGgDOB0zJzR38qliRJnRo5A9Jp+gPI2t7fRzXzMdv4\nTVSP5kqSpAY64GZAJEnSgc8AIkmSijOASJKk4gwgkiSpOAOIJEkqzgAiSZKKM4BIkqTiDCCSJKk4\nA4gkSSrOACJJkoozgEiSpOIMIJIkqTgDiCRJKs4AIkmSijOASJKk4gwgkiSpOAOIJEkqzgAiSZKK\nM4BIkqTiDCCSJKk4A4gkSSrOACJJkoozgEiSpOIMIJIkqTgDiCRJKs4AIkmSijOASJKk4gwgkiSp\nOAOIJEkqzgAiSZKKM4BIkqTiDCCSJKk4A4gkSSrOACJJkoozgEiSpOIMIJIkqTgDiCRJKs4AIkmS\nijOASJKk4gwgkiSpOAOIJEkqbrjfBexJRPwi8DHgl4EfAl8C/nNmPhMRJwJXAauBp4ErMnNj27bn\nAOuAw4CtwEWZeW/hU5AkSTNo7AxIRCwBbgW+CrwQWAUcClwbEYcAtwDXAaPAWuCTEXFyve0aqnBy\ndt2/CdhSbydJkvqssQEE+Dng68Dlmbk7M8eAa4BXA6cD38nMDZnZyszbqULG2nrb84DrM/OezJzM\nzPVUsyRvKH8akiSpU2NvwWRmAq/taD4DuJ/qtssDHX0PAW+rX68GPtPRvxU4bp7LlCRJXWjyDMhz\nRMQVVIHkfOAgYKJjyE5gWf162V76JUlSHzV2BmRaRIwC1wLHA7+WmX8XERNA53qOpcB4/Xqift9u\nBNg2l2MODg4wODjQfdF7MDR0wGQ+ab8MDQ0yPHxg/b17fWqxaML12egAEhEvoVps+hjwinodCMCD\nwGs6hq+q26f7j52hf8tcjrtixQgDA70JIKOjS3qyX6lpRkeXsHz5SL/L2Cden1osmnB9NjaARMRy\n4DbgL4HzMnOqrftm4KMRcTFwNXAScBbwurp/I7A5Im4E7gYuBFYCm+dy7LGxiZ7NgIyP7+rJfqWm\nGR/fxfbtnXdCm83rU4tFL6/PuQabxgYQ4K3AzwJnAm+KiOn2qcwcjYhTgT8GLqd6wuWizLwTIDNv\ni4gLgA3AEVQzIqdl5o65HLjVmqLVmtr7wC5MTrZ6sl+paSYnW+zefWD9vXt9arFowvXZ2ACSmZ8A\nPrGH/vuoZj5m699E9WiuJElqGFdcSZKk4gwgkiSpOAOIJEkqzgAiSZKKM4BIkqTiDCCSJKk4A4gk\nSSrOACJJkoozgEiSpOIMIJIkqTgDiCRJKs4AIkmSijOASJKk4gwgkiSpOAOIJEkqzgAiSZKKM4BI\nkqTiDCCSJKk4A4gkSSrOACJJkoozgEiSpOIMIJIkqTgDiCRJKs4AIkmSijOASJKk4gwgkiSpOAOI\nJEkqzgAiSZKKM4BIkqTiDCCSJKk4A4gkSSrOACJJkoozgEiSpOIMIJIkqTgDiCRJKs4AIkmSijOA\nSJKk4gwgkiSpOAOIJEkqzgAiSZKKM4BIkqTiDCCSJKm
44X4X0CsR8dPANcDJwG7gc8AlmTnZ18Ik\nSdKCngG5ARgHXgS8EjgFeG9fK5IkScACDSARcTTVzMelmfn9zHwU+BBwXn8rkyRJsEADCLAaGMvM\nJ9vaHgKOiIjRPtUkSZJqC3UNyEHAREfbzvr3MqpbM7MaHBxgcHCgF3UxNDTId595oif7lpriu888\nwdDQIMPDB9a/cYaGBtnxxDP9LkPqqR1PPNOI63NgamqqrwX0QkS8HrgmM1e2tR0H/C1wcGY+27fi\nJEnSgr0F8yBwaP0kzLRVwOOGD0mS+m9BzoAARMRXgH8C3gGsBP4C+PPMvLyvhUmSpAU7AwJwBtUa\nl0eBe4FbqZ6EkSRJfbZgZ0AkSVJzLeQZEEmS1FAGEEmSVJwBRJIkFWcAkSRJxRlAJElScQYQSZJU\n3EL9LhiJ+pNwr6H6ZuTdwOeASzJzsq+FSXqOiFgJ3AOszcw7+12PynAGRAvZDVRfPPgi4JXAKcB7\n+1qRpOeIiFdRhY+jAD+YahExgGhBioijqWY+Ls3M72fmo1SfhHtefyuTNC0izgU2AZf1uRT1gQFE\nC9VqYCwzn2xrewg4IiJG+1STpOe6FTgqM2/sdyEqzzUgWqgOAiY62nbWv5dR3ZqR1EeZ+VS/a1D/\nOAOihWoCWNrRNv3+2cK1SJI6GEC0UD0IHFo/CTNtFfB4ZhpAJKnPDCBakDLz74GvAusjYllEHAms\nA/60v5VJksAAooXtDKp1To8C91ItePtQXyuSJAEwMDXlY9eSJKksZ0AkSVJxBhBJklScAUSSJBVn\nAJEkScUZQCRJUnEGEEmSVJwBRJIkFWcAkSRJxRlAJElScQYQScVExGMR8YEut31xRLQi4l/vx/Gn\n9/Fr3e5D0vwwgEgqaar+kbTIGUAkSVJxw/0uQJIAIuL5wO8DbwKOAJ4Fvgz8TmaOtQ391Yj4E+Cl\nwDeACzLz6237eStwKfDzwGPAnwCfykxnXqQGcQZEUlN8FHgz8FbgaOBs4FRgXce4S6iCyi8BW4Gv\nRsSLACLiHcDH6v5V9ba/B3yk59VL2ifOgEhqiq8BN2XmXfX7xyPiy8DLO8a9PzNvBoiIdwL/BrgA\neF/98+HMvKEe+1hEHAx8OiLe1/MzkDRnBhBJjZCZmyLilIj4A6oZkFXAy4CvdAy9q22b3RFxP7A6\nIl4AHA5cEREfbBs/CDwfOBL4516eg6S58xaMpEaIiKuBG6jCwhbgLOB6YKBj6GTH+yGqYDH9/2fv\nBn6x7ec44BjgH3pSuKSuOAMiqe8i4lDgXcCZmfnnbe2rgPGO4a8AHqr7n1+/vzozn46IbcDRmXlN\n2z7OAM4Afru3ZyFpXxhAJJU0ALw0In6jo30S+C7w7yLi68AS4CJgNfD1jrF/FBHbqWY03gc8D/h0\n3Xcl8AcR8ThwC9VtnA3Alsz8YUT04JQkdcMAIqmkKeA/1D/t/olqluITwAPAU8CfAu8EroqIJW3b\nv5/qSZcXA38NnJKZOwAy8xMRsQv4j/WYp4CNPPdJGh/HlRpgYGrKa1GSJJXlIlRJklScAUSSJBVn\nAJEkScUZQCRJUnEGEEmSVJwBRJIkFWcAkSRJxRlAJElScQYQSZJUnAFEkiQVZwCRJEnF/X8EZEAj\nB27LVgAAAABJRU5ErkJggg==\n", 58 | "text/plain": [ 59 | "" 60 | ] 61 | }, 62 | "metadata": {}, 63 | "output_type": "display_data" 64 | } 65 | ], 66 | "source": [ 67 | "# Plot class distribution\n", 68 | "sns.countplot(x='Label', data=df)" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": 189, 74 | 
"metadata": { 75 | "collapsed": false 76 | }, 77 | "outputs": [ 78 | { 79 | "data": { 80 | "text/html": [ 81 | "
\n", 82 | "\n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " 
\n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | "
LabelTop1Top2Top3Top4Top5Top6Top7Top8Top9...Top16Top17Top18Top19Top20Top21Top22Top23Top24Top25
Date
2008-08-080b\"Georgia 'downs two Russian warplanes' as cou...b'BREAKING: Musharraf to be impeached.'b'Russia Today: Columns of troops roll into So...b'Russian tanks are moving towards the capital...b\"Afghan children raped with 'impunity,' U.N. ...b'150 Russian tanks have entered South Ossetia...b\"Breaking: Georgia invades South Ossetia, Rus...b\"The 'enemy combatent' trials are nothing but...b'Georgian troops retreat from S. Osettain cap......b'Georgia Invades South Ossetia - if Russia ge...b'Al-Qaeda Faces Islamist Backlash'b'Condoleezza Rice: \"The US would not act to p...b'This is a busy day: The European Union has ...b\"Georgia will withdraw 1,000 soldiers from Ir...b'Why the Pentagon Thinks Attacking Iran is a ...b'Caucasus in crisis: Georgia invades South Os...b'Indian shoe manufactory - And again in a se...b'Visitors Suffering from Mental Illnesses Ban...b\"No Help for Mexico's Kidnapping Surge\"
2008-08-111b'Why wont America and Nato help us? If they w...b'Bush puts foot down on Georgian conflict'b\"Jewish Georgian minister: Thanks to Israeli ...b'Georgian army flees in disarray as Russians ...b\"Olympic opening ceremony fireworks 'faked'\"b'What were the Mossad with fraudulent New Zea...b'Russia angered by Israeli military sale to G...b'An American citizen living in S.Ossetia blam...b'Welcome To World War IV! Now In High Definit......b'Israel and the US behind the Georgian aggres...b'\"Do not believe TV, neither Russian nor Geor...b'Riots are still going on in Montreal (Canada...b'China to overtake US as largest manufacturer'b'War in South Ossetia [PICS]'b'Israeli Physicians Group Condemns State Tort...b' Russia has just beaten the United States ov...b'Perhaps *the* question about the Georgia - R...b'Russia is so much better at war'b\"So this is what it's come to: trading sex fo...
2008-08-120b'Remember that adorable 9-year-old who sang a...b\"Russia 'ends Georgia operation'\"b'\"If we had no sexual harassment we would hav...b\"Al-Qa'eda is losing support in Iraq because ...b'Ceasefire in Georgia: Putin Outmaneuvers the...b'Why Microsoft and Intel tried to kill the XO...b'Stratfor: The Russo-Georgian War and the Bal...b\"I'm Trying to Get a Sense of This Whole Geor...b\"The US military was surprised by the timing ......b'U.S. troops still in Georgia (did you know t...b'Why Russias response to Georgia was right'b'Gorbachev accuses U.S. of making a \"serious ...b'Russia, Georgia, and NATO: Cold War Two'b'Remember that adorable 62-year-old who led y...b'War in Georgia: The Israeli connection'b'All signs point to the US encouraging Georgi...b'Christopher King argues that the US and NATO...b'America: The New Mexico?'b\"BBC NEWS | Asia-Pacific | Extinction 'by man...
2008-08-130b' U.S. refuses Israel weapons to attack Iran:...b\"When the president ordered to attack Tskhinv...b' Israel clears troops who killed Reuters cam...b'Britain\\'s policy of being tough on drugs is...b'Body of 14 year old found in trunk; Latest (...b'China has moved 10 *million* quake survivors...b\"Bush announces Operation Get All Up In Russi...b'Russian forces sink Georgian ships 'b\"The commander of a Navy air reconnaissance s......b'Elephants extinct by 2020?'b'US humanitarian missions soon in Georgia - i...b\"Georgia's DDOS came from US sources\"b'Russian convoy heads into Georgia, violating...b'Israeli defence minister: US against strike ...b'Gorbachev: We Had No Choice'b'Witness: Russian forces head towards Tbilisi...b' Quarter of Russians blame U.S. for conflict...b'Georgian president says US military will ta...b'2006: Nobel laureate Aleksander Solzhenitsyn...
2008-08-141b'All the experts admit that we should legalis...b'War in South Osetia - 89 pictures made by a ...b'Swedish wrestler Ara Abrahamian throws away ...b'Russia exaggerated the death toll in South O...b'Missile That Killed 9 Inside Pakistan May Ha...b\"Rushdie Condemns Random House's Refusal to P...b'Poland and US agree to missle defense deal. ...b'Will the Russians conquer Tblisi? Bet on it,...b'Russia exaggerating South Ossetian death tol......b'Bank analyst forecast Georgian crisis 2 days...b\"Georgia confict could set back Russia's US r...b'War in the Caucasus is as much the product o...b'\"Non-media\" photos of South Ossetia/Georgia ...b'Georgian TV reporter shot by Russian sniper ...b'Saudi Arabia: Mother moves to block child ma...b'Taliban wages war on humanitarian aid workers'b'Russia: World \"can forget about\" Georgia\\'s...b'Darfur rebels accuse Sudan of mounting major...b'Philippines : Peace Advocate say Muslims nee...
\n", 256 | "

5 rows × 26 columns

\n", 257 | "
" 258 | ], 259 | "text/plain": [ 260 | " Label Top1 \\\n", 261 | "Date \n", 262 | "2008-08-08 0 b\"Georgia 'downs two Russian warplanes' as cou... \n", 263 | "2008-08-11 1 b'Why wont America and Nato help us? If they w... \n", 264 | "2008-08-12 0 b'Remember that adorable 9-year-old who sang a... \n", 265 | "2008-08-13 0 b' U.S. refuses Israel weapons to attack Iran:... \n", 266 | "2008-08-14 1 b'All the experts admit that we should legalis... \n", 267 | "\n", 268 | " Top2 \\\n", 269 | "Date \n", 270 | "2008-08-08 b'BREAKING: Musharraf to be impeached.' \n", 271 | "2008-08-11 b'Bush puts foot down on Georgian conflict' \n", 272 | "2008-08-12 b\"Russia 'ends Georgia operation'\" \n", 273 | "2008-08-13 b\"When the president ordered to attack Tskhinv... \n", 274 | "2008-08-14 b'War in South Osetia - 89 pictures made by a ... \n", 275 | "\n", 276 | " Top3 \\\n", 277 | "Date \n", 278 | "2008-08-08 b'Russia Today: Columns of troops roll into So... \n", 279 | "2008-08-11 b\"Jewish Georgian minister: Thanks to Israeli ... \n", 280 | "2008-08-12 b'\"If we had no sexual harassment we would hav... \n", 281 | "2008-08-13 b' Israel clears troops who killed Reuters cam... \n", 282 | "2008-08-14 b'Swedish wrestler Ara Abrahamian throws away ... \n", 283 | "\n", 284 | " Top4 \\\n", 285 | "Date \n", 286 | "2008-08-08 b'Russian tanks are moving towards the capital... \n", 287 | "2008-08-11 b'Georgian army flees in disarray as Russians ... \n", 288 | "2008-08-12 b\"Al-Qa'eda is losing support in Iraq because ... \n", 289 | "2008-08-13 b'Britain\\'s policy of being tough on drugs is... \n", 290 | "2008-08-14 b'Russia exaggerated the death toll in South O... \n", 291 | "\n", 292 | " Top5 \\\n", 293 | "Date \n", 294 | "2008-08-08 b\"Afghan children raped with 'impunity,' U.N. ... \n", 295 | "2008-08-11 b\"Olympic opening ceremony fireworks 'faked'\" \n", 296 | "2008-08-12 b'Ceasefire in Georgia: Putin Outmaneuvers the... 
\n", 297 | "2008-08-13 b'Body of 14 year old found in trunk; Latest (... \n", 298 | "2008-08-14 b'Missile That Killed 9 Inside Pakistan May Ha... \n", 299 | "\n", 300 | " Top6 \\\n", 301 | "Date \n", 302 | "2008-08-08 b'150 Russian tanks have entered South Ossetia... \n", 303 | "2008-08-11 b'What were the Mossad with fraudulent New Zea... \n", 304 | "2008-08-12 b'Why Microsoft and Intel tried to kill the XO... \n", 305 | "2008-08-13 b'China has moved 10 *million* quake survivors... \n", 306 | "2008-08-14 b\"Rushdie Condemns Random House's Refusal to P... \n", 307 | "\n", 308 | " Top7 \\\n", 309 | "Date \n", 310 | "2008-08-08 b\"Breaking: Georgia invades South Ossetia, Rus... \n", 311 | "2008-08-11 b'Russia angered by Israeli military sale to G... \n", 312 | "2008-08-12 b'Stratfor: The Russo-Georgian War and the Bal... \n", 313 | "2008-08-13 b\"Bush announces Operation Get All Up In Russi... \n", 314 | "2008-08-14 b'Poland and US agree to missle defense deal. ... \n", 315 | "\n", 316 | " Top8 \\\n", 317 | "Date \n", 318 | "2008-08-08 b\"The 'enemy combatent' trials are nothing but... \n", 319 | "2008-08-11 b'An American citizen living in S.Ossetia blam... \n", 320 | "2008-08-12 b\"I'm Trying to Get a Sense of This Whole Geor... \n", 321 | "2008-08-13 b'Russian forces sink Georgian ships ' \n", 322 | "2008-08-14 b'Will the Russians conquer Tblisi? Bet on it,... \n", 323 | "\n", 324 | " Top9 \\\n", 325 | "Date \n", 326 | "2008-08-08 b'Georgian troops retreat from S. Osettain cap... \n", 327 | "2008-08-11 b'Welcome To World War IV! Now In High Definit... \n", 328 | "2008-08-12 b\"The US military was surprised by the timing ... \n", 329 | "2008-08-13 b\"The commander of a Navy air reconnaissance s... \n", 330 | "2008-08-14 b'Russia exaggerating South Ossetian death tol... \n", 331 | "\n", 332 | " ... \\\n", 333 | "Date ... \n", 334 | "2008-08-08 ... \n", 335 | "2008-08-11 ... \n", 336 | "2008-08-12 ... \n", 337 | "2008-08-13 ... \n", 338 | "2008-08-14 ... 
\n", 339 | "\n", 340 | " Top16 \\\n", 341 | "Date \n", 342 | "2008-08-08 b'Georgia Invades South Ossetia - if Russia ge... \n", 343 | "2008-08-11 b'Israel and the US behind the Georgian aggres... \n", 344 | "2008-08-12 b'U.S. troops still in Georgia (did you know t... \n", 345 | "2008-08-13 b'Elephants extinct by 2020?' \n", 346 | "2008-08-14 b'Bank analyst forecast Georgian crisis 2 days... \n", 347 | "\n", 348 | " Top17 \\\n", 349 | "Date \n", 350 | "2008-08-08 b'Al-Qaeda Faces Islamist Backlash' \n", 351 | "2008-08-11 b'\"Do not believe TV, neither Russian nor Geor... \n", 352 | "2008-08-12 b'Why Russias response to Georgia was right' \n", 353 | "2008-08-13 b'US humanitarian missions soon in Georgia - i... \n", 354 | "2008-08-14 b\"Georgia confict could set back Russia's US r... \n", 355 | "\n", 356 | " Top18 \\\n", 357 | "Date \n", 358 | "2008-08-08 b'Condoleezza Rice: \"The US would not act to p... \n", 359 | "2008-08-11 b'Riots are still going on in Montreal (Canada... \n", 360 | "2008-08-12 b'Gorbachev accuses U.S. of making a \"serious ... \n", 361 | "2008-08-13 b\"Georgia's DDOS came from US sources\" \n", 362 | "2008-08-14 b'War in the Caucasus is as much the product o... \n", 363 | "\n", 364 | " Top19 \\\n", 365 | "Date \n", 366 | "2008-08-08 b'This is a busy day: The European Union has ... \n", 367 | "2008-08-11 b'China to overtake US as largest manufacturer' \n", 368 | "2008-08-12 b'Russia, Georgia, and NATO: Cold War Two' \n", 369 | "2008-08-13 b'Russian convoy heads into Georgia, violating... \n", 370 | "2008-08-14 b'\"Non-media\" photos of South Ossetia/Georgia ... \n", 371 | "\n", 372 | " Top20 \\\n", 373 | "Date \n", 374 | "2008-08-08 b\"Georgia will withdraw 1,000 soldiers from Ir... \n", 375 | "2008-08-11 b'War in South Ossetia [PICS]' \n", 376 | "2008-08-12 b'Remember that adorable 62-year-old who led y... \n", 377 | "2008-08-13 b'Israeli defence minister: US against strike ... 
\n", 378 | "2008-08-14 b'Georgian TV reporter shot by Russian sniper ... \n", 379 | "\n", 380 | " Top21 \\\n", 381 | "Date \n", 382 | "2008-08-08 b'Why the Pentagon Thinks Attacking Iran is a ... \n", 383 | "2008-08-11 b'Israeli Physicians Group Condemns State Tort... \n", 384 | "2008-08-12 b'War in Georgia: The Israeli connection' \n", 385 | "2008-08-13 b'Gorbachev: We Had No Choice' \n", 386 | "2008-08-14 b'Saudi Arabia: Mother moves to block child ma... \n", 387 | "\n", 388 | " Top22 \\\n", 389 | "Date \n", 390 | "2008-08-08 b'Caucasus in crisis: Georgia invades South Os... \n", 391 | "2008-08-11 b' Russia has just beaten the United States ov... \n", 392 | "2008-08-12 b'All signs point to the US encouraging Georgi... \n", 393 | "2008-08-13 b'Witness: Russian forces head towards Tbilisi... \n", 394 | "2008-08-14 b'Taliban wages war on humanitarian aid workers' \n", 395 | "\n", 396 | " Top23 \\\n", 397 | "Date \n", 398 | "2008-08-08 b'Indian shoe manufactory - And again in a se... \n", 399 | "2008-08-11 b'Perhaps *the* question about the Georgia - R... \n", 400 | "2008-08-12 b'Christopher King argues that the US and NATO... \n", 401 | "2008-08-13 b' Quarter of Russians blame U.S. for conflict... \n", 402 | "2008-08-14 b'Russia: World \"can forget about\" Georgia\\'s... \n", 403 | "\n", 404 | " Top24 \\\n", 405 | "Date \n", 406 | "2008-08-08 b'Visitors Suffering from Mental Illnesses Ban... \n", 407 | "2008-08-11 b'Russia is so much better at war' \n", 408 | "2008-08-12 b'America: The New Mexico?' \n", 409 | "2008-08-13 b'Georgian president says US military will ta... \n", 410 | "2008-08-14 b'Darfur rebels accuse Sudan of mounting major... \n", 411 | "\n", 412 | " Top25 \n", 413 | "Date \n", 414 | "2008-08-08 b\"No Help for Mexico's Kidnapping Surge\" \n", 415 | "2008-08-11 b\"So this is what it's come to: trading sex fo... \n", 416 | "2008-08-12 b\"BBC NEWS | Asia-Pacific | Extinction 'by man... 
\n", 417 | "2008-08-13 b'2006: Nobel laureate Aleksander Solzhenitsyn... \n", 418 | "2008-08-14 b'Philippines : Peace Advocate say Muslims nee... \n", 419 | "\n", 420 | "[5 rows x 26 columns]" 421 | ] 422 | }, 423 | "execution_count": 189, 424 | "metadata": {}, 425 | "output_type": "execute_result" 426 | } 427 | ], 428 | "source": [ 429 | "# Check the data\n", 430 | "df.head()" 431 | ] 432 | }, 433 | { 434 | "cell_type": "code", 435 | "execution_count": 213, 436 | "metadata": { 437 | "collapsed": false 438 | }, 439 | "outputs": [ 440 | { 441 | "name": "stdout", 442 | "output_type": "stream", 443 | "text": [ 444 | "['Top1', 'Top2', 'Top3', 'Top4', 'Top5', 'Top6', 'Top7', 'Top8', 'Top9', 'Top10', 'Top11', 'Top12', 'Top13', 'Top14', 'Top15', 'Top16', 'Top17', 'Top18', 'Top19', 'Top20']\n" 445 | ] 446 | } 447 | ], 448 | "source": [ 449 | "# Select only the top N news to use as features of the classifier. N ranges from 1 to 25.\n", 450 | "# In this case, N = 20. \n", 451 | "columns = ['Top' + str(i+1) for i in range(20)]\n", 452 | "print columns\n", 453 | "\n" 454 | ] 455 | }, 456 | { 457 | "cell_type": "code", 458 | "execution_count": 214, 459 | "metadata": { 460 | "collapsed": true 461 | }, 462 | "outputs": [], 463 | "source": [ 464 | "# Create a new column with the Top N news joined.\n", 465 | "df['joined'] = df[columns].apply(lambda x: ' '.join(x.astype(str)), axis=1)" 466 | ] 467 | }, 468 | { 469 | "cell_type": "code", 470 | "execution_count": 215, 471 | "metadata": { 472 | "collapsed": true 473 | }, 474 | "outputs": [], 475 | "source": [ 476 | "# Create a new dataframe with only Label and joined columns\n", 477 | "df1 = df[['Label', 'joined']].copy()" 478 | ] 479 | }, 480 | { 481 | "cell_type": "code", 482 | "execution_count": 216, 483 | "metadata": { 484 | "collapsed": false 485 | }, 486 | "outputs": [ 487 | { 488 | "data": { 489 | "text/html": [ 490 | "
\n", 491 | "\n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | "
Labeljoined
Date
2008-08-080b\"Georgia 'downs two Russian warplanes' as cou...
2008-08-111b'Why wont America and Nato help us? If they w...
2008-08-120b'Remember that adorable 9-year-old who sang a...
2008-08-130b' U.S. refuses Israel weapons to attack Iran:...
2008-08-141b'All the experts admit that we should legalis...
\n", 532 | "
" 533 | ], 534 | "text/plain": [ 535 | " Label joined\n", 536 | "Date \n", 537 | "2008-08-08 0 b\"Georgia 'downs two Russian warplanes' as cou...\n", 538 | "2008-08-11 1 b'Why wont America and Nato help us? If they w...\n", 539 | "2008-08-12 0 b'Remember that adorable 9-year-old who sang a...\n", 540 | "2008-08-13 0 b' U.S. refuses Israel weapons to attack Iran:...\n", 541 | "2008-08-14 1 b'All the experts admit that we should legalis..." 542 | ] 543 | }, 544 | "execution_count": 216, 545 | "metadata": {}, 546 | "output_type": "execute_result" 547 | } 548 | ], 549 | "source": [ 550 | "# Take a look\n", 551 | "df1.head()" 552 | ] 553 | }, 554 | { 555 | "cell_type": "code", 556 | "execution_count": 217, 557 | "metadata": { 558 | "collapsed": false 559 | }, 560 | "outputs": [ 561 | { 562 | "name": "stdout", 563 | "output_type": "stream", 564 | "text": [ 565 | "(1611, 2)\n", 566 | "(378, 2)\n" 567 | ] 568 | } 569 | ], 570 | "source": [ 571 | "# According to the author of the dataset, the data should be split as it:\n", 572 | "# Train set: from 2008-08-08 to 2014-12-31 \n", 573 | "# Test set: from 2015-01-02 to 2016-07-01\n", 574 | "\n", 575 | "train = df1.ix['2008-08-08':'2014-12-31'].shape\n", 576 | "test = df1.ix['2015-01-02':'2016-07-01'].shape\n", 577 | "\n", 578 | "print train\n", 579 | "print test" 580 | ] 581 | }, 582 | { 583 | "cell_type": "code", 584 | "execution_count": 218, 585 | "metadata": { 586 | "collapsed": false 587 | }, 588 | "outputs": [], 589 | "source": [ 590 | "from sklearn.feature_extraction.text import TfidfVectorizer" 591 | ] 592 | }, 593 | { 594 | "cell_type": "code", 595 | "execution_count": 219, 596 | "metadata": { 597 | "collapsed": false 598 | }, 599 | "outputs": [], 600 | "source": [ 601 | "# Create a tfidf object. 
Remove English stop words and use 10000 features.\n", 602 | "vect = TfidfVectorizer(max_features=10000, stop_words='english')" 603 | ] 604 | }, 605 | { 606 | "cell_type": "code", 607 | "execution_count": 220, 608 | "metadata": { 609 | "collapsed": true 610 | }, 611 | "outputs": [], 612 | "source": [ 613 | "# Transform the joined column into a tfidf sparse matrix\n", 614 | "X = vect.fit_transform(df1['joined'])" 615 | ] 616 | }, 617 | { 618 | "cell_type": "code", 619 | "execution_count": 221, 620 | "metadata": { 621 | "collapsed": true 622 | }, 623 | "outputs": [], 624 | "source": [ 625 | "from sklearn.decomposition import TruncatedSVD\n", 626 | "from sklearn.pipeline import make_pipeline\n", 627 | "from sklearn.preprocessing import Normalizer" 628 | ] 629 | }, 630 | { 631 | "cell_type": "code", 632 | "execution_count": 222, 633 | "metadata": { 634 | "collapsed": true 635 | }, 636 | "outputs": [], 637 | "source": [ 638 | "# Using tf-idf followed by SVD is known as LSA (latent semantic analysis), also called LSI.\n", 639 | "svd = TruncatedSVD(1000)\n", 640 | "normalizer = Normalizer(copy=False)\n", 641 | "lsa = make_pipeline(svd, normalizer)\n" 642 | ] 643 | }, 644 | { 645 | "cell_type": "code", 646 | "execution_count": 223, 647 | "metadata": { 648 | "collapsed": true 649 | }, 650 | "outputs": [], 651 | "source": [ 652 | "# Apply lsa\n", 653 | "X = lsa.fit_transform(X)" 654 | ] 655 | }, 656 | { 657 | "cell_type": "code", 658 | "execution_count": 224, 659 | "metadata": { 660 | "collapsed": false 661 | }, 662 | "outputs": [ 663 | { 664 | "name": "stdout", 665 | "output_type": "stream", 666 | "text": [ 667 | "(1611, 1000)\n", 668 | "(378, 1000)\n" 669 | ] 670 | } 671 | ], 672 | "source": [ 673 | "# Split data into train and test\n", 674 | "X_train = X[:1611]\n", 675 | "print X_train.shape\n", 676 | "X_test = X[1611:]\n", 677 | "print X_test.shape\n" 678 | ] 679 | }, 680 | { 681 | "cell_type": "code", 682 | "execution_count": 225, 683 | "metadata": { 684 | "collapsed": true 685 | }, 686 | "outputs": [], 687 | 
"source": [ 688 | "# Get labels\n", 689 | "y_train = df1['Label'].ix['2008-08-08':'2014-12-31']\n", 690 | "y_test = df1['Label'].ix['2015-01-02':'2016-07-01']" 691 | ] 692 | }, 693 | { 694 | "cell_type": "code", 695 | "execution_count": 226, 696 | "metadata": { 697 | "collapsed": false 698 | }, 699 | "outputs": [], 700 | "source": [ 701 | "from sklearn.ensemble import RandomForestClassifier, VotingClassifier\n", 702 | "from sklearn.neighbors import KNeighborsClassifier\n", 703 | "from sklearn.svm import LinearSVC" 704 | ] 705 | }, 706 | { 707 | "cell_type": "code", 708 | "execution_count": 227, 709 | "metadata": { 710 | "collapsed": false 711 | }, 712 | "outputs": [], 713 | "source": [ 714 | "# Create four different classifiers: Random Forest, KNN, Support Vector Machine and an ensemble of these 3.\n", 715 | "rf = RandomForestClassifier()\n", 716 | "knn = KNeighborsClassifier(n_neighbors=3)\n", 717 | "svm = LinearSVC()\n", 718 | "vc = VotingClassifier(estimators=[('rf',rf), ('knn',knn), ('svm',svm)])" 719 | ] 720 | }, 721 | { 722 | "cell_type": "code", 723 | "execution_count": 228, 724 | "metadata": { 725 | "collapsed": false 726 | }, 727 | "outputs": [ 728 | { 729 | "data": { 730 | "text/plain": [ 731 | "VotingClassifier(estimators=[('rf', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',\n", 732 | " max_depth=None, max_features='auto', max_leaf_nodes=None,\n", 733 | " min_impurity_split=1e-07, min_samples_leaf=1,\n", 734 | " min_samples_split=2, min_weight_fraction_leaf=0.0,\n", 735 | " n_...ax_iter=1000,\n", 736 | " multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n", 737 | " verbose=0))],\n", 738 | " n_jobs=1, voting='hard', weights=None)" 739 | ] 740 | }, 741 | "execution_count": 228, 742 | "metadata": {}, 743 | "output_type": "execute_result" 744 | } 745 | ], 746 | "source": [ 747 | "# Train them\n", 748 | "rf.fit(X_train,y_train)\n", 749 | "knn.fit(X_train,y_train)\n", 750 | "svm.fit(X_train,y_train)\n", 751 | 
"vc.fit(X_train,y_train)" 752 | ] 753 | }, 754 | { 755 | "cell_type": "code", 756 | "execution_count": 229, 757 | "metadata": { 758 | "collapsed": false 759 | }, 760 | "outputs": [ 761 | { 762 | "name": "stdout", 763 | "output_type": "stream", 764 | "text": [ 765 | "Accuracy rf is 0.497354497354\n", 766 | "Accuracy knn is 0.529100529101\n", 767 | "Accuracy svm is 0.468253968254\n", 768 | "Accuracy vc is 0.515873015873\n" 769 | ] 770 | } 771 | ], 772 | "source": [ 773 | "# Check the accuracies\n", 774 | "\n", 775 | "ac_rf = rf.score(X_test, y_test)\n", 776 | "ac_knn = knn.score(X_test, y_test)\n", 777 | "ac_svm = svm.score(X_test, y_test)\n", 778 | "ac_vc = vc.score(X_test, y_test)\n", 779 | "\n", 780 | "print \"Accuracy rf is\", ac_rf\n", 781 | "print \"Accuracy knn is\", ac_knn\n", 782 | "print \"Accuracy svm is\", ac_svm\n", 783 | "print \"Accuracy vc is\", ac_vc" 784 | ] 785 | }, 786 | { 787 | "cell_type": "markdown", 788 | "metadata": {}, 789 | "source": [ 790 | "# Conclusion\n", 791 | "\n", 792 | "The low accuracy achieved in these experiments (roughly 50% on a binary label) suggests that there is probably no correlation between Reddit's Top News and the Dow Jones index. A more complex model might find an underlying pattern; however, there is too little data to try deep learning. 
" 793 | ] 794 | }, 795 | { 796 | "cell_type": "code", 797 | "execution_count": null, 798 | "metadata": { 799 | "collapsed": true 800 | }, 801 | "outputs": [], 802 | "source": [] 803 | } 804 | ], 805 | "metadata": { 806 | "kernelspec": { 807 | "display_name": "Python 2", 808 | "language": "python", 809 | "name": "python2" 810 | }, 811 | "language_info": { 812 | "codemirror_mode": { 813 | "name": "ipython", 814 | "version": 2 815 | }, 816 | "file_extension": ".py", 817 | "mimetype": "text/x-python", 818 | "name": "python", 819 | "nbconvert_exporter": "python", 820 | "pygments_lexer": "ipython2", 821 | "version": "2.7.10" 822 | } 823 | }, 824 | "nbformat": 4, 825 | "nbformat_minor": 1 826 | } 827 | --------------------------------------------------------------------------------