├── .gitignore ├── LICENSE ├── MNIST ├── .ipynb_checkpoints │ └── MNIST-checkpoint.ipynb └── MNIST.ipynb ├── README.md ├── chapter01 ├── Simple_tokenization.py └── wsj-short.txt ├── fortest.py ├── lstm ├── Word embedding.ipynb └── keras lstm.ipynb └── nltk_tutorial ├── Readme.md └── test.py /.gitignore: -------------------------------------------------------------------------------- 1 | lstm/20_newsgroup/ 2 | lstm/glove.6B/ 3 | MNIST/.ipython_checkpoints/ 4 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2016 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /MNIST/.ipynb_checkpoints/MNIST-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "using samples to train and validate." 
8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 6, 13 | "metadata": { 14 | "collapsed": false 15 | }, 16 | "outputs": [], 17 | "source": [ 18 | "from sklearn.datasets import fetch_mldata\n", 19 | "from sklearn.cross_validation import train_test_split\n", 20 | "import matplotlib.pyplot as plt\n" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 7, 26 | "metadata": { 27 | "collapsed": false 28 | }, 29 | "outputs": [ 30 | { 31 | "name": "stdout", 32 | "output_type": "stream", 33 | "text": [ 34 | "Shape of Data: (10000, 784)\n", 35 | "Shape of Target (10000,)\n" 36 | ] 37 | } 38 | ], 39 | "source": [ 40 | "mnist = fetch_mldata('MNIST original')\n", 41 | "data = mnist.data[0:10000]\n", 42 | "target= mnist.target[0:10000]\n", 43 | "print \"Shape of Data: \", data.shape\n", 44 | "print \"Shape of Target\", target.shape\n", 45 | "X, kaggle_x, Y, kaggle_y = train_test_split(data, target, train_size= 0.8)" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 12, 51 | "metadata": { 52 | "collapsed": false 53 | }, 54 | "outputs": [ 55 | { 56 | "ename": "SyntaxError", 57 | "evalue": "invalid syntax (, line 1)", 58 | "output_type": "error", 59 | "traceback": [ 60 | "\u001b[0;36m File \u001b[0;32m\"\"\u001b[0;36m, line \u001b[0;32m1\u001b[0m\n\u001b[0;31m def showtest()\u001b[0m\n\u001b[0m ^\u001b[0m\n\u001b[0;31mSyntaxError\u001b[0m\u001b[0;31m:\u001b[0m invalid syntax\n" 61 | ] 62 | } 63 | ], 64 | "source": [ 65 | "def showtest():\n", 66 | " sample = data[0]\n", 67 | " print \"Sample shape: \", sample.shape\n", 68 | "\n", 69 | " %matplotlib inline\n", 70 | "\n", 71 | " sample = sample.reshape(28,28)\n", 72 | " plt.imshow(sample, cmap='gray')\n", 73 | " plt.show()\n", 74 | "showtest()" 75 | ] 76 | } 77 | ], 78 | "metadata": { 79 | "kernelspec": { 80 | "display_name": "Python 2", 81 | "language": "python", 82 | "name": "python2" 83 | }, 84 | "language_info": { 85 | "codemirror_mode": { 86 | "name": "ipython", 87 | "version": 2 88 | }, 89 | "file_extension": ".py", 90 | "mimetype": "text/x-python", 91 | "name": "python", 92 | "nbconvert_exporter": "python", 93 | "pygments_lexer": "ipython2", 94 | "version": "2.7.12" 95 | } 96 | }, 97 | "nbformat": 4, 98 | "nbformat_minor": 1 99 | } 100 | -------------------------------------------------------------------------------- /MNIST/MNIST.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Use pipeline for MNIST" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 35, 13 | "metadata": { 14 | "collapsed": false 15 | }, 16 | "outputs": [], 17 | "source": [ 18 | "from sklearn.datasets import fetch_mldata\n", 19 | "from sklearn.cross_validation import train_test_split\n", 20 | "import matplotlib.pyplot as plt\n", 21 | "from sklearn.grid_search import GridSearchCV\n", 22 | "from sklearn.pipeline import Pipeline, FeatureUnion" 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": {}, 28 | "source": [ 29 | "Using samples to train and validate. This notebook now uses the full set of 70,000 MNIST samples for training and testing. 
" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 42, 35 | "metadata": { 36 | "collapsed": false 37 | }, 38 | "outputs": [ 39 | { 40 | "name": "stdout", 41 | "output_type": "stream", 42 | "text": [ 43 | "Shape of Data: (70000, 784)\n", 44 | "Shape of Target (70000,)\n" 45 | ] 46 | } 47 | ], 48 | "source": [ 49 | "mnist = fetch_mldata('MNIST original')\n", 50 | "data = mnist.data\n", 51 | "target= mnist.target\n", 52 | "print \"Shape of Data: \", data.shape\n", 53 | "print \"Shape of Target\", target.shape\n", 54 | "X, kaggle_x, Y, kaggle_y = train_test_split(data, target, train_size= 0.8)" 55 | ] 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "metadata": {}, 60 | "source": [ 61 | "Show test " 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 14, 67 | "metadata": { 68 | "collapsed": false 69 | }, 70 | "outputs": [ 71 | { 72 | "name": "stdout", 73 | "output_type": "stream", 74 | "text": [ 75 | "Sample shape: (784,)\n" 76 | ] 77 | }, 78 | { 79 | "data": { 80 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAWEAAAFfCAYAAACfj30KAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAAPYQAAD2EBqD+naQAAIABJREFUeJztvX2sbO9V3/d9zvs599x77di1fzQ0wsZxEhRhUVOoFVxc\nORKJK5lEiSAOEjUoUikkipBaEFJUO9AGhQjLDeAqUQIBFZBIKApIxj8Kwk7Mm1unJBiKEdTGgPn9\n8Eu49557zsx52/1jzpr7nTVrPXvPOTNnn5n5fqRHe+9n5pzZ8/adtdezXkrTNBBCCNEPG32fgBBC\nrDMSYSGE6BGJsBBC9IhEWAghekQiLIQQPSIRFkKIHpEICyFEj0iEhRCiRyTCQgjRI1t9n0Ap5WUA\nvgLAxwEM+j0bIYSYC3sAPg/A803TfKZ2x4WJcCnlmwD8DwCeA/DvAfydpmn+r+CuXwHghxd1HkII\n0SNfA+BHandYiDuilPLVAL4bwDsAfBFGIvx8KeXlwd0/vohzEEKIO8DH2+6wKJ/wNwP4J03T/FDT\nNL8J4BsAHAP4+uC+ckEIIVaVVn2buwiXUrYBvB7Az9lcMyrV9rMA3jDvxxNCiGVmEZbwywFsAnjR\nzb+IkX9YCCHEFbcZolYAqHixEEIQixDhTwO4APBKN/8KTFvHQgix1sxdhJumOQPwYQBvtrlSSrk6\n/sV5P54QQiwzi4oTfheAHyylfBjAhzCKljgA8C8W9HhCCLGULESEm6b5sauY4G/HyC3xqwC+omma\nTy3i8YQQYlkpfTf6LKX85xi5L4QQYtV4fdM0/652BxXwEUKIHpEICyFEj0iEhRCiRyTCQgjRIxJh\nIYToEYmwEEL0iERYCCF6RCIshBA9IhEWQogekQgLIUSPSISFEKJHJMJCCNEjEmEhhOgRibAQQvSI\nRFgIIXpEIiyEED0iERZCiB6RCAshRI9IhIUQokckwkII0SMSYSGE6BGJsBBC9IhEWAghekQiLIQQ\nPSIRFkKIHpEICyFEj0iEhRCiR7b6PgEh7iKllOqc7c+6bbsNAJqmmdr329ptXf4mOxa3j0RYCKKU\nMiGO2f7GxsZ4btb92lzTNLi8vETTNFPDz/Ox7UdzPABM7fs5cbtIhIW4wgtiJpSlFGxubmJjY2M8\n+Njf1nWUUnB5eTkhqLVxcXHRum/Hkah7cQZkGfeBRFiIK7zw1rabm5vY2trC5uZmOPxt9je1442N\njbGAXlxcjEft+Pz8fOK2aH5jY2PKUvYWMyAB7guJsBAEC3FmrZrIzjJYmLPt5ubmWDy7bNvG2dkZ\nzs/PxxY2W9dsdbObRUJ8+0iEhQCmXBCZa8HEcnt7e2psbW2l83YbC7M/NhFmAa0d++HnvZvj4uJi\nwrfNXFxc9PCqC0AiLMQYL8TebcBuiJ2dHezs7GB7e3u8nx37kc1vbm6G4pqN09PTqWFibuILYGwB\nl1LGQsw0TZOKs1g8EmEhCG8NR/5eFtrd3d3xNtv3wpwJtokwi2oktjY3HA4xHA7H+1tbWxgOhxMC\nDIxElq1gv1AnAe4XibAQV2TuCO+/NcvVxHZvb6+6zcTZb7e2tkJxzbbD4RCDwQCDwWDCAt7YeJaD\nxVYwz21ubk6Fu0mI+0EiLAQR+YZZiE2ETTj39vawt7eH/f39qX3berHO9re3tzEYDCYE1vb9MYuv\nd0HYAptZwN7fa7eZEEuA+0UiLATq4uujIbw7Yn9/f2IcHBxMHEdCHQ0T4S7DRLvNAvYi7OODffyz\nuH0kwmKtyISGF91s4S1aPDM3BAtt234muv42Hy3hIyf84OgHv+/F1fzI5nc2wZY7on8kwmLl8SnH\nfmuxv36xze+zX9e7HbwLguf4f3DYGgunP9foRyFLLeb7s9/aHnNvb2/CrWE+ZbaeLy8vxzHFihW+\nXSTCYqWJUo6jYULXtsDGczb8sZ9nMWdr1/tx7XyjmGTOcItSqyOftT1u5M5g8TcBZneGuD0kwmKl\nyepA+H0TrmxxLVts6zJ8aBq7F7wvF8CUVWsCnD0nFmG/aGiLeScnJzg+Ph5b4F6AbV7cPhJhsbJE\nVc98CrLte0uYfboHBwcT+97F0ObCiJI0apawt269AEeJJPwYu7u7Y7fD3t4eTk9Px+IfCbD5iyXC\n/SARFitNZjX6mhDegjThvXfv3njY8f7+/oSo+sSLKCvOpyqzJex9wnyekQDzc/AuCEvoMPG15A57\nTBbgi4uLiQU7Dm8Tt8fcRbiU8g4A73DTv9k0zRfM+7GEaCMTYL9lEbZFtXv37uHw8BCHh4cT+ybC\ntZoRPJcV7fFWOZ9vJsBc6MeSO+zcs/Tm8/Pzschy6NrZ2RmGw+HEQqG4fRZlCX8EwJsB2E/8+YIe\nR4hWsvhffzkfuSNMfO/fvz8eBwcHM1VQi9Kf+QcgWpizZAqes1KXW1tb40pqW1tbnSupRQI8GAzG\nPxQS4X5YlAifN03zqQX9byE6
EUUR8GC3QOSOYAF+8ODBeBwcHEyVoPSxvNHiW608pj9vE0SrgGai\n7Yu1b29vhzWEfelL4JkL4vz8fCzAli4duUbE7bAoEf7TpZQ/ADAA8EsAvq1pmt9b0GMJUcUvzEVi\nmVnCLMIPHz7Ew4cPce/evaprI7J0fQJFlFDhz5Wt4qhdUVTknTtr8Jzdny3gk5OTse9alnB/LEKE\nfxnA2wF8FMDnAHgngH9TSvnzTdM8XcDjCZGSWcK1ehDsirh3796ECL/0pS/FwcFBGmURZbDVEkV4\ny+dr8cBG1BOOreJaOyQW4NPT07EAWwyz+YRlBffD3EW4aZrn6fAjpZQPAfhdAF8F4Afm/XhCAHE6\ncpRFxgtmvM9Zbpx2nEVJdLFsb2JZsnhntDX/5GOLE+ZEkixu2dod+ccSi2HhIWpN0zwqpfwWgNcs\n+rHEepClHvs5nz2WhZLt7Ozg4OBg7PO1aAhLzGBr0Vu4kVh2EVCgu7Bl96s17Ixeq2iBMOoSwmFq\nta2EeT4sXIRLKYcAPh/ADy36scT6EFmdUSpyVM83qu977969CQHm4jvR4lVt1PDCFQlZF3GriWG2\n0Je5YtglwyLcpTvzLOcsYhYRJ/yPAPwURi6IPwng72MUovaj834ssX5EwpttOYMtq/PA0RD3798f\nxwKbCPuaD20iXCNrLd+l5XxN+LK/Yf+yj8ioCbH9z6grM3dn5seO/NeiG4uwhD8XwI8AeBmATwH4\nIID/smmazyzgscQa4he+stAvKzvJ9SB8xTP2/3q/r/0dRxBwFEEkvpkYR0KbbaO/qc1F85k1HLkj\nfLIJgHCxjx/Hbo8y7KI5kbOIhbm3zft/CuHhzLIoHdmXpoxqQUTH/raaJWznwefTBl/O10S4TZAj\nC7T2Os3ijgCexRR7S5/F10dx+Aw/CXE3VDtCLB01YWFrNbKEfR0IG5mVzCKcZbjVtkxmDd9UjLNQ\nN97O4o7gIu8sxvZ4FrvsxdcLr4S4GxJhsVREC3JZplpkCXMmHNeEODg4SBt3WhgXR0fw+fj9mnXq\nRTc65vvW9v1js4XKr5Xtd3VHWFxxKSVtjRQ978w9IepIhMXSkVnCXBzHhCWyhE2AuR7E/v5+2Jae\nw9myqmf+3DyZbzeKOKhZw34u8kf716iLO8K/Zvy//YIbL9JllrCEeDYkwmIpyVwRUUPOyBLmehBc\nlCeqhMbbNhHOaAv5yqzgtsU7JotXjhYvs9fLuniwAHtLnW9vE10JcjsSYbFUZBaet+Z8kfbMErZ0\n5P39/bQgjy/OM0smXJsAdbGGs609f9uP/MM1103mjvBNP+3cstKbLMSyhmdHIizuNJF1F9V98INb\nFXFBnsgvbDWCowgLv9/V8o3wghulGPv71Y5ZFO0YwNTCmf+xYuHlTiD2g1XKqDtzVIAoey7eX8zn\nKkGuIxEWvVFb1Mq2ZuH6wVEMNmduB5+AYQtuvs1QVnznJsJrROIbbWfZzyx1m+PXja8WOEHFOm9c\nXFyMBXMwGIwLwtvt2b79XVZMiJ+fiJEIi1sjEzPvYoiG3c8W29oaa+7u7o5F+P79+xO1IHzsr6/5\nWzuH6xBZi1Glsy4lKXmb+a0t1hfAhLuCS3Za+yMWUrv/YDAYt0bKxvb2Nk5PT6vlNLmMpsiRCIuF\n0xZBUKtI5vd9xEOt3bzVA87qQfh+b21W8DzEOCs3yeLlC7JnW1+IyOb4dWVx9SIcCfDm5uZYhIfD\n4bhhKG95n7t48LnZa3V+fj72MUuMYyTCYqHUQqh436/gZ8fRYluUZMHdMXgbZcFFvs+2SmmzULOE\nvYVr7Yhsy/s8d35+PmH929/b62klKXmO3REmlrzIZ1b0YDDAYDAYCy7vm899OBxic3Nz6tw2NjZw\nfv6sm5l1BeF4YzGJRFjcCm3+3yiEKqoH0bUtPW95n+tBcBv4zPpelE84cjeYJdnFF3t2doa9vb2x\nmHoBNsHNLGH+m42NjYmFOhNhG7u7u+N9Dt/b3NwcuydMkDl0jZM+RI5EWNwq3sdrWy++2ZZLUHJX\n5MzqNbH2ropoYa6rb/q6RO4IFmDufsHDLv/9sEt9rufAAsz+WLZ0vQuCBXh/fx8nJyc4OTmZaIHE\nVw78mvFiIL8+dl6bm5sTjUbFNBJhsTCymNVoLgqjiganFpvLwXy/VgeCi7LXFu+6ivBNaVuYYyuY\nLWFzAXiXgImydyewAJu1G1nC/DfRYp29tsfHx6EA89VDFDttFvDFxcWNw/rWAYmwuBUyd4R3RWR1\nIGzfuyOi1vS2GMduh6i7RibCfG5+/7pkIWreJeFF2LsGeLALggV1e3t7wkq2+/jQNU7tZt+ub38U\nWcD+vePnyAuMvtaGmEYiLBZKFl3grc22OhCZCHsB5rG3t1ftK8f/d3Nzc+qcs+1NaLOEvRVsLgE/\nBoPB+Jy8vzdaeLP78L5FU3i3yPHx8cTVgo8i8T50/7zsuUQ/bmIaibBYOFmoVyTEWR0I3y+uJsRW\nE4LbEvliNd7S9tZaFskxK23xwd4nzEJsFq816bRhQmyvmc9+s9Az7xNmP26WuXd5eTnlrvGvEy++\n8fNkAT47Oxv/nQS4jkRYLIQui1z2hfY1H6Kt7fsUZK4J7Kuj7e7uti70sWW3aLK05cwiPjs7m3BJ\nmPg+ffoUx8fHE9avCbBFT3DUhC3amRiaRZyJo+8iEvl1I7cKi693XYgcibBYCFHEQyZ+kavAD7uN\ns+B8EgbX/vUikmXD9fWaeIsUGAmbd79kLoCaK8AL+enp6UTmXJvfO3MV1a5S/Ln2+TovGxJhsRD4\nUjlyA/B+VoAnmucsOBZhX4C9LRX5Nl+HzO/tQ7a6iHBkmUY+Zo60sPjd6HXw/ytbNM2qr/n3NjtH\nCXGORFgsBF6Nz4SWfZldt+wDtq1PRebLaS9+WSjabbwefC5WgMeTCXBmDbMV7QWY/ctZNiJXY7Pz\n5PONfkC8FSxL+GZIhMVCiFbtfcWzKGysbd6HpnHacmYJZ1bgbYmDf8xIfO1coquFSIB9XG600Mfu\nCO8S4nKYfI58LpH1HrkjMmtd/uBuSITFQohW7jlbjfcjUc4EOyvY46ujZcLVtz+YLVebL6WM43kj\ny7Lm27b/5YXYuyMse42tcD4H9k3zebUJcRRlElnsIkciLBaCWXw+ftUX2WELlgXXz3XZZpXR2sZt\nvBaRBcvzJsJZWFgWnwt0W5izNGbvi247t0iAfTRL9IMhAe6ORFgshMgS9r3euL4DC6kXVi+40eJd\nlFjQpwsiej1qQmflHrsKcG1hLvIJX15eThT04XMw65gtY39+fC616AhZwrMjERYLwb6wnBrL9R44\nvte7JmrD1zCoDS8C2f6isccxsTWBMp8sz0Xilvlas4W5yB0RFc+xx7bH9+fcFiEhn/B8kAiLhcAW\nlBWUYUuYkyu40A4Xao/mrHiMt8yiuWihibd+f5GvheH9wibEllDRxSdcs4SzhbnMAjYhz
nzCPpoi\nsoIVHXEzJMJiIbS5IzjpwkTYL7JF++znrS26+ct+f263Df8A+HRfw16vaLGLxW3WhTkr2sPnEkVK\nREKdWcJd4pmjiAsxjURYLIS2hTmu9eDr/dZGFN5Vo2sN20iA5gGLbpsgzcsSZgG2v+fHYFG9uLho\nXaxri46QAN8MibCYmS6X99niWRSC1rVi16z4cLBoe5372PPMRtuiVPTD4H26vqiPHyywmQ+WLVz7\nX/45bGxsdG7IKTFdDBJhMRNZtIGf8ynIkfD6GGG/6DaPcKeooHo22DdamwMwZR1mx14UeevnTAz9\nAlu02MZCnP1QRZXb+H1kizYS4S6vt3y/N0ciLDpTs/b8cZSqnCVk+DoRbXUIZiGqWMbCFM1Freh5\neAHz+/x68Xnw+UT7tfKWkSWcuSei/2ePZfflc+bbo3PvcszzEuTZkAiLmYhWzKP9rCBP5orgxptZ\n/drrkrWZr7Wd562fA54lVfD28vIS29vbE0LHeGs6srojd0QmxF2y5/ic+f0zAfYlL+eJhLgbEmHR\nmdqCjR9ZgZ4oI44L72StdK5DJHIsdFnHY+8G8APA+Hmcn59jZ2dnytLkULTsfLLzyyxiL8Y+CiR7\nrnZfL8BnZ2fjHw9fAN6/77VjcTMkwmImorAl7we1iIg2VwRbwVEarA/NuolPuE3Y/NZGdAxg3EJo\nd3d3QsB87C2fgz8n7yqZZVGO28tHz5X/fmdnZyrO194T/kGatyUsuiERFp3xi3BZJhV/0bNawV6I\no9q083BHZJZhJrp+4cvPnZ+fA0DYuYJfj0jUMvGtddioiTG/JpkFzff1Auw7cNTe8+y4DVnN7UiE\nxUx4V0QkvmwJ1xbjsqI7baFXsxL5WzPrsssopYQ93Dj21otazQ/sBTg616hOMP+/SITtftvb21OJ\nM1ELpOtER4ibIxEWMxH5hKMsqq4WsIlwloo8jxC1SOi8hWu92Xib7ZdSplwQbGVG0QZ2Ll402wS4\nZgnz/+T/YQtu9l5YKJvvRxcJMb/P0Xsv5o9EWHQmSkrIqmq1xQmzJWz1IGrjpjHCmZXI9RV4DIfD\niS3Pmwjba8KvgflZu7gjuizG1Sz26IfFBJivSuw98c1AMxGW2N4uEmExE16AIyFus4KjRbmoDkTN\nCi6lhJfPmfBlPmEvwsPhsHXYuUQLXZllWXNHdImKiASZ/97eCxNgP7iSnf2wsM+5bWFOlvHikAgL\nAHHqsZ/zoWZR+JntP3z4EA8ePMD9+/cnmnFa7WAT7Mjl4EU3ywTzGWhe6MzN4K1atm55LrKIeZjl\nWEoZC/hwOAxD66zQUM3F4MfZ2RkePXqER48e4fHjx3jy5AmOjo5wfHyMk5OT8bnaeXjh5loQ/geS\nn7OP9vBZgfaaR+4mc7fY32StqvxrYrWM/XsVvX/rhkRYVFOQ+disqVrRdRv379/HgwcPxh2RuQ+c\nfTkzAfbn5anF2fKxieRgMJiwZPnY9lnc+JI/Oi6ljAV7e3t7LOJRbHMUiZFtz87OxuJr26dPn+Lp\n06c4Pj6eOleLwri8vBzXgDAh9lcq/EPCbghOSPE+YXa1cHYdx0X7q5pap2z/fvn3kt/fdUIivOZE\nFmgkjJaA0dbnzUpP3r9/f1wl7fDwcKotvfWBix6/tghXu5T3C16np6cYDAYYDAY4OTkZ7/txcnIy\nEYLGl/6RYLIlbAKc1XDg/5uFwvGxWb+2PTo6wtOnT8fnbxa7WcIWk2yPbQLsB4t3JMLeCvaWsNWX\n8Pc9PT0NLeHIIua/9T+YPLduSIRF6ov1qchRn7hsWKlK655hrYyijsiziDAwXRchSzE2ET45OWkd\n1gIoyqTz283Nzan+bSzCjF3+Z9a190ub5cvD3BFsCXPa8uXlZZrFaPPsVmHh9z9i0WeCLWEv1sPh\nMEw/jyxh/qE0/72fs/d3nZAIrzmRANdSkVmEuVecFWr3cya+bAlzXHCbLzjzB2d+Vp43d8PJyQmO\nj4/Hwx8fHx+POxJHllq0b+KZJZXYfbe3tydEtrZ/enpaPcfBYDDlm85+MP1cV0s48wlHAlxKSTtl\nR35heyz7W4sw4bl1E2BAIiyu8F+6KC2ZIxt8myLuGXd4eDgWar/N2tLbOdi2y8JclF3GW3Y3mJCx\ndcn7p6enU4/hYT8mC7D3A7OlvrW1NRY/FsJs8c8s3mxr7giroha5kKL9yCfshZhfb7aCLfoicld4\nd4SPhvGWMP9Q+ve5aZq1FOOZRbiU8kYA/yOA1wP4HAB/pWman3T3+XYAfwvASwD8AoD/vmma3775\n6Yp54/2+kfhGYU5Rrzjz/5oIZ8062R3hFwCjLZNliEU+V7aET05OxqJrflbeci2GbJGS59kSZvH1\nPw62KJZFaPA2Gn5Rkd0RbX58HizekQC3uSOieXZR1Sxhs4ZNeK3+hn/d2txQq8p1LOF7AH4VwPcD\n+HF/YynlWwH8bQD/LYCPAfifATxfSvlzTdOc+vuL/smE2KcS2xfMd002AbaQtAcPHoQLNb54u7cg\nI+s3E2JvBUfJDJE7wha7/OLX6elp+CMUDQBTx3ZeTdNMnJeFh3URV58YEoXIeXdE24+F7Uehdpk7\ngt0ZTdNMiTBHYnRdmLPQNj4/i3XmH691ZGYRbprmfQDeBwAl/tn6uwC+o2man7q6z9cCeBHAXwHw\nY9c/VbEovABH9SB863qzhM3yffDgAR48eICHDx/i4cOHU/V2+X9wPG1kCfN5eaLkiyjxgkV4MBhM\nuB9MgC0M7MmTJ1PuBf8DxMfAMxG2S2c+J/5BsMgEjsbIjjnyoW0Rz9KWIxdOtPVheLMszPljc09Y\nuFpkDUcLc77okBdgWcJzoJTyKgDPAfg5m2ua5nEp5VcAvAES4TuHv5yNAvR92mvkE75//z4ePnyI\nl7zkJXj48GFYkCcaPpqgy5fQX/az6LG1590RJsQmwibEjx8/Hsf8+h8LvpS2WFd+3drOp5QShslF\n+8PhcMq3Hfm7bT/zm0f7bXHCPla3JsAc0XB6etoqwPyaRiK8zgIMzH9h7jkADUaWL/Pi1W3iDjKL\nJWzuiKh1vVnCL33pS8dWblvY26xfvJolzJYiX/bXBNgy1KzaWJQJaNtMgCPxtdeslBKGxEWhc4PB\nIExj9qF3NrL3MiKKjuCMuah2BFv65prwI1qYy8Q4cj/4cDotzC2OgpE4i1umdolfSplyFdSGxQDz\noltkAdXCz7wl5Mm+fDYf+X+zug98uc/+Vu9btZEtcHl8eBy7V1iAt7ZGX68sYYTPK0pF9guQUXbZ\nLK9flLFW+xsmE3cOYeMfap/Us7+/DwDj18hHk/CP2boxbxF+ASPBfSUmreFXAPh/5vxYokLkF4zm\n2NXQZglazC/XgPAF2aOsMaNNdKN9PxeJLwtwlJIc1U3IFqU448z8rkxUtcxXLOOtuSP4R6G2QJbF\nJ2ev36JhoeRjvt3XluC1A1vAtRBAH9LnBdi7
p9aBuYpw0zQfK6W8AODNAP4DAJRSHgD4UgDfN8/H\nEjm1FXMWY4uA4NjOLPh+d3d3ogiP7w3HBXlqi23ApMDypWckOH4blaDMLGCfZRaVb/SX+CwGmf/S\nFqY2NzfTqmVcxCcKQ/NuAW/9ZpbrbQqxdwvwMX+G2BqOLOH9/f0JEebXNnrN1803fJ044XsAXoOR\nxQsAry6lvA7AZ5um+T0A7wbw90opvw3g4wC+A8DvA/jXczlj0Qm/4OaHzfsvDi++mf+XEzTYHRGF\nnPmIB48XES8wkejwsYlWFMIV1YXwl/yRPzTyM2er+Ga1et85H/M+gImFQr94mFnlkRXcp5/UBJgt\nY/4seRE2AebnCmDKDcERJddZI1gFrmMJfzGAn8fIx9sA+O6r+R8E8PVN03xXKeUAwD/BKFnj3wL4\ny4oRvl0y0fULZJElbF8gv2V3BGe+dXVHGJlPMrL+/GArOPMHeyFui5FlCzRKJODboypltf1SSpiq\nHFnlmQXc9yIVCzALMdDNEjY3kP0vYHpBcx7dtZeV68QJfwBA1XHTNM07AbzzeqckbkokwFltCG+9\n1CqkeRH2KchZDQWjtp8JkJ9vW5CLXBFRqcrMErZzj1by+Tl2HQAmHjOqnubPxV6T2riNz5AX3myb\n+YT5x+bi4mLitWQBthjtdRRgQLUjVhovwn5YKnLbqrYfvDDnIyJ8FhlQX3CLxDbyjXLhnDa/cOSS\naHMBWMhXtGJvoVS+TGQWcuevBrrUE+b6DTV3TB90sYR9dETWw85bwBZN0vUqahWRCK8gmRvCXy53\nCSviKmg1S9j7hIHJFXUvxJnFGy1Q8Zy3hP2iF1vC1o0isoazcDA+v0h8vZ898rn7qwEOaavtR/7w\naHtb+IW4WXzCZ2dn2NvbC3/sWIStFChHk6wbEuEVJXJHRItHNZ+wWb6+LGVtYc4eZxafby021s9F\nPmET2Kxoe5cyjia4fNy2uOmjQLL5KPEie561qwa/fxtEEREsxt4dYeGOmYXPFjBnKUaurHVBIrzC\nRBZalgmX+YU5My6KjogW5vyXyAtHZv12GVGNiMgdkfmEa9li7JLw4hrtR9tors2/m1nAtdfwNvFX\nMm0Lc+fn59jd3Z34YbEfI5/VaJ+9zJW1DkiEV5CaFRzVhvAhal6AbURtjLKqaPalywQ4soL5UjWb\na8uWiyzhWSqI+dexy2s9C3dNYLuS/bBG7ogoFdrcOvye1VxZ64REeMVg8eX02ayimff52nE0uCaw\n//JkftCu/lD+4mathS4vLzEYDMaF2K0ehG//4xfivPB6N0Qmgl3EcRkEdBH4q4PI5eULQPmrJh9t\n4l0564JEeIWIBJhdDX7fUpG5D5wXYp8hV3NB2GWrXwH3oVjRXCS80f5wOByLrhXm4S1bvlkYmLfS\nxM2I/OJRJI6fi3zo64hEeMnxH97o8pAX0fx+1hcuEuKsUI/35bEI+4W0aPg0Yj943rscoi7KtS4S\nXa1gMRvRIqYXYG/1+qundRViifCSEn1go5AhH3qWZcFlAswi7Kuq1RI0fGlHdhH41j4+XrY2sloR\nUXeKLBoii0YQ16dmBUuI60iEl5DI+uX9KG6TF9t4y4LLgswCbFvvY84iIrwlHC2csXjWCpn746xJ\npm8N5AXA0BPpAAAgAElEQVS4VsRcQnx9sjC9SITlkoiRCC8ZUSgU72cizAV4IrHNMuPYEs7a/fAX\nin3CHA86HA7TYuZt2WTelxy5NMwH7N0ckd9ZlvD88Z+/yB0RDYmwRHhpqYlx5BNm94MtxnHyBW8j\nAd7d3Z26pOQtn0OUFWWWr0U18LDaAtkCXtcFvmhEfmGJ8HyJElW6WMJyR4yQCC8RmR/Y38Z1ISJ3\nhPWFu3fvXlioh8WXR1YvoeYTZkvYrGBruGkjKmiTjWzBLpqvWcIS4PmTJQdFvmEvwLKExVJRy9qq\nLczxApwJMVu60T4fZ/4/HiZskU+Yux5z2/m2CmN8Wy311x9n/mUvxBLjm5FZwfxZzCxhL77rKMYS\n4SUnEuKaO4It4fv370/VD46Obb8r7BNmdwTH9nLbefbnRmLMJSijFN+s4pidQybAClGbHz5ErevC\n3DonaRgS4SUjcj/ULOGsQ/Lh4SEODw8nhJcTMqI5YFrosuMulrC1m7cICRZbX/uXG2G2vQaGuUWi\npA8J8HyIrNhMiKMQtcgvvG5IhJeELq6AUkpamCcrzpMlcviEjM3NzbDUYzY4tTgbZhV7S9hveZ9L\nKPrXwh+bCHMmHlvAckXMh2ytossQEuGlI7Mg7JiLtPPgVGXfRy4SXO4VFy241eJ6z8/P8eTJk/Ew\n/6+v81BLMa65ESLRjfb5h8JXbpP4iruCRHhJ8GKbRSl4AWbLNuqgzPeNesVxIgZHHfh4XB+3az5f\nFuKs2E6UVFGLaqgJL8/VavbKEhZ3BYnwEtEW/mOWcDSiNvZcjCeqcBVlwnHsby1jzaxfbwV7Ebbs\ntiySwbsT7Dwy4eU57zbJxFdCLPpEIrwk+PCfKBc/8gdnFrANE91IgLP4Xy7KzSnIvG8xwCbANixC\ngu8fVVLLylzy69G2jYrHK11Z3DUkwktETYhtsGuhizXs/cC1pou1LDifisy1fv0+938zf7BPKY7q\nC3OXBi4qzlu/7y1gCbG4a0iEl4AsGJ6t36hTRmYFszXshZz3/cKcDzuzDDgWWS+20fb4+HjCEvYC\nHA0W0Exw/X7XdkJC9IlEeInwi3KRBZtZwpk7Iius0paOzAkYPgvu6Ohowu0Q7XMn5IuLi9R1ECVW\nsHj6MCd/XItrlgiLu4BEeEmoWcLsishihLPh8/ijrRG5I3wGnGXBRYXWs/5vLMKzWq5ZrCm7LKKt\n3BHiriARXiIin3DUx6tLhIRZwl2TQIDJhbkoC+7Jkyd49OgRHj16NFV4vTY46sG2XQWzS8B/Fgkh\nARZ3AYnwkhBZwZkQd/UL7+zsVC/n/W0+Tph9wlwLgkU4KsDuB0c9GJFAZqLZJsQSW3GXkQgvCd4C\nzjLhuB4wZ8X5zDgbNcuQxZH9wb7VPPd7M6uYO1xwpwvf9eL8/PzGr41Edr6wK2eWvxHXQyK8BEQZ\ncVGkg5Wr9AXbOSkjS8JoK8pzeXnZyarl0ZZ+LPHsh7bFTJ7Te7R4JMJLAhfnYRH2hdd91wxfI8KL\ncBZD648vLy87dU6O0pmz9GNx95nFKpY1fD0kwksAh6RFIsy94w4ODnB4eDhhCZsImwsisoSzwug2\nZ52Ou46ox5uKqd9dagJ6HfeE6I5EeEkwIY6ad/q+cd4dwYV6OBEDmLaEa1lrUXPNrq6ITIhFP9Rc\nENn99X4tBonwkpAVao/6xtm4jiWc9W2LLOE2V0TW/00CfHe5qRDLJTE7EuElwMLTOCmD3RG+b1y2\nMNfFJ5zVB44qp7VZwl3qQIj+6JLoMsttsz6OGCERXhKyvnHmD+a+cTYXWcJZdETWINM32qwNFmHf\nySI
a4u4yTyHWj20difASwIka0cKc7xvnIyY4OsK7I2qWsBffLotxNiwVOetqIUv4biA/cP9IhJeE\nrs07rYOyjx/27Yu6WsJdrN4sC66teI6+4HeD6wixzXdZ4JM7oo5EeEloC1Gz0LTDw8OpLDrOrONa\nwUC7JZyJbptbwousajbcbbhGSHSb3rPFIRG+I9SsB18fgkXYxwrv7+9PFe6JumYwbAnfRIA5LE1f\n2tWnVjY0SvqJakPb360zEuEe6VLBDMBE1pvvkBzVD/aCy4txUR+2bEGORZj7wXEyhi/Ivu5fqGWl\n5h6qzUeCy22p+LOk5J0YiXCP8IIbb/0cF+HxTTqzwjxRv7ioSLtP1PC+4KiRp7d6fT1gsVzU3rOu\nwlyLNffi63sJrrsYS4R7JCpNGQ1eYIuEuM31EAmw4S0YbjnPVjCXpvTuB1nBy8t1BNjfJ3I5+Hjz\nSIglwCMkwj3BboGoaSfvZ1ZwJL7cPbmrENeiImqWcPZlEsvPLOLcRYgza1hp7BLhXvGWsG/Yafuz\nWMLb29tTbY98087IJZEtzHlLOEtNXvcv0jIyqw84uy2LsMkEOPrcrPPnZ6P9LpOUUt5YSvnJUsof\nlFIuSylvdbf/wNU8j/fO75RXB/b/+m7JPgKi5guu+YSjxp1MmwB38QnLCl4driPAtq35hKN1BFnC\nI65jCd8D8KsAvh/Ajyf3+WkAbwdg3/jhNR5n5cn6xXG0w9bWVlWIvRhbRlytc3LXbDmzeGs+Ybkj\nlpOaoHa9P9/mr6i6uCMkxCNmFuGmad4H4H0AUPJUmGHTNJ+6yYmtOiyInIgR9YibdVGORZe3kU/4\nptERfmFuHb9Eq8BNFuiiz1AXd0QmwOv2GZrZHdGRN5VSXiyl/GYp5T2llD+xoMdZaqJFuVqTzsga\nvmmcMNC+MBdZwZlPGFi/L9Eq01WAa0Jci47w4Y3r+NlZxMLcT2PkpvgYgM8H8J0A3ltKeUOzjq9w\nhS6WcLYoF/mF7W+zBBCeM6IQtcgv7LsnZ/49vcV3nyi6oe0+bURV8iJLOPMJr/OV1NxFuGmaH6PD\nXy+l/BqA3wHwJgA/P+/HW2a8SyISY+8jriVl2LD/Xdu2rWjXBNhbwuv+Jeob/lHN9nmhlhds/Y80\n0C7SfJy5Hzizkj9DUYo7f4bWkYWHqDVN87FSyqcBvAYS4TGRtRrdJ7Nk26xdo2maiQIsti2ldCrY\nY1+ewWDQGiEhEb5dsgxLP1dKqba84nUEw7sHsmNetB0MBhgMBjg+Pp4YT58+HW9tnz9P9lla16up\nhYtwKeVzAbwMwB8u+rGWnax+RJsQ2xxvmexys2YBewvGvjSZNbPOlsxt4d/bUkpo3UZzWQfuqLyp\nv0qq7dtnxAT45ORkPFh4TXyPj4/Ht7MIs0W8bswswqWUexhZtfaJeHUp5XUAPns13oGRT/iFq/v9\nQwC/BeD5eZzwutEmvJkFbNT8fSacUQcNjhG2L1hb1pws4cWQXSX50Ea/z3NRy6us0D8wHffrj234\nH2oWYBbho6OjCQHmz5Ovvrdun6HrWMJfjJFbobka3301/4MAvhHAFwL4WgAvAfBJjMT3f2qa5uzG\nZ7vi1Czf6Pbs76IoBT9Xs4SjTLm2WOF1vZRcJJn4GpxpGcWX8zZzR0Qtr2pxv7y9vLwM3RFsBR8f\nH+Po6AhHR0cTAlxzR6wb14kT/gDqoW1/6fqns55EPuEuroiaL9kLsPflAZOhabP4hJW2fPtErqda\nWKPfRiLMlnAkxFG4mT/2V0uZO8JE2Lu2orKo6/YZUu2InsnEl/czwc38xpEAR0IcxQZH1rB9waJ6\nwmzBrKMVsyjaIh7svfYizPHkvJ3FEgbyBAyfcOF/qNkSNhE+OjrC06dPcXJyMpX4433C6ybAgET4\nThHF8rYt0vm/zfBinH3Bok4a9gWLOmnIEl4stcVX9glbpIN1WvHbWXzCtc+HH5EAZz7hk5OTqTZY\n/FlaV5eWRPiOkPl/+fauURNtrojaFyzzCQ8Gg6nFOxXxWQw1XzBva30HreWVbWeJjgDqQsxXTJkl\nfHJyMhEZcXR0VP0MaWFO9EL0xfLbmt+3tlCXiW+02BJlynl3xHA4nLoUzbpriPnQ5XNhC3M+w9Ka\nv3I37q5xwl0s4cxlxVawDVuYGw6HU24NP9bRpSURvmO0RURkosz3zaIi+ItVs3KyhbnaAo18wosh\nE2IAUxmWUQduE+DDw0McHByMreRZLOFaNpy/Wqq5I4bDYRjmFqUvrxMS4TtCbfGl68io+YO7ZM2Z\nGPtQJeX+z5/svY/mIleEWcFsAZsAszuC645w0X/g2eclS0fOfqR5eDE+Pj7G6enpVNJHNNYNifAa\nk4mz32Ziu85fnEXALgZfgjQ6Pjw8xL1799LBbgizgn1oGqc2s+UbdVeJYsej5AufgKEf6ToS4TWg\nZnHUUlK7XDZKkG+Gv4Jp6zfI+23iy/uZGyKKDc6sXhNa3loWnA9jjGJ/Zf3GSIRXmNoHvO1LEbkr\nIp+yvkjXI1ts9Vlw7PP1KcldxNf2fUlUHyMMTFZEy/oMmquBi/WwJVxLwKiJ7zp/hiTCK0r0gc6s\n4Eh4M8u3JsDr+iWahbbwM7/Y5ov285z5e3lrwx9HzQCi+GB2R3jfr8+IM59vmwhHQmxEx+uGRHhN\naPvgdxHktkUVMRtZHDgnYLSlIkeWL/uB7TgS8qh6WuYTjoSYq6LN6o6wxzPW+fMjEV4Dsg97TXwj\nl0T2N6IbXVKRfQKGZcFF6cg1d4QX46iyWmQJtxVz8jWDZ1mY4xBGXT09QyK84kQCXFugqy3ORffN\n/p/IaUtFZp8w14Pw6chti3I8fH1hP4Dpynrn5+dTVnDNEo7qTWcp7dnnch2RCK8gmT8429aENxNh\n/tvsMcUzZk1F5noQOzs7aSpyJsR++FZGURxy5BPO3BEc/+uLtNcW5hi5I0ZIhFeYmvDaNrOAIzFu\ns17W+YvUlS6pyFEWHCdhcCJG1+HJfqizEDWfkMFCzPNmCfu05Fom3Lp/biTCa0ZmxXaJkqhZ2KI7\nNSFmd0RmCdfcDpllDCCNBfdzUYZclJr89OnTqdjhWSxhMUIivKK0uQ7aBDdaoBPXIyvI5Pej8LSo\nKE9tAY7Tk3n4lHMvjnx71m3bW8Emxny/WnSEiJEIryD+i8WXhryftbDnL46+QDm1BTbb51TkKCWZ\n57gXXJdFN6sD4WtAsFvh9PQUm5ubYSU0XwnP9p8+fYonT57g6OhoYmvFeHw0RK3xqz4/7UiEVwy2\ndqMMKB6+u4EvTanLyJxZC+1EXZD9dmtrK8x6y7YWKWFpyCbqLMJnZ2fY2NiYcC2waPq5s7OzcflJ\nXw/YesbVRJh/6JVR2Q2J8AriLy+jYtwmwmrceX3Ymo2sXh/tkMXqsgvCF91h94Of4yw4+z/2Q2Dv\n/enpKQCkHVP83Onp6VQ94Gj4bskq8n99JMIrCLsjasW4I3eE
LOFuRMKbuRksS813RY66JEfhZZG/\n1xIw+H+xO8Led2D0eeACPL55qz82S5fTk6Njn5gRfX4kxO1IhFeQKNsti/v0Quy/QFqQy/ECHA2O\n9+W042js7OxMCS0vxvn9qMIaW8Ln5+djQeZFNb8fzflqadHWwtGi9QZfaU/kSIRXEG8JszXsV7zl\nE74ekRXMGWkskBzlwC4EP6JY4Np+lnjhw81KKVOxvRzh4Oe4dnDNfcHtirJWRbKE25EIryi+DkCt\nK4J3R/jOt/oSTRLF9bL4sgD7hIto+LZEXUeUsWhbn+Fo9R5scY0X3fwcXxm1DS+4WWSEPkM5EuEV\nI4r/rVXF6hKmJqaJfMImwDyyTshcB4IHpyX7fT7e29ubeo+jOG+bZ0vYGm9mI+qCHPl9+cc6K/yk\nH/F2JMIrSOaOiKpiZe4IfYnaiXzCPurBBDiydqP92tbvRyLJ7z2LpgmwhZs9efJkPB4/fjxxzJ8B\n39jVb33WXZYCL3IkwitIlAHlY0IjdwRfXvIXTEzT5g+OylGykEYLbiywXnCjwVc1wDMXFICpzDdv\nCZv4Pnr0CI8fP57YZzHPUpv5s5FlZfo5ESMRXkGyVNRZFuZkCbeTuSM4/CwqR8kFeKJmnG0uC5sf\nDodTtYCjjDnuB2d+XxbhR48e4Y//+I/HWxNycTtIhFcIn7HFlhnHpJ6fn2NnZwcXFxcYDocT7W58\nuBNngq06bZlv/LpmYWZs/fqwsyjUjKMeTFyj1kP2Y2oZcE3ThK3m/ZwdP378eCILjmN9eS1AP7i3\nj0R4xahZZtvb21Or2dz4MRJiK/i9DkTRDlkERK3tEPuBd3Z20uiGaNGN60BEacgmyhb765Mpshhg\n8/ceHR1NpB6bCNtnQtw+EuEVJBJiFmD293LzR868YhFaB0vYXzlELea5xkMW8+vnM3dC5Pvd3d2d\nWNTjjheRq8FXM8uacFrZSVuUi9KO7TMhS/j2kQj3TFvfsev+T++KuLi4wPb29lQoUWQJswisgwAb\nVmwnajPP+77vWxTzGw0T2ux4Z2dnqgWRF15zS2xsbEy1GfLJF9Exz3PdEFnC/SERXkEylwS7IYCR\nheWLwGR+4VXHv2a+pgNvrci6F1O/5SpnUaNOtprNFxzVGuZFVr6dBZYTLqK5yHURWcLi9pEIrxhR\n7OrW1tZUtINZWJlP2Ftj64BvsunrPdi2i4uBoxjY5eN9xr6ehC/96EPDeM5bwr78JA9frIejY2QJ\n94tEeAXxVt3l5SW2trYmLB0T1swVsW7uiMwSjixWq/HQlnjhi65H0RN8bO8VhxZG4Ya25dbz7PM1\nMeYsOA5FjOpIS4T7QyK8gngBjuJ9TWDZIoss4XVxRwDTPmG2WL2LwbcQyort7O3tTcUNZ8OKr1tc\nN1vFUXH+aNHNhu+M4WtJc1y4WcJamOsHifAdpi1eNerqYPDCnLeATWxKKRPuCBaEdfMJA5OvGbsi\nIus3i/n1Y29vL6wnEdWZ2NjYGGcuctq575Rh1qz3CfuUZE5HrlU747BFcftIhO8YNxW8KIvLRJh9\nxRby5C1hHyO8LnHCbe4IDidrazvE2/39/WqpSz6282ABjjLgLAuOfcJsCZsA23jy5ElYZMdvJcL9\nIBFeQUxM7PJya2trarHu8vISGxsbE/7OdY6OAKY7HnsRZgHOxuHh4VQzTt9tgwWftwaHoQHP4oTN\nEubW894S9oV5LDU5cjPI9XA3kAivMN4qbppmwrJtK7fIl9qDwWBqdT4r3JJ9udu+9Ox+yeaiEC6/\n7+ey/8n729vbnbpZ2JYtXY6CyOKsuc6vnRtfodixr+3h05A5K45Tk9tKkoq7i0T4jjEP68SLDAsx\nP44Px/ILTycnJ7h379640lpUTSvb8vPwz8nfFvm22/zfNYvS7/P/8//bjre3tzsXVfeREN6v7pMs\n/HsSvUf2WniB9enHvt+bDzVTPejlQyK8BES1Wm2et0wXEcsWodgK5voCUQFxX0w8+vJn5+stwUxE\nvdB6X6qfs/k28eaxtbWV1u3NUox9LLBPN+b3zr8W0ftmxdcjAY72vSUcVcKTCN99JMJ3mNoXqIsQ\nA7klDMTuCO+KMCuYLay2wV1+a/VlowVDL7SZ8GaDb/f/o2Y1b21theKabaNaEb7wDoCJK4PInePr\n9kaFeLKOx9yUk90R6u+2XEiEV5TapTgv+NRq3vpL3KzVje/wYJfimfD44zbL1kcV1Oo68H5kIWfi\nbkV5utSCqHVM9u6Irq4bTsDILN6oSE9kCXORJonw3WcmES6lfBuAvwrgzwI4AfCLAL61aZrfovvs\nAngXgK8GsAvgeQDf2DTNH83rpEU3TAhMcCwigoXQF6WxeFi/yOPjVKN9LrWYWXvRbSy0mUXL1q6v\n5ZDVeYjCwLJj/iGqdUbmer/ZD4C3hLsOFuFMeLkyGqcie3eELOHlYVZL+I0AvgfA/331t98J4GdK\nKX+uaZqTq/u8G8BfBvDXADwG8H0Afvzqb8UtwwtAJsAmxgBSn7AVduHiLpwoYNutrS2cnp6GYVaZ\nEGci3CWpIbNAI+uURTET9igkrW3r/6//f5ElHPVn83Pn5+dhNERteEvYd0AWd5+ZRLhpmrfwcSnl\n7QD+CMDrAXywlPIAwNcD+BtN03zg6j5fB+D/LaV8SdM0H5rLWa8QXRdtZqEtrpet4cgnHF3WAhj7\nHbe3t3F6epoWf/eX2v6y3B+z4Na2PpXYF0/3I8tMywQ/qu0QzdmiXy0qw2BLN/OrsyuntjDHVrDV\nBNbC3PJzU5/wSwA0AD57dfz6q//5c3aHpmk+Wkr5BIA3AJAI3zJRaBS7BiKfcGRNlVIm4la91QdM\nV/zyW3OJ2P3sXHwN39rISkJGPlyffOKFPRL52mCfb/b6MvYas+Xr/eq+EWstNM2Lsb0XWTEe+YSX\ng2uLcBl9+t4N4INN0/zG1fRzAE6bpnns7v7i1W3iFmGfsD/OLOHIp2gWnm9/FCUkmOhmKbIsxKUU\nXF5eTolwVsfXu0zaavlakfTMfxv5c7uItV94zCx9fi3Y8vXFcyIRzgSYhZirodWEWNxtbmIJvwfA\nFwD4sg73LRhZzGKBsOWTWcAsmKWUCT/r7u7u1JfXBDhafIpcENwBwtclMAE28bVLeC/AmVuAa/l2\naQ/PGWyRRRtVMcsWA/kYwIS4ZrHTkRCz4PKwuhBdFubMEmb/fJYpJxG++1xLhEsp3wvgLQDe2DTN\nJ+mmFwDslFIeOGv4FRhZw6JH2AI2MeboALvcj0KcohjdSNxsRNZwtl/zx0YiXGsDz8eZ2NaiGrJw\nNv7xAjAlwLXh6/j6xU2+LfL/RtEQw+FwSsx9D0G5I5aDmUX4SoC/EsCXN03zCXfzhwGcA3gzgJ+4\nuv9rAfwpAL90s1MV14GFNzv2lq6JqLeszWqNIhXYFzsYDDoLsPmEaxYwz2U93Hy/Ngsl67LoF/m3\ngWfWPe/b7d7HGy2
02T7Xg4hcCHx75hPmRTi2etm3zNavrODlYdY44fcAeBuAtwJ4Wkp55dVNj5qm\nGTRN87iU8s8BvKuU8h8BPAHwjwH8giIj7g7eIvZhWpEAR1awF8bhcDgWxKxUYmYJR66IaK7W3djv\ne/91LVrCXCve4o1+iJqmCRNYsm3UUija54VPvx/FBEc/AvzayhJeDma1hL8BI9/u+9381wH4oav9\nbwZwAeBfYZSs8T4A33T9UxQ3xVu/NgcgjJf1VpQXYRZfFuDT09NxjHFtcc5vowSMLCGjayxvtIiY\n+XrZvx2Jr8f7d6NIBzvm0pM1kc2E2m+9CLMQR/5ocfeZNU64tcJ30zRDAH/naog7QiTENs8CnLkg\nIt/v6ekpdnZ2JsTXRlcB5jhh78Nty4hrc2HwYpvPkvNz/Pp4IfbHtvjYZbCbgcPKouFrQNT2fRih\nt4QlxMuDakesEZlFbEIUuSB40c5boz6Djve7CjC7I2aJZOgyaott0eIb+39rMc+Xl5fVMDMfAVEr\nR+n9vlFaeDaXRWVIgJcLifCa4UPUvAjbffiy/eLiAtvb2+MvPwtwVk/Ch2hFW3ZHzCqukZ83cj9E\nFdqiOYND6KLBLoDa87d9jnio1YEwIW5zcfDIztGGWA4kwitG5nbI7seCa/MbG6MedCZsJjzb29ud\nLsG5UWUmxJE7IhPjKKEici1kNRw4zCw7jixHO0e/8NXFSvUNOa0rso/15f2Tk5PO5UItDK32Aydr\neDmQCK8Ys3zpvBXI1dY2NzdDK7BrVEBNdP0+uzy84Pr9LmUpI6HNhj1vPi97HSPrl61gnygRzVlD\nThNb7gnn909OTlqt2ywCIhvi7iMRXjG6WsJ8fxau2pe7zTLzC0SR4GbuiMgajmJ6u4qrF9m2rQlb\nloLtLf3MF+63XoStIScPmzcRzl4rP8fnafvZnLi7SIRXjFm/dD4bzNwS/H9sP7LG/Ip8ZgV3sYRr\nPl7b5/O18/fPJ3qOba+Bfy3seUd+4CjTLTvmrsgsvEdHR1Pbk5OT8HX3+9FxNifuPhLhHmFrhYWL\nrcpaF4tsdBWpLgLG58qX/Sa6kduiZgVHcxwilyVT2O2zvrb+dc62l5eXrSFhbWnHmSVsrgYfjhZ1\nSRbrh0S4J7wA+AWg8/Pzsfh0XQQygY4sO8728hZfV6KUZ7/Axc/PHsuLsJ+Lul5kroXsdYyO2y7r\nef7i4mJCFGv70WufjdPT07AbhqqdCUMi3CPeAmbrl0Vpa2trpi9+FgUQ+Umvg/8/Jr6Xl886dtQW\ni7qIcFQ8x792fj8S4MxK93M+vTjLZDNBzuKD/ZajI3xChkRYABLhXvFC4S3h64iwWcLmMojiY9ki\nnhX/t5z+bM/JP79s1Z5FuRZWxo/jX79s6xcTvZvHLyKa/5ZrNXj3ge1z66e2uF7+v1lTTonweiMR\n7plMiE2ESykzW8JsSZqlCTxra3RdATYyv7L9f2/p8vOM5qJQs5orIhL6aD/6Ycuqn5nF2pbRdnJy\nMmHFRv+T58wl4S1pibAwJMI9EgnwxsbG2Jo18ekiwuwn5qQFFiZgur3RrHhBZGFvs3iz/SyLreYP\nbhN5toK71njIMtn88XA4nLKms5A9LmXpF+3Ozs4kwkIi3Bd+ccgL8awizP5Ji1pommchZ1kY1ixE\nC3PZfdpcBTznF/f8fvZY0WsZva5ZmjEfWxRDNjjWdzgcVkP1/H72YylLWAAS4d5hwbDwMi88s1jB\nZgn7LzVHR9z0C18TX3tOs2y7Jl1Er1sU6eBfU59kEYWSDQaDicQJ3vdzJsJti33+RyDyG5sIi/VF\nItwjmSXsEwaiMLXaiATY/m+0gDYL0aKen8uiFbL9KJQuO47+vs0K9qnG0TBL1ydR+OSKp0+fYjAY\npOIf7UcLgb70pCzh9UUi3CP2RbVIhouLi/A+Gxsbna3g8/Pzib9nEfN+1FlgS7ctK83//7Zj/z+y\nrf/bzB/sE18iK9iHnVmBHRPc2jARjh47mvPnxD8SPCfWE4lwz9iXMBI2+6JubGyEdQraLGH2rfpL\n5+vQJr7+edVoE+I2uoqcd0dwkXXemsvh6OgIT548GQ8+tv3BYJA+j9pxm1tGrCcS4R7xIlLKs8Li\ndjuAKRHx3Rq4rc/GxsZUl4nz83Nsb2+Py1FGro/IF33dxbwu942s28ii9NvM7xrVr8hif6N97/fl\nxQmEKCIAAAxrSURBVDgfqjYcDju/FkK0IRHuicivaUJs+4ZZcybAg8EAOzs7Ey18TCjPz8+neq1F\n/dds1MLD/PY6z63r/TOLNrNuayFiNu/dDj4Ljo/NHcGi69OL57GoKYRHItwjme/Qw37N4XCInZ0d\nnJycTHSPsP9nIsyiWzv2zS+j4uizhrT5ULYuRJZsdJwlR0R1je31qnU2tnlfYD1LqhBi3kiE7wA1\nIbaFOXZHDAaDiTRf/h9nZ2cTrd+jdvA8ajV8gesJ6nUt4SzpwQuvTwvOjjn6wXcujroY+1ZDvrOx\nFtDEIpAI94yFaLEA20Ia8Cy8zATYFzcHMHFZbh2Qd3d3x8JrLen93NnZ2VS3Yu64zG6KWZhVuLOF\ntEhkswpyUdRIFo4WDXZRcI0IJVWIRSMR7hEWYAATAmzzdsyWsImw/Q8WLxZbP0xwTIBNkG3RzoTQ\n8A1AZ3les94/CyfzoXhd28Ff5zZvLbNP2DIRJcJi3kiEe8Z/qU2AObbXFtwsG44tYE6NNeHY3d3F\n3t7eWHz39vYmwtq8ZbmzszMRvmaPbQXbZxWe67owvBBnHSvYpVCr/dulB1w078VaPmGxSCTCPZOJ\nFYshCxMvwkWVugaDAfb29jAcDrG3tzcW4L29vSmrkn2tHFvMJTSvI8I3EWAvwrUFtbbRltTij2su\nELkjxKKQCPeICa3t+zmOz+WOGSbAvABlURM7OztTK/9egL2wsIVnFvDm5ua1hec6PmETYhbHyFc7\nyz7X/eXnnu3XKqJJhMWikAj3jI8XjtKDvVh7a9EW1GyBbX9/f2Jlny+rvQjzIpwJsP2v6wrPvC3h\nWm1fv8/b6DlHRXT8D1ItKUSIeSMRvgNEqa1ZVTL2AUfNMLe3t0N/qBcc7+NkF4RFStyWJQw882/7\n8DK2cH1d31rNXyu+3lZ43bsafPZetC/EPJEI31EiYbZ9i6Kw2sMcM2y1I/gSvzb8/2eiVOY2aoV8\nMlhgo5RhnmsT3kiE22KP2ScuxG0jEV4iogy7LMWZoyl8XDFb1FFEArsAjo+PcXBwgKOjo/S8Zim6\nE8GJEva42b53Q/j28Wz5+3KR8yhiJMS8kQgvGW1pzpzcEaUde/9rJsBmne7v72N/fx97e3vj/z/P\n5wJgnAXYpclmVAGNRZj7v/GPjBdgXyxIiL6QCC8Rbdl1RhTSZn+fZaX59F0Lb7N4YxPhRRCFoNXC\n0rLEisgS9vUnJMDiriERXhJ85ESUZWeYyHK6MfuHffovW8A+tZmPa7T1gKvh
kzHaRi2Rw4ecZb5w\nCbG4K0iEl4hssY7TnIFnMcXeajYB3t7eHteN8DHGXGnN78/TFcHPo60eRJTlVvsb747g5x+Jr4RY\n9IlEeEnxyR1RXza7nWOLrXMzxxVHRXy45rAdL4oojrdLFlvb8MIrd4S4i0iEl4gosSNK6LBedewD\n3tzcxPn5+VRcsYlxVtKSj7sya5xwW+nKLKysbT4SXQmxuGtIhJcE7xP2876RJwuwjyWOCrf7ou5+\nfxYRnhUfLucjGmpz0XFtEU4CLO4aEuElok2IgWdWqC3YcbNP3vq5LvuLfF7su+2y7TJn/7vLVoi+\nkAgvGW0pznbctm2bu26Tz+tSSxG+6W3ZYwlxF5AIrwCRZSyEWA4Wd40phBCiFYmwEEL0iERYCCF6\nRCIshBA9IhEWQogemUmESynfVkr5UCnlcSnlxVLKT5RSXuvu8/5SyiWNi1LKe+Z72kIIsRrMagm/\nEcD3APhSAH8RwDaAnyml7NN9GgD/FMArATwH4HMAfMvNT1UIIVaPmeKEm6Z5Cx+XUt4O4I8AvB7A\nB+mm46ZpPnXjsxNCiBXnpj7hl2Bk+X7WzX9NKeVTpZRfK6X8A2cpCyGEuOLaGXNllMv6bgAfbJrm\nN+imHwbwuwA+CeALAXwXgNcC+Os3OE8hhFhJbpK2/B4AXwDgL/Bk0zT/jA5/vZTyAoCfLaW8qmma\nj93g8YQQYuW4ljuilPK9AN4C4E1N0/xhy91/BUAB8JrrPJYQQqwyM1vCVwL8lQC+vGmaT3T4ky/C\nyG/cJtZCCLF2zCTCV/G+bwPwVgBPSymvvLrpUdM0g1LKqwH8TQDvBfAZAK8D8C4AH2ia5iPzO20h\nhFgNZrWEvwEjq/b9bv7rAPwQgFOM4of/LoB7AH4PwL8E8L/c6CyFEGJFmTVOuOpDbprm9wG86SYn\nJIQQ64RqRwghRI9IhIUQokckwkII0SMSYSGE6BGJsBBC9IhEWAghekQiLIQQPSIRFkKIHpEICyFE\nj0iEhRCiRyTCQgjRIxJhIYToEYmwEEL0iERYCCF6RCIshBA9IhEWQogekQgLIUSPSISFEKJHJMJC\nCNEjEmEhhOgRibAQQvTIXRDhvb5PQAghFkSrvt0FEf68vk9ACCEWxOe13aE0TXML51E5gVJeBuAr\nAHwcwKDXkxFCiPmwh5EAP980zWdqd+xdhIUQYp25C+4IIYRYWyTCQgjRIxJhIYToEYmwEEL0yJ0U\n4VLKN5VSPlZKOSml/HIp5b/o+5zmQSnlHaWUSzd+o+/zug6llDeWUn6ylPIHV8/jrcF9vr2U8slS\nynEp5f8spbymj3O9Dm3Pr5TyA8F7+d6+zrcrpZRvK6V8qJTyuJTyYinlJ0opr3X32S2lfF8p5dOl\nlCellH9VSnlFX+c8Cx2f3/vd+3ZRSnlPX+d850S4lPLVAL4bwDsAfBGAfw/g+VLKy3s9sfnxEQCv\nBPDc1fiyfk/n2twD8KsAvgnAVIhNKeVbAfxtAP8dgC8B8BSj93HnNk/yBlSf3xU/jcn38m23c2o3\n4o0AvgfAlwL4iwC2AfxMKWWf7vNuAP8NgL8G4L8C8J8C+PFbPs/r0uX5NQD+KZ69d58D4Ftu+Tzp\nbJrmTg0Avwzgf6XjAuD3AXxL3+c2h+f2DgD/ru/zWMDzugTwVjf3SQDfTMcPAJwA+Kq+z3dOz+8H\nAPwffZ/bHJ7by6+e35fR+zQE8FfpPn/m6j5f0vf53vT5Xc39PIB39X1uNu6UJVxK2QbwegA/Z3PN\n6FX7WQBv6Ou85syfvrrE/Z1Syv9eSvnP+j6heVNKeRVGFga/j48B/ApW530EgDddXfL+ZinlPaWU\nP9H3CV2Dl2BkGX726vj1ALYw+d59FMAnsJzvnX9+xteUUj5VSvm1Uso/cJbyrbLV1wMnvBzAJoAX\n3fyLGP0aLzu/DODtAD6K0SXQOwH8m1LKn2+a5mmP5zVvnsPogx+9j8/d/ukshJ/G6BL9YwA+H8B3\nAnhvKeUNV4bDnaeUUjByPXywaRpbm3gOwOnVjyazdO9d8vwA4IcB/C5GV2tfCOC7ALwWwF+/9ZPE\n3RPhjILcL7c0NE3zPB1+pJTyIYw+DF+F0eXtqrMS7yMANE3zY3T466WUXwPwOwDehNHl7jLwHgBf\ngG7rEsv43tnz+ws82TTNP6PDXy+lvADgZ0spr2qa5mO3eYLA3VuY+zSAC4wc5swrMG1VLT1N0zwC\n8FsAliZqoCMvYPSlXYv3EQCuvryfxpK8l6WU7wXwFgBvaprmk3TTCwB2SikP3J8s1Xvnnt8fttz9\nVzD6vPby3t0pEW6a5gzAhwG82eauLineDOAX+zqvRVFKOcToUrbtQ7JUXAnSC5h8Hx9gtGK9cu8j\nAJRSPhfAy7AE7+WVQH0lgP+6aZpPuJs/DOAck+/dawH8KQC/dGsneQNanl/EF2Fk5ffy3t1Fd8S7\nAPxgKeXDAD4E4JsBHAD4F32e1DwopfwjAD+FkQviTwL4+xh94H+0z/O6DqWUexhZDuVq6tWllNcB\n+GzTNL+HkS/u75VSfhujCnnfgVGUy7/u4XRnpvb8rsY7MPIJv3B1v3+I0VXN89P/7e5wFQ/7NgBv\nBfC0lGJXK4+aphk0TfO4lPLPAbyrlPIfATwB8I8B/ELTNB/q56y70/b8SimvBvA3AbwXwGcAvA4j\nzflA0zQf6eOcew/PSMJKvhGjL+4JRr++X9z3Oc3pef0oRkJ0gtFq848AeFXf53XN5/LlGIX+XLjx\n/XSfd2K0+HGMkTi9pu/znsfzw6hM4fswEuABgP8PwP8G4D/p+7w7PK/oOV0A+Fq6zy5GsbafxkiE\n/yWAV/R97vN4fgA+F8D7AXzq6nP5UYwWVQ/7OmeVshRCiB65Uz5hIYRYNyTCQgjRIxJhIYToEYmw\nEEL0iERYCCF6RCIshBA9IhEWQogekQgLIUSPSISFEKJHJMJCCNEjEmEhhOgRibAQQvTI/w+O+HZO\nU1A1rgAAAABJRU5ErkJggg==\n", 81 | "text/plain": [ 82 | "" 83 | ] 84 | }, 85 | "metadata": {}, 86 | "output_type": "display_data" 87 | } 88 | ], 89 | "source": [ 90 | "def showtest():\n", 91 | " sample = data[0]\n", 92 | " print \"Sample shape: \", sample.shape\n", 93 | "\n", 94 | " %matplotlib inline\n", 95 | "\n", 96 | " sample = sample.reshape(28,28)\n", 97 | " plt.imshow(sample, cmap='gray')\n", 98 | " plt.show()\n", 99 | "\n", 100 | "showtest()" 101 | ] 102 | }, 103 | { 104 | 
"cell_type": "markdown", 105 | "metadata": {}, 106 | "source": [ 107 | "### Use pipeline and gridsearchCV to extra features" 108 | ] 109 | }, 110 | { 111 | "cell_type": "markdown", 112 | "metadata": {}, 113 | "source": [ 114 | "1. KNN model to classify" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": 44, 120 | "metadata": { 121 | "collapsed": false 122 | }, 123 | "outputs": [ 124 | { 125 | "name": "stdout", 126 | "output_type": "stream", 127 | "text": [ 128 | "Fitting 3 folds for each of 1 candidates, totalling 3 fits\n" 129 | ] 130 | }, 131 | { 132 | "name": "stderr", 133 | "output_type": "stream", 134 | "text": [ 135 | "/usr/local/lib/python2.7/dist-packages/sklearn/feature_selection/univariate_selection.py:113: UserWarning: Features [ 0 1 2 3 4 5 6 7 8 9 10 11 16 17 18 19 20 21\n", 136 | " 22 23 24 25 26 27 28 29 30 31 32 51 52 53 54 55 56 57\n", 137 | " 82 83 84 85 111 112 140 168 392 476 504 532 560 615 643 644 671 672\n", 138 | " 673 698 699 700 701 726 727 728 729 730 753 754 755 756 757 758 759 780\n", 139 | " 781 782 783] are constant.\n", 140 | " UserWarning)\n", 141 | "/usr/local/lib/python2.7/dist-packages/sklearn/feature_selection/univariate_selection.py:113: UserWarning: Features [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17\n", 142 | " 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 52 53 54\n", 143 | " 55 56 57 58 82 83 84 85 111 112 140 141 168 196 224 364 448 476\n", 144 | " 504 532 560 587 588 616 644 645 671 672 673 699 700 701 727 728 729 730\n", 145 | " 731 753 754 755 756 757 758 759 780 781 782 783] are constant.\n", 146 | " UserWarning)\n", 147 | "/usr/local/lib/python2.7/dist-packages/sklearn/feature_selection/univariate_selection.py:113: UserWarning: Features [ 0 1 2 3 4 5 6 7 8 9 10 11 16 17 18 19 20 21\n", 148 | " 22 23 24 25 26 27 28 29 30 31 32 52 53 54 55 56 57 59\n", 149 | " 82 83 84 85 111 112 113 140 168 476 504 532 560 644 671 672 673 699\n", 150 | " 700 701 727 728 729 730 753 754 755 756 757 758 759 780 781 782 783] are constant.\n", 151 | " UserWarning)\n", 152 | "[Parallel(n_jobs=-1)]: Done 3 out of 3 | elapsed: 20.6min finished\n", 153 | "/usr/local/lib/python2.7/dist-packages/sklearn/feature_selection/univariate_selection.py:113: UserWarning: Features [ 0 1 2 3 4 5 6 7 8 9 10 11 16 17 18 19 20 21\n", 154 | " 22 23 24 25 26 27 28 29 30 31 32 52 53 54 55 56 57 82\n", 155 | " 83 84 85 111 112 140 168 476 504 532 560 644 671 672 673 699 700 701\n", 156 | " 727 728 729 730 753 754 755 756 757 758 759 780 781 782 783] are constant.\n", 157 | " UserWarning)\n" 158 | ] 159 | } 160 | ], 161 | "source": [ 162 | "from sklearn.neighbors import KNeighborsClassifier\n", 163 | "from sklearn.decomposition import PCA\n", 164 | "from sklearn.feature_selection import SelectKBest\n", 165 | "\n", 166 | "# This dataset is way too high-dimensional. 
Better do PCA:\n", 167 | "pca = PCA(n_components=2)\n", 168 | "\n", 169 | "# Maybe some original features where good, too?\n", 170 | "selection = SelectKBest(k=1)\n", 171 | "\n", 172 | "combined_features = FeatureUnion([\n", 173 | " (\"PCA\", PCA()), \n", 174 | " (\"univ_select\", SelectKBest())\n", 175 | "])\n", 176 | "\n", 177 | "pipeline = Pipeline([\n", 178 | " ('features', combined_features),\n", 179 | " ('KNN', KNeighborsClassifier())\n", 180 | "])\n", 181 | "\n", 182 | "param_grid ={\n", 183 | " 'features__PCA__svd_solver': ['full', 'arpack', 'randomized'],\n", 184 | " 'features__PCA__n_components':[1,2,3],\n", 185 | " 'features__univ_select__k':[1,2],\n", 186 | "}\n", 187 | "\n", 188 | "\n", 189 | "cv = GridSearchCV(\n", 190 | " pipeline, param_grid={}, n_jobs=-1, verbose=1, cv=3, scoring='accuracy'\n", 191 | ")\n", 192 | "knncvscore = cv.fit(X, Y)" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": 45, 198 | "metadata": { 199 | "collapsed": false 200 | }, 201 | "outputs": [ 202 | { 203 | "name": "stdout", 204 | "output_type": "stream", 205 | "text": [ 206 | "('Knn CV Score is: ', GridSearchCV(cv=3, error_score='raise',\n", 207 | " estimator=Pipeline(steps=[('features', FeatureUnion(n_jobs=1,\n", 208 | " transformer_list=[('PCA', PCA(copy=True, n_components=None, whiten=False)), ('univ_select', SelectKBest(k=10, score_func=))],\n", 209 | " transformer_weights=None)), ('KNN', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',\n", 210 | " metric_params=None, n_jobs=1, n_neighbors=5, p=2,\n", 211 | " weights='uniform'))]),\n", 212 | " fit_params={}, iid=True, n_jobs=-1, param_grid={},\n", 213 | " pre_dispatch='2*n_jobs', refit=True, scoring='accuracy', verbose=1))\n", 214 | "0.9725\n" 215 | ] 216 | } 217 | ], 218 | "source": [ 219 | "from sklearn.metrics import accuracy_score\n", 220 | "predictions = cv.predict(kaggle_x)\n", 221 | "accscore = accuracy_score(predictions, kaggle_y)\n", 222 | "\n", 223 | "print(\"Knn CV Score is: \", knncvscore)\n", 224 | "print accscore" 225 | ] 226 | } 227 | ], 228 | "metadata": { 229 | "kernelspec": { 230 | "display_name": "Python 2", 231 | "language": "python", 232 | "name": "python2" 233 | }, 234 | "language_info": { 235 | "codemirror_mode": { 236 | "name": "ipython", 237 | "version": 2 238 | }, 239 | "file_extension": ".py", 240 | "mimetype": "text/x-python", 241 | "name": "python", 242 | "nbconvert_exporter": "python", 243 | "pygments_lexer": "ipython2", 244 | "version": "2.7.12" 245 | } 246 | }, 247 | "nbformat": 4, 248 | "nbformat_minor": 1 249 | } 250 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # NLP_Learning 2 | Code for CSSI course Natural Language Processing. 3 | 4 | Part for NLTK tool. 
5 | 6 | -------------------------------------------------------------------------------- /chapter01/Simple_tokenization.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: Lich_Amnesia 3 | # @Email: alwaysxiaop@gmail.com 4 | # @Date: 2016-08-25 00:29:27 5 | # @Last Modified time: 2016-09-01 02:14:09 6 | # @FileName: Simple_tokenization.py 7 | # @Changed 8 | import nltk 9 | import platform 10 | import re 11 | if (platform.system() == 'Windows'): 12 | nltk.data.path.append("D:\Work\Judge\Project\\nltk_data") 13 | else: 14 | nltk.data.path.append("/home/vagrant/Project/nltk_data") 15 | 16 | # open the txt file and read each line into a list 17 | 18 | # e.g. [['The', 'company', ...], ['This', 'company', ...]] 19 | lines = [line.strip() for line in open("wsj-short.txt")] 20 | line = [] 21 | 22 | num = 0 23 | for l in lines: 24 | num += len(re.findall("[\w']+|[.,!?;]+", l)) 25 | print("Question 6: " + str(num)) 26 | 27 | 28 | # for problem 7 29 | num = 0 30 | for ll in lines: 31 | num = num + len(re.findall("(Jan\.)|(Feb\.)|(Mar\.)|(Apr\.)|(Jun\.)|(Jul\.)|(Aug\.)|(Sep\.)|(Oct\.)|(Nov\.)|(Dec\.)|([\w']+)|([.,!?;]+)", ll)) 32 | print("Question 7: " + str(num)) 33 | -------------------------------------------------------------------------------- /chapter01/wsj-short.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LichAmnesia/NLP_Learning/60e9263cdbc43dd7efabc8609b21600bf89e879e/chapter01/wsj-short.txt -------------------------------------------------------------------------------- /fortest.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: Lich_Amnesia 3 | # @Email: alwaysxiaop@gmail.com 4 | # @Date: 2016-09-01 11:50:12 5 | # @Last Modified time: 2016-09-01 11:50:35 6 | # @FileName: fortest.py -------------------------------------------------------------------------------- /lstm/Word embedding.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 30, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "'''This script loads pre-trained word embeddings (GloVe embeddings)\n", 12 | "into a frozen Keras Embedding layer, and uses it to\n", 13 | "train a text classification model on the 20 Newsgroup dataset\n", 14 | "(classification of newsgroup messages into 20 different categories).\n", 15 | "GloVe embedding data can be found at:\n", 16 | "http://nlp.stanford.edu/data/glove.6B.zip\n", 17 | "(source page: http://nlp.stanford.edu/projects/glove/)\n", 18 | "20 Newsgroup data can be found at:\n", 19 | "http://www.cs.cmu.edu/afs/cs.cmu.edu/project/theo-20/www/data/news20.html\n", 20 | "'''\n", 21 | "\n", 22 | "from __future__ import print_function\n", 23 | "import os\n", 24 | "import numpy as np\n", 25 | "np.random.seed(1337)\n", 26 | "\n", 27 | "from keras.preprocessing.text import Tokenizer\n", 28 | "from keras.preprocessing.sequence import pad_sequences\n", 29 | "from keras.utils.np_utils import to_categorical\n", 30 | "from keras.layers import Dense, Input, Flatten\n", 31 | "from keras.models import Sequential\n", 32 | "from keras.layers import Dense, Dropout, Activation, Embedding\n", 33 | "from keras.layers import Conv1D, MaxPooling1D, Embedding\n", 34 | "from keras.models import Model\n", 35 | "from keras.layers import LSTM, SimpleRNN, GRU\n", 36 | "import 
sys" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 8, 42 | "metadata": { 43 | "collapsed": false 44 | }, 45 | "outputs": [ 46 | { 47 | "name": "stdout", 48 | "output_type": "stream", 49 | "text": [ 50 | "Indexing word vectors.\n", 51 | "Found 400000 word vectors.\n", 52 | "Processing text dataset\n", 53 | "Found 19997 texts.\n" 54 | ] 55 | } 56 | ], 57 | "source": [ 58 | "BASE_DIR = '/home/lich/Workspace/Learning'\n", 59 | "GLOVE_DIR = BASE_DIR + '/glove.6B/'\n", 60 | "TEXT_DATA_DIR = BASE_DIR + '/20_newsgroup/'\n", 61 | "MAX_SEQUENCE_LENGTH = 1000\n", 62 | "MAX_NB_WORDS = 20000\n", 63 | "EMBEDDING_DIM = 100\n", 64 | "VALIDATION_SPLIT = 0.2\n", 65 | "batch_size = 32\n", 66 | "\n", 67 | "# first, build index mapping words in the embeddings set\n", 68 | "# to their embedding vector\n", 69 | "\n", 70 | "print('Indexing word vectors.')\n", 71 | "\n", 72 | "embeddings_index = {}\n", 73 | "f = open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'))\n", 74 | "for line in f:\n", 75 | " values = line.split()\n", 76 | " word = values[0]\n", 77 | " coefs = np.asarray(values[1:], dtype='float32')\n", 78 | " embeddings_index[word] = coefs\n", 79 | "f.close()\n", 80 | "\n", 81 | "print('Found %s word vectors.' % len(embeddings_index))\n", 82 | "\n", 83 | "# second, prepare text samples and their labels\n", 84 | "print('Processing text dataset')\n", 85 | "\n", 86 | "texts = [] # list of text samples\n", 87 | "labels_index = {} # dictionary mapping label name to numeric id\n", 88 | "labels = [] # list of label ids\n", 89 | "for name in sorted(os.listdir(TEXT_DATA_DIR)):\n", 90 | " path = os.path.join(TEXT_DATA_DIR, name)\n", 91 | " if os.path.isdir(path):\n", 92 | " label_id = len(labels_index)\n", 93 | " labels_index[name] = label_id\n", 94 | " for fname in sorted(os.listdir(path)):\n", 95 | " if fname.isdigit():\n", 96 | " fpath = os.path.join(path, fname)\n", 97 | " if sys.version_info < (3,):\n", 98 | " f = open(fpath)\n", 99 | " else:\n", 100 | " f = open(fpath, encoding='latin-1')\n", 101 | " texts.append(f.read())\n", 102 | " f.close()\n", 103 | " labels.append(label_id)\n", 104 | "\n", 105 | "print('Found %s texts.' % len(texts))\n", 106 | "\n" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": 9, 112 | "metadata": { 113 | "collapsed": false 114 | }, 115 | "outputs": [ 116 | { 117 | "name": "stdout", 118 | "output_type": "stream", 119 | "text": [ 120 | "Found 214909 unique tokens.\n" 121 | ] 122 | } 123 | ], 124 | "source": [ 125 | "# finally, vectorize the text samples into a 2D integer tensor\n", 126 | "tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)\n", 127 | "tokenizer.fit_on_texts(texts)\n", 128 | "sequences = tokenizer.texts_to_sequences(texts)\n", 129 | "\n", 130 | "word_index = tokenizer.word_index\n", 131 | "print('Found %s unique tokens.' 
% len(word_index))\n" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": 22, 137 | "metadata": { 138 | "collapsed": false 139 | }, 140 | "outputs": [ 141 | { 142 | "name": "stdout", 143 | "output_type": "stream", 144 | "text": [ 145 | "1654\n" 146 | ] 147 | } 148 | ], 149 | "source": [ 150 | "print(len(sequences[0]))\n", 151 | "# print(texts[0])" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": 21, 157 | "metadata": { 158 | "collapsed": false 159 | }, 160 | "outputs": [ 161 | { 162 | "name": "stdout", 163 | "output_type": "stream", 164 | "text": [ 165 | "29\n" 166 | ] 167 | } 168 | ], 169 | "source": [ 170 | "\n", 171 | "print(word_index[\"cantaloupe\"])" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": 14, 177 | "metadata": { 178 | "collapsed": false 179 | }, 180 | "outputs": [ 181 | { 182 | "name": "stdout", 183 | "output_type": "stream", 184 | "text": [ 185 | "Shape of data tensor: (19997, 1000)\n", 186 | "Shape of label tensor: (19997, 20)\n", 187 | "Preparing embedding matrix.\n" 188 | ] 189 | } 190 | ], 191 | "source": [ 192 | "\n", 193 | "\n", 194 | "data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)\n", 195 | "\n", 196 | "labels = to_categorical(np.asarray(labels))\n", 197 | "print('Shape of data tensor:', data.shape)\n", 198 | "print('Shape of label tensor:', labels.shape)\n", 199 | "\n", 200 | "# split the data into a training set and a validation set\n", 201 | "indices = np.arange(data.shape[0])\n", 202 | "np.random.shuffle(indices)\n", 203 | "data = data[indices]\n", 204 | "labels = labels[indices]\n", 205 | "nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])\n", 206 | "\n", 207 | "x_train = data[:-nb_validation_samples]\n", 208 | "y_train = labels[:-nb_validation_samples]\n", 209 | "x_val = data[-nb_validation_samples:]\n", 210 | "y_val = labels[-nb_validation_samples:]\n", 211 | "\n", 212 | "print('Preparing embedding matrix.')" 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": 23, 218 | "metadata": { 219 | "collapsed": false 220 | }, 221 | "outputs": [ 222 | { 223 | "name": "stdout", 224 | "output_type": "stream", 225 | "text": [ 226 | "3999\n" 227 | ] 228 | } 229 | ], 230 | "source": [ 231 | "print(nb_validation_samples)" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": 24, 237 | "metadata": { 238 | "collapsed": false 239 | }, 240 | "outputs": [ 241 | { 242 | "name": "stdout", 243 | "output_type": "stream", 244 | "text": [ 245 | "(20001, 100)\n" 246 | ] 247 | } 248 | ], 249 | "source": [ 250 | "\n", 251 | "\n", 252 | "# prepare embedding matrix\n", 253 | "nb_words = min(MAX_NB_WORDS, len(word_index))\n", 254 | "embedding_matrix = np.zeros((nb_words + 1, EMBEDDING_DIM))\n", 255 | "for word, i in word_index.items():\n", 256 | " if i > MAX_NB_WORDS:\n", 257 | " continue\n", 258 | " embedding_vector = embeddings_index.get(word)\n", 259 | " if embedding_vector is not None:\n", 260 | " # words not found in embedding index will be all-zeros.\n", 261 | " embedding_matrix[i] = embedding_vector\n", 262 | "\n", 263 | "print(embedding_matrix.shape)\n", 264 | " \n", 265 | " \n" 266 | ] 267 | }, 268 | { 269 | "cell_type": "code", 270 | "execution_count": 27, 271 | "metadata": { 272 | "collapsed": false 273 | }, 274 | "outputs": [ 275 | { 276 | "name": "stdout", 277 | "output_type": "stream", 278 | "text": [ 279 | "Training model.\n", 280 | "(15998, 1000)\n" 281 | ] 282 | } 283 | ], 284 | "source": [ 285 | "# load pre-trained word 
embeddings into an Embedding layer\n", 286 | "# note that we set trainable = False so as to keep the embeddings fixed\n", 287 | "embedding_layer = Embedding(nb_words + 1,\n", 288 | " EMBEDDING_DIM,\n", 289 | " weights=[embedding_matrix],\n", 290 | " input_length=MAX_SEQUENCE_LENGTH,\n", 291 | " trainable=False)\n", 292 | "\n", 293 | "print('Training model.')\n", 294 | "\n", 295 | "# train a 1D convnet with global maxpooling\n", 296 | "sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')\n", 297 | "embedded_sequences = embedding_layer(sequence_input)\n", 298 | "x = Conv1D(128, 5, activation='relu')(embedded_sequences)\n", 299 | "x = MaxPooling1D(5)(x)\n", 300 | "x = Conv1D(128, 5, activation='relu')(x)\n", 301 | "x = MaxPooling1D(5)(x)\n", 302 | "x = Conv1D(128, 5, activation='relu')(x)\n", 303 | "x = MaxPooling1D(35)(x)\n", 304 | "x = Flatten()(x)\n", 305 | "x = Dense(128, activation='relu')(x)\n", 306 | "\n", 307 | "preds = Dense(len(labels_index), activation='softmax')(x)\n", 308 | "\n", 309 | "model = Model(sequence_input, preds)\n", 310 | "model.compile(loss='categorical_crossentropy',\n", 311 | " optimizer='rmsprop',\n", 312 | " metrics=['acc'])\n", 313 | "print(x_train.shape)\n" 314 | ] 315 | }, 316 | { 317 | "cell_type": "code", 318 | "execution_count": 38, 319 | "metadata": { 320 | "collapsed": false 321 | }, 322 | "outputs": [ 323 | { 324 | "name": "stdout", 325 | "output_type": "stream", 326 | "text": [ 327 | "20000 1000\n", 328 | "(15998, 1000) (15998, 20) (3999, 1000) (3999, 20)\n", 329 | "[ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.\n", 330 | " 0. 0.]\n" 331 | ] 332 | } 333 | ], 334 | "source": [ 335 | "print(nb_words, MAX_SEQUENCE_LENGTH)\n", 336 | "print(x_train.shape, y_train.shape, x_val.shape, y_val.shape)\n", 337 | "print(y_train[0])" 338 | ] 339 | }, 340 | { 341 | "cell_type": "code", 342 | "execution_count": 37, 343 | "metadata": { 344 | "collapsed": false 345 | }, 346 | "outputs": [ 347 | { 348 | "name": "stdout", 349 | "output_type": "stream", 350 | "text": [ 351 | "Build model...\n", 352 | "Train...\n", 353 | "Train on 15998 samples, validate on 3999 samples\n", 354 | "Epoch 1/5\n", 355 | "15998/15998 [==============================] - 881s - loss: 0.1980 - acc: 0.9500 - val_loss: 0.1962 - val_acc: 0.9500\n", 356 | "Epoch 2/5\n", 357 | "15998/15998 [==============================] - 866s - loss: 0.1967 - acc: 0.9500 - val_loss: 0.1951 - val_acc: 0.9500\n", 358 | "Epoch 3/5\n", 359 | "15998/15998 [==============================] - 831s - loss: 0.1953 - acc: 0.9500 - val_loss: 0.1938 - val_acc: 0.9500\n", 360 | "Epoch 4/5\n", 361 | "15998/15998 [==============================] - 813s - loss: 0.1941 - acc: 0.9500 - val_loss: 0.1925 - val_acc: 0.9500\n", 362 | "Epoch 5/5\n", 363 | "15998/15998 [==============================] - 812s - loss: 0.1939 - acc: 0.9500 - val_loss: 0.1922 - val_acc: 0.9500\n", 364 | "3999/3999 [==============================] - 63s \n", 365 | "Test score: 0.192154919208\n", 366 | "Test accuracy: 0.949999988079\n" 367 | ] 368 | } 369 | ], 370 | "source": [ 371 | "embedding_layer = Embedding(nb_words + 1,\n", 372 | " EMBEDDING_DIM,\n", 373 | " weights=[embedding_matrix],\n", 374 | " input_length=MAX_SEQUENCE_LENGTH,\n", 375 | " trainable=False,\n", 376 | " dropout=0.2)\n", 377 | "batch_size = 32\n", 378 | "\n", 379 | "print('Build model...')\n", 380 | "# sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')\n", 381 | "# embedded_sequences = embedding_layer()\n", 382 | "model = Sequential()\n", 383 | 
"model.add(embedding_layer)\n", 384 | "model.add(LSTM(100, dropout_W=0.2, dropout_U=0.2)) # try using a GRU instead, for fun\n", 385 | "model.add(Dense(1))\n", 386 | "model.add(Activation('sigmoid'))\n", 387 | "model.add(Dense(len(labels_index), activation='softmax'))\n", 388 | "\n", 389 | "# try using different optimizers and different optimizer configs\n", 390 | "model.compile(loss='binary_crossentropy',\n", 391 | " optimizer='adam',\n", 392 | " metrics=['accuracy'])\n", 393 | "\n", 394 | "print('Train...')\n", 395 | "model.fit(x_train, y_train, batch_size=batch_size, nb_epoch=5,\n", 396 | " validation_data=(x_val, y_val))\n", 397 | "score, acc = model.evaluate(x_val, y_val,\n", 398 | " batch_size=batch_size)\n", 399 | "print('Test score:', score)\n", 400 | "print('Test accuracy:', acc)" 401 | ] 402 | }, 403 | { 404 | "cell_type": "code", 405 | "execution_count": 28, 406 | "metadata": { 407 | "collapsed": false 408 | }, 409 | "outputs": [ 410 | { 411 | "name": "stdout", 412 | "output_type": "stream", 413 | "text": [ 414 | "Train on 15998 samples, validate on 3999 samples\n", 415 | "Epoch 1/2\n", 416 | "15998/15998 [==============================] - 368s - loss: 1.4332 - acc: 0.5437 - val_loss: 0.1743 - val_acc: 0.9417\n", 417 | "Epoch 2/2\n", 418 | "15998/15998 [==============================] - 378s - loss: 0.1929 - acc: 0.9364 - val_loss: 0.1477 - val_acc: 0.9487\n" 419 | ] 420 | }, 421 | { 422 | "data": { 423 | "text/plain": [ 424 | "" 425 | ] 426 | }, 427 | "execution_count": 28, 428 | "metadata": {}, 429 | "output_type": "execute_result" 430 | } 431 | ], 432 | "source": [ 433 | "\n", 434 | "# happy learning!\n", 435 | "model.fit(x_train, y_train, validation_data=(x_val, y_val),\n", 436 | " nb_epoch=2, batch_size=128)" 437 | ] 438 | }, 439 | { 440 | "cell_type": "code", 441 | "execution_count": null, 442 | "metadata": { 443 | "collapsed": true 444 | }, 445 | "outputs": [], 446 | "source": [] 447 | } 448 | ], 449 | "metadata": { 450 | "kernelspec": { 451 | "display_name": "Python 2", 452 | "language": "python", 453 | "name": "python2" 454 | }, 455 | "language_info": { 456 | "codemirror_mode": { 457 | "name": "ipython", 458 | "version": 2 459 | }, 460 | "file_extension": ".py", 461 | "mimetype": "text/x-python", 462 | "name": "python", 463 | "nbconvert_exporter": "python", 464 | "pygments_lexer": "ipython2", 465 | "version": "2.7.12" 466 | } 467 | }, 468 | "nbformat": 4, 469 | "nbformat_minor": 1 470 | } 471 | -------------------------------------------------------------------------------- /lstm/keras lstm.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [ 10 | { 11 | "name": "stderr", 12 | "output_type": "stream", 13 | "text": [ 14 | "Using TensorFlow backend.\n" 15 | ] 16 | } 17 | ], 18 | "source": [ 19 | "from __future__ import print_function\n", 20 | "import numpy as np\n", 21 | "np.random.seed(1337) # for reproducibility\n", 22 | "\n", 23 | "from keras.preprocessing import sequence\n", 24 | "from keras.utils import np_utils\n", 25 | "from keras.models import Sequential\n", 26 | "from keras.layers import Dense, Dropout, Activation, Embedding\n", 27 | "from keras.layers import LSTM, SimpleRNN, GRU\n", 28 | "from keras.datasets import imdb" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 2, 34 | "metadata": { 35 | "collapsed": true 36 | }, 37 | "outputs": [], 38 | "source": [ 39 | 
"max_features = 20000\n", 40 | "maxlen = 80\n", 41 | "batch_size = 32" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 3, 47 | "metadata": { 48 | "collapsed": false 49 | }, 50 | "outputs": [ 51 | { 52 | "name": "stdout", 53 | "output_type": "stream", 54 | "text": [ 55 | "load data\n", 56 | "25000 train sequences\n", 57 | "25000 test sequences\n" 58 | ] 59 | } 60 | ], 61 | "source": [ 62 | "print(\"load data\")\n", 63 | "(X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=max_features)\n", 64 | "print(len(X_train), 'train sequences')\n", 65 | "print(len(X_test), 'test sequences')" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 6, 71 | "metadata": { 72 | "collapsed": false 73 | }, 74 | "outputs": [ 75 | { 76 | "name": "stdout", 77 | "output_type": "stream", 78 | "text": [ 79 | "(25000, 80)\n", 80 | "(25000, 80)\n" 81 | ] 82 | } 83 | ], 84 | "source": [ 85 | "print(X_train.shape)\n", 86 | "print(X_test.shape)" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 7, 92 | "metadata": { 93 | "collapsed": false 94 | }, 95 | "outputs": [ 96 | { 97 | "name": "stdout", 98 | "output_type": "stream", 99 | "text": [ 100 | "Pad sequences (samples x time)\n", 101 | "X_train shape: (25000, 80)\n", 102 | "X_test shape: (25000, 80)\n", 103 | "y_train shape: (25000,)\n", 104 | "y_test shape: (25000,)\n" 105 | ] 106 | } 107 | ], 108 | "source": [ 109 | "\n", 110 | "print('Pad sequences (samples x time)')\n", 111 | "X_train = sequence.pad_sequences(X_train, maxlen=maxlen)\n", 112 | "X_test = sequence.pad_sequences(X_test, maxlen=maxlen)\n", 113 | "print('X_train shape:', X_train.shape)\n", 114 | "print('X_test shape:', X_test.shape)\n", 115 | "print('y_train shape:', y_train.shape)\n", 116 | "print('y_test shape:', y_test.shape)" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": null, 122 | "metadata": { 123 | "collapsed": false 124 | }, 125 | "outputs": [], 126 | "source": [ 127 | "\n", 128 | "print('Build model...')\n", 129 | "model = Sequential()\n", 130 | "model.add(Embedding(max_features, 128, dropout=0.2))\n", 131 | "model.add(LSTM(128, dropout_W=0.2, dropout_U=0.2)) # try using a GRU instead, for fun\n", 132 | "model.add(Dense(1))\n", 133 | "model.add(Activation('sigmoid'))\n", 134 | "\n", 135 | "# try using different optimizers and different optimizer configs\n", 136 | "model.compile(loss='binary_crossentropy',\n", 137 | " optimizer='adam',\n", 138 | " metrics=['accuracy'])\n", 139 | "\n", 140 | "print('Train...')\n", 141 | "model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=15,\n", 142 | " validation_data=(X_test, y_test))\n", 143 | "score, acc = model.evaluate(X_test, y_test,\n", 144 | " batch_size=batch_size)\n", 145 | "print('Test score:', score)\n", 146 | "print('Test accuracy:', acc)" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": null, 152 | "metadata": { 153 | "collapsed": true 154 | }, 155 | "outputs": [], 156 | "source": [ 157 | "\n" 158 | ] 159 | } 160 | ], 161 | "metadata": { 162 | "kernelspec": { 163 | "display_name": "Python 2", 164 | "language": "python", 165 | "name": "python2" 166 | }, 167 | "language_info": { 168 | "codemirror_mode": { 169 | "name": "ipython", 170 | "version": 2 171 | }, 172 | "file_extension": ".py", 173 | "mimetype": "text/x-python", 174 | "name": "python", 175 | "nbconvert_exporter": "python", 176 | "pygments_lexer": "ipython2", 177 | "version": "2.7.12" 178 | } 179 | }, 180 | "nbformat": 4, 181 | "nbformat_minor": 1 182 | 
} 183 | -------------------------------------------------------------------------------- /nltk_tutorial/Readme.md: -------------------------------------------------------------------------------- 1 | ## Edit the nltk_data path 2 | To modify the path, simply append to the list of possible paths: 3 | ```python 4 | import nltk 5 | nltk.data.path.append("/home/yourusername/path/") 6 | ``` 7 | Or on Windows: 8 | ```python 9 | import nltk 10 | nltk.data.path.append("C:\somewhere\path") 11 | ``` -------------------------------------------------------------------------------- /nltk_tutorial/test.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: Lich_Amnesia 3 | # @Email: alwaysxiaop@gmail.com 4 | # @Date: 2016-08-24 23:53:57 5 | # @Last Modified time: 2016-08-25 14:30:47 6 | # @FileName: test.py 7 | 8 | import nltk 9 | # nltk.data.path.append("/home/vagrant/Project/nltk_data") 10 | import platform 11 | import re 12 | if (platform.system() == 'Windows'): 13 | nltk.data.path.append("D:\Work\Judge\Project\\nltk_data") 14 | else: 15 | nltk.data.path.append("/home/vagrant/Project/nltk_data") 16 | 17 | 18 | # define a sentence 19 | sentence = """At eight o'clock on Thursday evening, 20 | Lich didn't feel good.""" 21 | 22 | # get tokens of this sentence 23 | tokens = nltk.word_tokenize(sentence) 24 | print(tokens) 25 | 26 | # get part-of-speech tags 27 | tag = nltk.pos_tag(tokens) 28 | print(tag) 29 | 30 | entities = nltk.chunk.ne_chunk(tag) 31 | print(entities) 32 | --------------------------------------------------------------------------------
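A note on the `.gitignore` at the top of this dump: Jupyter writes its checkpoints to a directory named `.ipynb_checkpoints`, so the entry `MNIST/.ipython_checkpoints/` never matches anything — which is presumably why `MNIST/.ipynb_checkpoints/MNIST-checkpoint.ipynb` ended up committed. The intended ignore list was most likely:

```
lstm/20_newsgroup/
lstm/glove.6B/
MNIST/.ipynb_checkpoints/
```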
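In `MNIST/MNIST.ipynb`, the grid-search cell builds a `param_grid` and configures `PCA(n_components=2)` / `SelectKBest(k=1)`, but then passes `param_grid={}` and fresh default transformers to `GridSearchCV`, so only the default pipeline is ever fitted (hence "1 candidates" in the log). The sketch below shows how the pieces were presumably meant to be wired together. It is written against current scikit-learn replacements (`fetch_openml`, `sklearn.model_selection`) rather than the deprecated `fetch_mldata` / `cross_validation` / `grid_search` imports the notebook uses, and the `random_state` is an illustrative assumption:

```python
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from sklearn.neighbors import KNeighborsClassifier

# fetch_mldata('MNIST original') was removed from scikit-learn; fetch_openml is the replacement.
mnist = fetch_openml('mnist_784', version=1, as_frame=False)
X, kaggle_x, Y, kaggle_y = train_test_split(
    mnist.data, mnist.target, train_size=0.8, random_state=0)

# PCA components combined with the top univariate features, as in the notebook.
combined_features = FeatureUnion([
    ("PCA", PCA(n_components=2)),
    ("univ_select", SelectKBest(k=1)),
])

pipeline = Pipeline([
    ("features", combined_features),
    ("KNN", KNeighborsClassifier()),
])

# The grid has to be handed to GridSearchCV for these parameters to be searched.
param_grid = {
    "features__PCA__svd_solver": ["full", "arpack", "randomized"],
    "features__PCA__n_components": [1, 2, 3],
    "features__univ_select__k": [1, 2],
}

cv = GridSearchCV(pipeline, param_grid=param_grid,
                  n_jobs=-1, verbose=1, cv=3, scoring="accuracy")
cv.fit(X, Y)
print(cv.best_params_)
print(cv.best_score_)
```

On the full 70,000-image set this search is expensive; subsampling the data as the checkpoint notebook does keeps the 3-fold search tractable.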
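`chapter01/Simple_tokenization.py` counts tokens with hand-written regular expressions. The same counts can be cross-checked with NLTK's own tokenizers, which is a useful sanity check for the two homework questions. A small sketch, assuming the `punkt` model is available on the configured `nltk.data.path` and that it is run from the `chapter01` directory:

```python
import nltk
from nltk.tokenize import RegexpTokenizer

# Read the corpus the same way Simple_tokenization.py does.
lines = [line.strip() for line in open("wsj-short.txt")]

# Penn-Treebank-style tokenization (requires the 'punkt' data package).
print("word_tokenize count:", sum(len(nltk.word_tokenize(line)) for line in lines))

# The same pattern the script uses for Question 6, via NLTK's regexp tokenizer.
tokenizer = RegexpTokenizer(r"[\w']+|[.,!?;]+")
print("RegexpTokenizer count:", sum(len(tokenizer.tokenize(line)) for line in lines))
```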
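In `lstm/Word embedding.ipynb`, the recurrent model squeezes the LSTM output through `Dense(1)` + `Activation('sigmoid')` before the final 20-way softmax and is compiled with `binary_crossentropy`, even though the labels are one-hot vectors from `to_categorical`. The flat ~0.95 "accuracy" in that training log is therefore per-element binary accuracy over mostly-zero targets, not 20-class accuracy. A minimal sketch of a head that matches the labels — it reuses `embedding_layer`, `labels_index`, `x_train`/`y_train`, `x_val`/`y_val`, and `batch_size` as already defined in that notebook and keeps the Keras 1.x argument names (`dropout_W`, `nb_epoch`) the notebook uses:

```python
from keras.models import Sequential
from keras.layers import Dense, LSTM

model = Sequential()
model.add(embedding_layer)   # the frozen GloVe Embedding layer defined earlier in the notebook
model.add(LSTM(100, dropout_W=0.2, dropout_U=0.2))
model.add(Dense(len(labels_index), activation='softmax'))  # one output per newsgroup class

# categorical_crossentropy + accuracy match the one-hot targets from to_categorical.
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.fit(x_train, y_train,
          batch_size=batch_size, nb_epoch=5,
          validation_data=(x_val, y_val))
```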
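`nltk_tutorial/test.py` assumes the tokenizer, tagger, and named-entity chunker models are already present under the configured `nltk.data.path`; when they are missing, `word_tokenize`, `pos_tag`, and `ne_chunk` stop with a `LookupError`. A one-time download sketch — the package names are the ones used by recent NLTK 3.x releases:

```python
import nltk

# Fetch the data packages that word_tokenize, pos_tag and ne_chunk rely on.
for pkg in ["punkt", "averaged_perceptron_tagger", "maxent_ne_chunker", "words"]:
    nltk.download(pkg)
```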