├── .DS_Store ├── aula2.2 ├── .DS_Store └── aula2_2.ipynb ├── aula3.1 ├── .DS_Store └── aula3_1.ipynb ├── aula3.2 ├── .DS_Store └── aula3_2.ipynb ├── aula3.3 ├── .DS_Store └── aula3_3.ipynb ├── aula1.2 └── Aula1.ipynb ├── aula1.3 └── aula1_3.ipynb ├── aula1.4 └── aula1_4.ipynb ├── aula2.3 └── aula2_3.ipynb ├── aula3.4 └── aula3_4.ipynb ├── aula4.1 └── aula4_1.ipynb ├── aula4.2 └── aula4_2.ipynb ├── aula4.3 └── aula4_3.ipynb ├── aula5.1 └── aula5_1.ipynb ├── aula5.2 └── aula5_2.ipynb ├── aula5.3 └── aula5_3.ipynb └── aula5.4 └── aula5_4.ipynb /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alura-es-cursos/1766-nlp-parte1/master/.DS_Store -------------------------------------------------------------------------------- /aula2.2/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alura-es-cursos/1766-nlp-parte1/master/aula2.2/.DS_Store -------------------------------------------------------------------------------- /aula3.1/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alura-es-cursos/1766-nlp-parte1/master/aula3.1/.DS_Store -------------------------------------------------------------------------------- /aula3.2/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alura-es-cursos/1766-nlp-parte1/master/aula3.2/.DS_Store -------------------------------------------------------------------------------- /aula3.3/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alura-es-cursos/1766-nlp-parte1/master/aula3.3/.DS_Store -------------------------------------------------------------------------------- /aula1.2/Aula1.ipynb: -------------------------------------------------------------------------------- 
1 | {"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"name":"Aula1.ipynb","provenance":[],"collapsed_sections":[],"authorship_tag":"ABX9TyN4PHgDehaK8HjAOWPbiBv3"},"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"name":"python"}},"cells":[{"cell_type":"code","metadata":{"id":"RlMrNcDhifwZ"},"source":["import pandas as pd\n","pd.set_option('display.max_rows', None)\n","pd.set_option('display.max_columns', None)\n","pd.set_option('display.width', None)\n","pd.set_option('display.max_colwidth', None)\n","import warnings\n","warnings.filterwarnings(\"ignore\")"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"Xvu4B5ndijmx"},"source":["df = pd.read_csv('/content/sample_data/colombian_elections.csv')"],"execution_count":null,"outputs":[]}]} -------------------------------------------------------------------------------- /aula1.3/aula1_3.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "aula1.3.ipynb", 7 | "provenance": [], 8 | "collapsed_sections": [] 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | }, 14 | "language_info": { 15 | "name": "python" 16 | } 17 | }, 18 | "cells": [ 19 | { 20 | "cell_type": "code", 21 | "metadata": { 22 | "id": "dr-kjMPUrfCn" 23 | }, 24 | "source": [ 25 | "import pandas as pd\n", 26 | "pd.set_option('display.max_rows', None)\n", 27 | "pd.set_option('display.max_columns', None)\n", 28 | "pd.set_option('display.width', None)\n", 29 | "pd.set_option('display.max_colwidth', None)\n", 30 | "import warnings\n", 31 | "warnings.filterwarnings(\"ignore\")" 32 | ], 33 | "execution_count": null, 34 | "outputs": [] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "metadata": { 39 | "id": "M8NSaAocriEo" 40 | }, 41 | "source": [ 42 | "df = pd.read_csv('/content/sample_data/colombian_elections.csv')" 43 | ], 44 | "execution_count": 
null, 45 | "outputs": [] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "metadata": { 50 | "id": "_HGAVlSSri8M" 51 | }, 52 | "source": [ 53 | "df = df[['tweetText','polarity_value']].rename(columns={\"tweetText\":\"texto\",\"polarity_value\":\"sentimiento\"})" 54 | ], 55 | "execution_count": null, 56 | "outputs": [] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "metadata": { 61 | "id": "AVilnlMOrlnb" 62 | }, 63 | "source": [ 64 | "df.head()" 65 | ], 66 | "execution_count": null, 67 | "outputs": [] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "metadata": { 72 | "id": "4ik-aoQQrpO_" 73 | }, 74 | "source": [ 75 | "df['sentimiento'].value_counts()" 76 | ], 77 | "execution_count": null, 78 | "outputs": [] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "metadata": { 83 | "id": "yqzEp7d7rtci" 84 | }, 85 | "source": [ 86 | "from sklearn.model_selection import train_test_split" 87 | ], 88 | "execution_count": null, 89 | "outputs": [] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "metadata": { 94 | "id": "p5TcTZ3Jrv4X" 95 | }, 96 | "source": [ 97 | "X_train, X_test, y_train,y_test = train_test_split(df.texto,df.sentimiento,random_state = 41)" 98 | ], 99 | "execution_count": null, 100 | "outputs": [] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "metadata": { 105 | "id": "9iAwVXaUryE5" 106 | }, 107 | "source": [ 108 | "X_train" 109 | ], 110 | "execution_count": null, 111 | "outputs": [] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "metadata": { 116 | "id": "clmJfHiUry8f" 117 | }, 118 | "source": [ 119 | "from sklearn.linear_model import LinearRegression\n", 120 | "from sklearn.linear_model import LogisticRegression" 121 | ], 122 | "execution_count": null, 123 | "outputs": [] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "metadata": { 128 | "id": "AzZEsFrvr29B" 129 | }, 130 | "source": [ 131 | "regression = LinearRegression()\n", 132 | "regression.fit(X_train,y_train)\n", 133 | "acc = regression.score(X_test,y_test)\n", 134 | "print(acc)" 135 | ], 136 | 
"execution_count": null, 137 | "outputs": [] 138 | } 139 | ] 140 | } -------------------------------------------------------------------------------- /aula1.4/aula1_4.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "aula1.4.ipynb", 7 | "provenance": [], 8 | "collapsed_sections": [] 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | }, 14 | "language_info": { 15 | "name": "python" 16 | } 17 | }, 18 | "cells": [ 19 | { 20 | "cell_type": "code", 21 | "metadata": { 22 | "id": "dr-kjMPUrfCn" 23 | }, 24 | "source": [ 25 | "import pandas as pd\n", 26 | "pd.set_option('display.max_rows', None)\n", 27 | "pd.set_option('display.max_columns', None)\n", 28 | "pd.set_option('display.width', None)\n", 29 | "pd.set_option('display.max_colwidth', None)\n", 30 | "import warnings\n", 31 | "warnings.filterwarnings(\"ignore\")" 32 | ], 33 | "execution_count": null, 34 | "outputs": [] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "metadata": { 39 | "id": "M8NSaAocriEo" 40 | }, 41 | "source": [ 42 | "df = pd.read_csv('/content/sample_data/colombian_elections.csv')" 43 | ], 44 | "execution_count": null, 45 | "outputs": [] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "metadata": { 50 | "id": "_HGAVlSSri8M" 51 | }, 52 | "source": [ 53 | "df = df[['tweetText','polarity_value']].rename(columns={\"tweetText\":\"texto\",\"polarity_value\":\"sentimiento\"})" 54 | ], 55 | "execution_count": null, 56 | "outputs": [] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "metadata": { 61 | "id": "AVilnlMOrlnb" 62 | }, 63 | "source": [ 64 | "df.head()" 65 | ], 66 | "execution_count": null, 67 | "outputs": [] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "metadata": { 72 | "id": "4ik-aoQQrpO_" 73 | }, 74 | "source": [ 75 | "df['sentimiento'].value_counts()" 76 | ], 77 | "execution_count": null, 78 | "outputs": [] 79 | }, 80 | { 81 | 
"cell_type": "code", 82 | "metadata": { 83 | "id": "yqzEp7d7rtci" 84 | }, 85 | "source": [ 86 | "from sklearn.model_selection import train_test_split" 87 | ], 88 | "execution_count": null, 89 | "outputs": [] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "metadata": { 94 | "id": "p5TcTZ3Jrv4X" 95 | }, 96 | "source": [ 97 | "X_train, X_test, y_train,y_test = train_test_split(df.texto,df.sentimiento,random_state = 41)" 98 | ], 99 | "execution_count": null, 100 | "outputs": [] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "metadata": { 105 | "id": "9iAwVXaUryE5" 106 | }, 107 | "source": [ 108 | "X_train" 109 | ], 110 | "execution_count": null, 111 | "outputs": [] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "metadata": { 116 | "id": "clmJfHiUry8f" 117 | }, 118 | "source": [ 119 | "from sklearn.linear_model import LinearRegression\n", 120 | "from sklearn.linear_model import LogisticRegression" 121 | ], 122 | "execution_count": null, 123 | "outputs": [] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "metadata": { 128 | "id": "AzZEsFrvr29B" 129 | }, 130 | "source": [ 131 | "regression = LinearRegression()\n", 132 | "regression.fit(X_train,y_train)\n", 133 | "acc = regression.score(X_test,y_test)\n", 134 | "print(acc)" 135 | ], 136 | "execution_count": null, 137 | "outputs": [] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "metadata": { 142 | "id": "Tn_gHwp3suP0" 143 | }, 144 | "source": [ 145 | "df[2:3]" 146 | ], 147 | "execution_count": null, 148 | "outputs": [] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "metadata": { 153 | "id": "qayfItKAsw4J" 154 | }, 155 | "source": [ 156 | "df['sentiment'] = df['sentimiento'].replace([\"N\",\"P\"],[0,1])" 157 | ], 158 | "execution_count": null, 159 | "outputs": [] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "metadata": { 164 | "id": "580QmbeQsy_e" 165 | }, 166 | "source": [ 167 | "df.shape" 168 | ], 169 | "execution_count": null, 170 | "outputs": [] 171 | }, 172 | { 173 | "cell_type": "code", 174 | 
"metadata": { 175 | "id": "-YeouQvKs0_E" 176 | }, 177 | "source": [ 178 | "df.head()" 179 | ], 180 | "execution_count": null, 181 | "outputs": [] 182 | } 183 | ] 184 | } -------------------------------------------------------------------------------- /aula2.2/aula2_2.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "aula2.2.ipynb", 7 | "provenance": [], 8 | "collapsed_sections": [] 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | }, 14 | "language_info": { 15 | "name": "python" 16 | } 17 | }, 18 | "cells": [ 19 | { 20 | "cell_type": "code", 21 | "metadata": { 22 | "id": "dr-kjMPUrfCn" 23 | }, 24 | "source": [ 25 | "import pandas as pd\n", 26 | "pd.set_option('display.max_rows', None)\n", 27 | "pd.set_option('display.max_columns', None)\n", 28 | "pd.set_option('display.width', None)\n", 29 | "pd.set_option('display.max_colwidth', None)\n", 30 | "import warnings\n", 31 | "warnings.filterwarnings(\"ignore\")" 32 | ], 33 | "execution_count": null, 34 | "outputs": [] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "metadata": { 39 | "id": "M8NSaAocriEo" 40 | }, 41 | "source": [ 42 | "df = pd.read_csv('/content/sample_data/colombian_elections.csv')" 43 | ], 44 | "execution_count": null, 45 | "outputs": [] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "metadata": { 50 | "id": "_HGAVlSSri8M" 51 | }, 52 | "source": [ 53 | "df = df[['tweetText','polarity_value']].rename(columns={\"tweetText\":\"texto\",\"polarity_value\":\"sentimiento\"})" 54 | ], 55 | "execution_count": null, 56 | "outputs": [] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "metadata": { 61 | "id": "AVilnlMOrlnb" 62 | }, 63 | "source": [ 64 | "df.head()" 65 | ], 66 | "execution_count": null, 67 | "outputs": [] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "metadata": { 72 | "id": "4ik-aoQQrpO_" 73 | }, 74 | "source": [ 75 | 
"df['sentimiento'].value_counts()" 76 | ], 77 | "execution_count": null, 78 | "outputs": [] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "metadata": { 83 | "id": "yqzEp7d7rtci" 84 | }, 85 | "source": [ 86 | "from sklearn.model_selection import train_test_split" 87 | ], 88 | "execution_count": null, 89 | "outputs": [] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "metadata": { 94 | "id": "p5TcTZ3Jrv4X" 95 | }, 96 | "source": [ 97 | "X_train, X_test, y_train,y_test = train_test_split(df.texto,df.sentimiento,random_state = 41)" 98 | ], 99 | "execution_count": null, 100 | "outputs": [] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "metadata": { 105 | "id": "9iAwVXaUryE5" 106 | }, 107 | "source": [ 108 | "X_train" 109 | ], 110 | "execution_count": null, 111 | "outputs": [] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "metadata": { 116 | "id": "clmJfHiUry8f" 117 | }, 118 | "source": [ 119 | "from sklearn.linear_model import LinearRegression\n", 120 | "from sklearn.linear_model import LogisticRegression" 121 | ], 122 | "execution_count": null, 123 | "outputs": [] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "metadata": { 128 | "id": "AzZEsFrvr29B" 129 | }, 130 | "source": [ 131 | "regression = LinearRegression()\n", 132 | "regression.fit(X_train,y_train)\n", 133 | "acc = regression.score(X_test,y_test)\n", 134 | "print(acc)" 135 | ], 136 | "execution_count": null, 137 | "outputs": [] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "metadata": { 142 | "id": "Tn_gHwp3suP0" 143 | }, 144 | "source": [ 145 | "df[2:3]" 146 | ], 147 | "execution_count": null, 148 | "outputs": [] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "metadata": { 153 | "id": "qayfItKAsw4J" 154 | }, 155 | "source": [ 156 | "df['sentiment'] = df['sentimiento'].replace([\"N\",\"P\"],[0,1])" 157 | ], 158 | "execution_count": null, 159 | "outputs": [] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "metadata": { 164 | "id": "580QmbeQsy_e" 165 | }, 166 | "source": [ 167 | "df.shape" 168 
| ], 169 | "execution_count": null, 170 | "outputs": [] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "metadata": { 175 | "id": "-YeouQvKs0_E" 176 | }, 177 | "source": [ 178 | "df.head()" 179 | ], 180 | "execution_count": null, 181 | "outputs": [] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "metadata": { 186 | "id": "WL_hJo-Ate11" 187 | }, 188 | "source": [ 189 | "from sklearn.feature_extraction.text import CountVectorizer" 190 | ], 191 | "execution_count": null, 192 | "outputs": [] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "metadata": { 197 | "id": "Wp4RJV5ztg3R" 198 | }, 199 | "source": [ 200 | "texto = [\"Las propuestas son buenas\", \"Las propuestas son malas\"]\n", 201 | "\n", 202 | "vectorizer = CountVectorizer(lowercase=False)\n", 203 | "\n", 204 | "bag_of_words = vectorizer.fit_transform(texto)" 205 | ], 206 | "execution_count": null, 207 | "outputs": [] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "metadata": { 212 | "id": "c7siuIvcthlR" 213 | }, 214 | "source": [ 215 | "print(bag_of_words.shape)" 216 | ], 217 | "execution_count": null, 218 | "outputs": [] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "metadata": { 223 | "id": "5bxaOPNEtj0d" 224 | }, 225 | "source": [ 226 | "vectorizer.get_feature_names()" 227 | ], 228 | "execution_count": null, 229 | "outputs": [] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "metadata": { 234 | "id": "n-YN70C_tmyC" 235 | }, 236 | "source": [ 237 | "matrix = pd.DataFrame.sparse.from_spmatrix(bag_of_words, columns = vectorizer.get_feature_names())" 238 | ], 239 | "execution_count": null, 240 | "outputs": [] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "metadata": { 245 | "id": "2KlXL09QtpWb" 246 | }, 247 | "source": [ 248 | "matrix" 249 | ], 250 | "execution_count": null, 251 | "outputs": [] 252 | } 253 | ] 254 | } -------------------------------------------------------------------------------- /aula2.3/aula2_3.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "aula2.3.ipynb", 7 | "provenance": [], 8 | "collapsed_sections": [] 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | }, 14 | "language_info": { 15 | "name": "python" 16 | } 17 | }, 18 | "cells": [ 19 | { 20 | "cell_type": "code", 21 | "metadata": { 22 | "id": "dr-kjMPUrfCn" 23 | }, 24 | "source": [ 25 | "import pandas as pd\n", 26 | "pd.set_option('display.max_rows', None)\n", 27 | "pd.set_option('display.max_columns', None)\n", 28 | "pd.set_option('display.width', None)\n", 29 | "pd.set_option('display.max_colwidth', None)\n", 30 | "import warnings\n", 31 | "warnings.filterwarnings(\"ignore\")" 32 | ], 33 | "execution_count": null, 34 | "outputs": [] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "metadata": { 39 | "id": "M8NSaAocriEo" 40 | }, 41 | "source": [ 42 | "df = pd.read_csv('/content/sample_data/colombian_elections.csv')" 43 | ], 44 | "execution_count": null, 45 | "outputs": [] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "metadata": { 50 | "id": "_HGAVlSSri8M" 51 | }, 52 | "source": [ 53 | "df = df[['tweetText','polarity_value']].rename(columns={\"tweetText\":\"texto\",\"polarity_value\":\"sentimiento\"})" 54 | ], 55 | "execution_count": null, 56 | "outputs": [] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "metadata": { 61 | "id": "AVilnlMOrlnb" 62 | }, 63 | "source": [ 64 | "df.head()" 65 | ], 66 | "execution_count": null, 67 | "outputs": [] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "metadata": { 72 | "id": "4ik-aoQQrpO_" 73 | }, 74 | "source": [ 75 | "df['sentimiento'].value_counts()" 76 | ], 77 | "execution_count": null, 78 | "outputs": [] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "metadata": { 83 | "id": "yqzEp7d7rtci" 84 | }, 85 | "source": [ 86 | "from sklearn.model_selection import train_test_split" 87 | ], 88 | 
"execution_count": null, 89 | "outputs": [] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "metadata": { 94 | "id": "p5TcTZ3Jrv4X" 95 | }, 96 | "source": [ 97 | "X_train, X_test, y_train,y_test = train_test_split(df.texto,df.sentimiento,random_state = 41)" 98 | ], 99 | "execution_count": null, 100 | "outputs": [] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "metadata": { 105 | "id": "9iAwVXaUryE5" 106 | }, 107 | "source": [ 108 | "X_train" 109 | ], 110 | "execution_count": null, 111 | "outputs": [] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "metadata": { 116 | "id": "clmJfHiUry8f" 117 | }, 118 | "source": [ 119 | "from sklearn.linear_model import LinearRegression\n", 120 | "from sklearn.linear_model import LogisticRegression" 121 | ], 122 | "execution_count": null, 123 | "outputs": [] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "metadata": { 128 | "id": "AzZEsFrvr29B" 129 | }, 130 | "source": [ 131 | "regression = LinearRegression()\n", 132 | "regression.fit(X_train,y_train)\n", 133 | "acc = regression.score(X_test,y_test)\n", 134 | "print(acc)" 135 | ], 136 | "execution_count": null, 137 | "outputs": [] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "metadata": { 142 | "id": "Tn_gHwp3suP0" 143 | }, 144 | "source": [ 145 | "df[2:3]" 146 | ], 147 | "execution_count": null, 148 | "outputs": [] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "metadata": { 153 | "id": "qayfItKAsw4J" 154 | }, 155 | "source": [ 156 | "df['sentiment'] = df['sentimiento'].replace([\"N\",\"P\"],[0,1])" 157 | ], 158 | "execution_count": null, 159 | "outputs": [] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "metadata": { 164 | "id": "580QmbeQsy_e" 165 | }, 166 | "source": [ 167 | "df.shape" 168 | ], 169 | "execution_count": null, 170 | "outputs": [] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "metadata": { 175 | "id": "-YeouQvKs0_E" 176 | }, 177 | "source": [ 178 | "df.head()" 179 | ], 180 | "execution_count": null, 181 | "outputs": [] 182 | }, 183 | { 184 | 
"cell_type": "code", 185 | "metadata": { 186 | "id": "WL_hJo-Ate11" 187 | }, 188 | "source": [ 189 | "from sklearn.feature_extraction.text import CountVectorizer" 190 | ], 191 | "execution_count": null, 192 | "outputs": [] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "metadata": { 197 | "id": "Wp4RJV5ztg3R" 198 | }, 199 | "source": [ 200 | "texto = [\"Las propuestas son buenas\", \"Las propuestas son malas\"]\n", 201 | "\n", 202 | "vectorizer = CountVectorizer(lowercase=False)\n", 203 | "\n", 204 | "bag_of_words = vectorizer.fit_transform(texto)" 205 | ], 206 | "execution_count": null, 207 | "outputs": [] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "metadata": { 212 | "id": "c7siuIvcthlR" 213 | }, 214 | "source": [ 215 | "print(bag_of_words.shape)" 216 | ], 217 | "execution_count": null, 218 | "outputs": [] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "metadata": { 223 | "id": "5bxaOPNEtj0d" 224 | }, 225 | "source": [ 226 | "vectorizer.get_feature_names()" 227 | ], 228 | "execution_count": null, 229 | "outputs": [] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "metadata": { 234 | "id": "n-YN70C_tmyC" 235 | }, 236 | "source": [ 237 | "matrix = pd.DataFrame.sparse.from_spmatrix(bag_of_words, columns = vectorizer.get_feature_names())" 238 | ], 239 | "execution_count": null, 240 | "outputs": [] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "metadata": { 245 | "id": "2KlXL09QtpWb" 246 | }, 247 | "source": [ 248 | "matrix" 249 | ], 250 | "execution_count": null, 251 | "outputs": [] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "metadata": { 256 | "id": "WDrvTZEJvk8g" 257 | }, 258 | "source": [ 259 | "vectorizer2 = CountVectorizer(lowercase=False, max_features = 100)\n", 260 | "\n", 261 | "bag_of_words2 = vectorizer2.fit_transform(df.texto)" 262 | ], 263 | "execution_count": null, 264 | "outputs": [] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "metadata": { 269 | "id": "Nk0Wz4yvvnBQ" 270 | }, 271 | "source": [ 272 | 
"print(bag_of_words2.shape)" 273 | ], 274 | "execution_count": null, 275 | "outputs": [] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "metadata": { 280 | "id": "JPMrZQbwvpU-" 281 | }, 282 | "source": [ 283 | "X_train, X_test, y_train,y_test = train_test_split(bag_of_words2,df.sentiment,random_state = 41)" 284 | ], 285 | "execution_count": null, 286 | "outputs": [] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "metadata": { 291 | "id": "e7RCEk-2vrgX" 292 | }, 293 | "source": [ 294 | "regression = LinearRegression()\n", 295 | "regression.fit(X_train,y_train)\n", 296 | "acc = regression.score(X_test,y_test)\n", 297 | "print(acc)" 298 | ], 299 | "execution_count": null, 300 | "outputs": [] 301 | }, 302 | { 303 | "cell_type": "code", 304 | "metadata": { 305 | "id": "z4OAc1opvuFd" 306 | }, 307 | "source": [ 308 | "regression = LogisticRegression()\n", 309 | "regression.fit(X_train,y_train)\n", 310 | "acc = regression.score(X_test,y_test)\n", 311 | "print(acc)" 312 | ], 313 | "execution_count": null, 314 | "outputs": [] 315 | } 316 | ] 317 | } -------------------------------------------------------------------------------- /aula3.1/aula3_1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "aula3.1.ipynb", 7 | "provenance": [], 8 | "collapsed_sections": [] 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | }, 14 | "language_info": { 15 | "name": "python" 16 | } 17 | }, 18 | "cells": [ 19 | { 20 | "cell_type": "code", 21 | "metadata": { 22 | "id": "dr-kjMPUrfCn" 23 | }, 24 | "source": [ 25 | "import pandas as pd\n", 26 | "pd.set_option('display.max_rows', None)\n", 27 | "pd.set_option('display.max_columns', None)\n", 28 | "pd.set_option('display.width', None)\n", 29 | "pd.set_option('display.max_colwidth', None)\n", 30 | "import warnings\n", 31 | "warnings.filterwarnings(\"ignore\")" 32 | ], 33 | 
"execution_count": null, 34 | "outputs": [] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "metadata": { 39 | "id": "M8NSaAocriEo" 40 | }, 41 | "source": [ 42 | "df = pd.read_csv('/content/sample_data/colombian_elections.csv')" 43 | ], 44 | "execution_count": null, 45 | "outputs": [] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "metadata": { 50 | "id": "_HGAVlSSri8M" 51 | }, 52 | "source": [ 53 | "df = df[['tweetText','polarity_value']].rename(columns={\"tweetText\":\"texto\",\"polarity_value\":\"sentimiento\"})" 54 | ], 55 | "execution_count": null, 56 | "outputs": [] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "metadata": { 61 | "id": "AVilnlMOrlnb" 62 | }, 63 | "source": [ 64 | "df.head()" 65 | ], 66 | "execution_count": null, 67 | "outputs": [] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "metadata": { 72 | "id": "4ik-aoQQrpO_" 73 | }, 74 | "source": [ 75 | "df['sentimiento'].value_counts()" 76 | ], 77 | "execution_count": null, 78 | "outputs": [] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "metadata": { 83 | "id": "yqzEp7d7rtci" 84 | }, 85 | "source": [ 86 | "from sklearn.model_selection import train_test_split" 87 | ], 88 | "execution_count": null, 89 | "outputs": [] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "metadata": { 94 | "id": "p5TcTZ3Jrv4X" 95 | }, 96 | "source": [ 97 | "X_train, X_test, y_train,y_test = train_test_split(df.texto,df.sentimiento,random_state = 41)" 98 | ], 99 | "execution_count": null, 100 | "outputs": [] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "metadata": { 105 | "id": "9iAwVXaUryE5" 106 | }, 107 | "source": [ 108 | "X_train" 109 | ], 110 | "execution_count": null, 111 | "outputs": [] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "metadata": { 116 | "id": "clmJfHiUry8f" 117 | }, 118 | "source": [ 119 | "from sklearn.linear_model import LinearRegression\n", 120 | "from sklearn.linear_model import LogisticRegression" 121 | ], 122 | "execution_count": null, 123 | "outputs": [] 124 | }, 125 | { 126 | "cell_type": 
"code", 127 | "metadata": { 128 | "id": "AzZEsFrvr29B" 129 | }, 130 | "source": [ 131 | "regression = LinearRegression()\n", 132 | "regression.fit(X_train,y_train)\n", 133 | "acc = regression.score(X_test,y_test)\n", 134 | "print(acc)" 135 | ], 136 | "execution_count": null, 137 | "outputs": [] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "metadata": { 142 | "id": "Tn_gHwp3suP0" 143 | }, 144 | "source": [ 145 | "df[2:3]" 146 | ], 147 | "execution_count": null, 148 | "outputs": [] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "metadata": { 153 | "id": "qayfItKAsw4J" 154 | }, 155 | "source": [ 156 | "df['sentiment'] = df['sentimiento'].replace([\"N\",\"P\"],[0,1])" 157 | ], 158 | "execution_count": null, 159 | "outputs": [] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "metadata": { 164 | "id": "580QmbeQsy_e" 165 | }, 166 | "source": [ 167 | "df.shape" 168 | ], 169 | "execution_count": null, 170 | "outputs": [] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "metadata": { 175 | "id": "-YeouQvKs0_E" 176 | }, 177 | "source": [ 178 | "df.head()" 179 | ], 180 | "execution_count": null, 181 | "outputs": [] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "metadata": { 186 | "id": "WL_hJo-Ate11" 187 | }, 188 | "source": [ 189 | "from sklearn.feature_extraction.text import CountVectorizer" 190 | ], 191 | "execution_count": null, 192 | "outputs": [] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "metadata": { 197 | "id": "Wp4RJV5ztg3R" 198 | }, 199 | "source": [ 200 | "texto = [\"Las propuestas son buenas\", \"Las propuestas son malas\"]\n", 201 | "\n", 202 | "vectorizer = CountVectorizer(lowercase=False)\n", 203 | "\n", 204 | "bag_of_words = vectorizer.fit_transform(texto)" 205 | ], 206 | "execution_count": null, 207 | "outputs": [] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "metadata": { 212 | "id": "c7siuIvcthlR" 213 | }, 214 | "source": [ 215 | "print(bag_of_words.shape)" 216 | ], 217 | "execution_count": null, 218 | "outputs": [] 219 | }, 220 | 
{ 221 | "cell_type": "code", 222 | "metadata": { 223 | "id": "5bxaOPNEtj0d" 224 | }, 225 | "source": [ 226 | "vectorizer.get_feature_names()" 227 | ], 228 | "execution_count": null, 229 | "outputs": [] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "metadata": { 234 | "id": "n-YN70C_tmyC" 235 | }, 236 | "source": [ 237 | "matrix = pd.DataFrame.sparse.from_spmatrix(bag_of_words, columns = vectorizer.get_feature_names())" 238 | ], 239 | "execution_count": null, 240 | "outputs": [] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "metadata": { 245 | "id": "2KlXL09QtpWb" 246 | }, 247 | "source": [ 248 | "matrix" 249 | ], 250 | "execution_count": null, 251 | "outputs": [] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "metadata": { 256 | "id": "WDrvTZEJvk8g" 257 | }, 258 | "source": [ 259 | "vectorizer2 = CountVectorizer(lowercase=False, max_features = 100)\n", 260 | "\n", 261 | "bag_of_words2 = vectorizer2.fit_transform(df.texto)" 262 | ], 263 | "execution_count": null, 264 | "outputs": [] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "metadata": { 269 | "id": "Nk0Wz4yvvnBQ" 270 | }, 271 | "source": [ 272 | "print(bag_of_words2.shape)" 273 | ], 274 | "execution_count": null, 275 | "outputs": [] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "metadata": { 280 | "id": "JPMrZQbwvpU-" 281 | }, 282 | "source": [ 283 | "X_train, X_test, y_train,y_test = train_test_split(bag_of_words2,df.sentiment,random_state = 41)" 284 | ], 285 | "execution_count": null, 286 | "outputs": [] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "metadata": { 291 | "id": "e7RCEk-2vrgX" 292 | }, 293 | "source": [ 294 | "regression = LinearRegression()\n", 295 | "regression.fit(X_train,y_train)\n", 296 | "acc = regression.score(X_test,y_test)\n", 297 | "print(acc)" 298 | ], 299 | "execution_count": null, 300 | "outputs": [] 301 | }, 302 | { 303 | "cell_type": "code", 304 | "metadata": { 305 | "id": "z4OAc1opvuFd" 306 | }, 307 | "source": [ 308 | "regression = LogisticRegression()\n", 
309 | "regression.fit(X_train,y_train)\n", 310 | "acc = regression.score(X_test,y_test)\n", 311 | "print(acc)" 312 | ], 313 | "execution_count": null, 314 | "outputs": [] 315 | }, 316 | { 317 | "cell_type": "code", 318 | "metadata": { 319 | "id": "WSk1i52HxTm7" 320 | }, 321 | "source": [ 322 | "def clasificador(texto, columna_texto, categoria):\n", 323 | " vectorizer = CountVectorizer(lowercase=False, max_features = 400)\n", 324 | " bag_of_words = vectorizer.fit_transform(texto[columna_texto])\n", 325 | " X_train, X_test, y_train,y_test = train_test_split(bag_of_words,texto[categoria],random_state = 41)\n", 326 | " regression = LogisticRegression()\n", 327 | " regression.fit(X_train,y_train)\n", 328 | " return regression.score(X_test,y_test)" 329 | ], 330 | "execution_count": null, 331 | "outputs": [] 332 | }, 333 | { 334 | "cell_type": "code", 335 | "metadata": { 336 | "id": "SWRq_PrBxXBA" 337 | }, 338 | "source": [ 339 | "clasificador(df, \"texto\", \"sentiment\")" 340 | ], 341 | "execution_count": null, 342 | "outputs": [] 343 | } 344 | ] 345 | } -------------------------------------------------------------------------------- /aula3.2/aula3_2.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "aula3.2.ipynb", 7 | "provenance": [], 8 | "collapsed_sections": [] 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | }, 14 | "language_info": { 15 | "name": "python" 16 | } 17 | }, 18 | "cells": [ 19 | { 20 | "cell_type": "code", 21 | "metadata": { 22 | "id": "dr-kjMPUrfCn" 23 | }, 24 | "source": [ 25 | "import pandas as pd\n", 26 | "pd.set_option('display.max_rows', None)\n", 27 | "pd.set_option('display.max_columns', None)\n", 28 | "pd.set_option('display.width', None)\n", 29 | "pd.set_option('display.max_colwidth', None)\n", 30 | "import warnings\n", 31 | "warnings.filterwarnings(\"ignore\")" 32 | ], 
33 | "execution_count": null, 34 | "outputs": [] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "metadata": { 39 | "id": "M8NSaAocriEo" 40 | }, 41 | "source": [ 42 | "df = pd.read_csv('/content/sample_data/colombian_elections.csv')" 43 | ], 44 | "execution_count": null, 45 | "outputs": [] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "metadata": { 50 | "id": "_HGAVlSSri8M" 51 | }, 52 | "source": [ 53 | "df = df[['tweetText','polarity_value']].rename(columns={\"tweetText\":\"texto\",\"polarity_value\":\"sentimiento\"})" 54 | ], 55 | "execution_count": null, 56 | "outputs": [] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "metadata": { 61 | "id": "AVilnlMOrlnb" 62 | }, 63 | "source": [ 64 | "df.head()" 65 | ], 66 | "execution_count": null, 67 | "outputs": [] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "metadata": { 72 | "id": "4ik-aoQQrpO_" 73 | }, 74 | "source": [ 75 | "df['sentimiento'].value_counts()" 76 | ], 77 | "execution_count": null, 78 | "outputs": [] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "metadata": { 83 | "id": "yqzEp7d7rtci" 84 | }, 85 | "source": [ 86 | "from sklearn.model_selection import train_test_split" 87 | ], 88 | "execution_count": null, 89 | "outputs": [] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "metadata": { 94 | "id": "p5TcTZ3Jrv4X" 95 | }, 96 | "source": [ 97 | "X_train, X_test, y_train,y_test = train_test_split(df.texto,df.sentimiento,random_state = 41)" 98 | ], 99 | "execution_count": null, 100 | "outputs": [] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "metadata": { 105 | "id": "9iAwVXaUryE5" 106 | }, 107 | "source": [ 108 | "X_train" 109 | ], 110 | "execution_count": null, 111 | "outputs": [] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "metadata": { 116 | "id": "clmJfHiUry8f" 117 | }, 118 | "source": [ 119 | "from sklearn.linear_model import LinearRegression\n", 120 | "from sklearn.linear_model import LogisticRegression" 121 | ], 122 | "execution_count": null, 123 | "outputs": [] 124 | }, 125 | { 126 | 
"cell_type": "code", 127 | "metadata": { 128 | "id": "AzZEsFrvr29B" 129 | }, 130 | "source": [ 131 | "regression = LinearRegression()\n", 132 | "regression.fit(X_train,y_train)\n", 133 | "acc = regression.score(X_test,y_test)\n", 134 | "print(acc)" 135 | ], 136 | "execution_count": null, 137 | "outputs": [] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "metadata": { 142 | "id": "Tn_gHwp3suP0" 143 | }, 144 | "source": [ 145 | "df[2:3]" 146 | ], 147 | "execution_count": null, 148 | "outputs": [] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "metadata": { 153 | "id": "qayfItKAsw4J" 154 | }, 155 | "source": [ 156 | "df['sentiment'] = df['sentimiento'].replace([\"N\",\"P\"],[0,1])" 157 | ], 158 | "execution_count": null, 159 | "outputs": [] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "metadata": { 164 | "id": "580QmbeQsy_e" 165 | }, 166 | "source": [ 167 | "df.shape" 168 | ], 169 | "execution_count": null, 170 | "outputs": [] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "metadata": { 175 | "id": "-YeouQvKs0_E" 176 | }, 177 | "source": [ 178 | "df.head()" 179 | ], 180 | "execution_count": null, 181 | "outputs": [] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "metadata": { 186 | "id": "WL_hJo-Ate11" 187 | }, 188 | "source": [ 189 | "from sklearn.feature_extraction.text import CountVectorizer" 190 | ], 191 | "execution_count": null, 192 | "outputs": [] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "metadata": { 197 | "id": "Wp4RJV5ztg3R" 198 | }, 199 | "source": [ 200 | "texto = [\"Las propuestas son buenas\", \"Las propuestas son malas\"]\n", 201 | "\n", 202 | "vectorizer = CountVectorizer(lowercase=False)\n", 203 | "\n", 204 | "bag_of_words = vectorizer.fit_transform(texto)" 205 | ], 206 | "execution_count": null, 207 | "outputs": [] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "metadata": { 212 | "id": "c7siuIvcthlR" 213 | }, 214 | "source": [ 215 | "print(bag_of_words.shape)" 216 | ], 217 | "execution_count": null, 218 | "outputs": [] 
219 | }, 220 | { 221 | "cell_type": "code", 222 | "metadata": { 223 | "id": "5bxaOPNEtj0d" 224 | }, 225 | "source": [ 226 | "vectorizer.get_feature_names()" 227 | ], 228 | "execution_count": null, 229 | "outputs": [] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "metadata": { 234 | "id": "n-YN70C_tmyC" 235 | }, 236 | "source": [ 237 | "matrix = pd.DataFrame.sparse.from_spmatrix(bag_of_words, columns = vectorizer.get_feature_names())" 238 | ], 239 | "execution_count": null, 240 | "outputs": [] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "metadata": { 245 | "id": "2KlXL09QtpWb" 246 | }, 247 | "source": [ 248 | "matrix" 249 | ], 250 | "execution_count": null, 251 | "outputs": [] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "metadata": { 256 | "id": "WDrvTZEJvk8g" 257 | }, 258 | "source": [ 259 | "vectorizer2 = CountVectorizer(lowercase=False, max_features = 100)\n", 260 | "\n", 261 | "bag_of_words2 = vectorizer2.fit_transform(df.texto)" 262 | ], 263 | "execution_count": null, 264 | "outputs": [] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "metadata": { 269 | "id": "Nk0Wz4yvvnBQ" 270 | }, 271 | "source": [ 272 | "print(bag_of_words2.shape)" 273 | ], 274 | "execution_count": null, 275 | "outputs": [] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "metadata": { 280 | "id": "JPMrZQbwvpU-" 281 | }, 282 | "source": [ 283 | "X_train, X_test, y_train,y_test = train_test_split(bag_of_words2,df.sentiment,random_state = 41)" 284 | ], 285 | "execution_count": null, 286 | "outputs": [] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "metadata": { 291 | "id": "e7RCEk-2vrgX" 292 | }, 293 | "source": [ 294 | "regression = LinearRegression()\n", 295 | "regression.fit(X_train,y_train)\n", 296 | "acc = regression.score(X_test,y_test)\n", 297 | "print(acc)" 298 | ], 299 | "execution_count": null, 300 | "outputs": [] 301 | }, 302 | { 303 | "cell_type": "code", 304 | "metadata": { 305 | "id": "z4OAc1opvuFd" 306 | }, 307 | "source": [ 308 | "regression = 
LogisticRegression()\n", 309 | "regression.fit(X_train,y_train)\n", 310 | "acc = regression.score(X_test,y_test)\n", 311 | "print(acc)" 312 | ], 313 | "execution_count": null, 314 | "outputs": [] 315 | }, 316 | { 317 | "cell_type": "code", 318 | "metadata": { 319 | "id": "WSk1i52HxTm7" 320 | }, 321 | "source": [ 322 | "def clasificador(texto, columna_texto, categoria):\n", 323 | " vectorizer = CountVectorizer(lowercase=False, max_features = 400)\n", 324 | " bag_of_words = vectorizer.fit_transform(texto[columna_texto])\n", 325 | " X_train, X_test, y_train,y_test = train_test_split(bag_of_words,texto[categoria],random_state = 41)\n", 326 | " regression = LogisticRegression()\n", 327 | " regression.fit(X_train,y_train)\n", 328 | " return regression.score(X_test,y_test)" 329 | ], 330 | "execution_count": null, 331 | "outputs": [] 332 | }, 333 | { 334 | "cell_type": "code", 335 | "metadata": { 336 | "id": "SWRq_PrBxXBA" 337 | }, 338 | "source": [ 339 | "clasificador(df, \"texto\", \"sentiment\")" 340 | ], 341 | "execution_count": null, 342 | "outputs": [] 343 | }, 344 | { 345 | "cell_type": "code", 346 | "metadata": { 347 | "id": "PLApD0-ax9S2" 348 | }, 349 | "source": [ 350 | "from wordcloud import WordCloud" 351 | ], 352 | "execution_count": null, 353 | "outputs": [] 354 | }, 355 | { 356 | "cell_type": "code", 357 | "metadata": { 358 | "id": "Akv_WP8Jx_wQ" 359 | }, 360 | "source": [ 361 | "palabras = ' '.join([palabras for palabras in df['texto']])" 362 | ], 363 | "execution_count": null, 364 | "outputs": [] 365 | }, 366 | { 367 | "cell_type": "code", 368 | "metadata": { 369 | "id": "ohhipZYbyCBs" 370 | }, 371 | "source": [ 372 | "palabras" 373 | ], 374 | "execution_count": null, 375 | "outputs": [] 376 | }, 377 | { 378 | "cell_type": "code", 379 | "metadata": { 380 | "id": "RS0fdGKdyEL2" 381 | }, 382 | "source": [ 383 | "len(palabras)" 384 | ], 385 | "execution_count": null, 386 | "outputs": [] 387 | }, 388 | { 389 | "cell_type": "code", 390 | "metadata": { 391 | 
"id": "W18Mv1_gyHPU" 392 | }, 393 | "source": [ 394 | "import matplotlib.pyplot as plt\n", 395 | "%matplotlib inline \n", 396 | "\n", 397 | "def plot_cloud(wordcloud):\n", 398 | " plt.figure(figsize=(10,5))\n", 399 | " plt.imshow(wordcloud, interpolation = 'bilinear')\n", 400 | " plt.axis('off')" 401 | ], 402 | "execution_count": null, 403 | "outputs": [] 404 | }, 405 | { 406 | "cell_type": "code", 407 | "metadata": { 408 | "id": "EaKziQPzyKS9" 409 | }, 410 | "source": [ 411 | "wordcloud = WordCloud(width=800, height=500, collocations=False).generate(palabras)\n", 412 | "plot_cloud(wordcloud)" 413 | ], 414 | "execution_count": null, 415 | "outputs": [] 416 | } 417 | ] 418 | } -------------------------------------------------------------------------------- /aula3.3/aula3_3.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "aula3.2.ipynb", 7 | "provenance": [], 8 | "collapsed_sections": [] 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | }, 14 | "language_info": { 15 | "name": "python" 16 | } 17 | }, 18 | "cells": [ 19 | { 20 | "cell_type": "code", 21 | "metadata": { 22 | "id": "dr-kjMPUrfCn" 23 | }, 24 | "source": [ 25 | "import pandas as pd\n", 26 | "pd.set_option('display.max_rows', None)\n", 27 | "pd.set_option('display.max_columns', None)\n", 28 | "pd.set_option('display.width', None)\n", 29 | "pd.set_option('display.max_colwidth', None)\n", 30 | "import warnings\n", 31 | "warnings.filterwarnings(\"ignore\")" 32 | ], 33 | "execution_count": null, 34 | "outputs": [] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "metadata": { 39 | "id": "M8NSaAocriEo" 40 | }, 41 | "source": [ 42 | "df = pd.read_csv('/content/sample_data/colombian_elections.csv')" 43 | ], 44 | "execution_count": null, 45 | "outputs": [] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "metadata": { 50 | "id": "_HGAVlSSri8M" 51 | }, 
52 | "source": [ 53 | "df = df[['tweetText','polarity_value']].rename(columns={\"tweetText\":\"texto\",\"polarity_value\":\"sentimiento\"})" 54 | ], 55 | "execution_count": null, 56 | "outputs": [] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "metadata": { 61 | "id": "AVilnlMOrlnb" 62 | }, 63 | "source": [ 64 | "df.head()" 65 | ], 66 | "execution_count": null, 67 | "outputs": [] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "metadata": { 72 | "id": "4ik-aoQQrpO_" 73 | }, 74 | "source": [ 75 | "df['sentimiento'].value_counts()" 76 | ], 77 | "execution_count": null, 78 | "outputs": [] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "metadata": { 83 | "id": "yqzEp7d7rtci" 84 | }, 85 | "source": [ 86 | "from sklearn.model_selection import train_test_split" 87 | ], 88 | "execution_count": null, 89 | "outputs": [] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "metadata": { 94 | "id": "p5TcTZ3Jrv4X" 95 | }, 96 | "source": [ 97 | "X_train, X_test, y_train,y_test = train_test_split(df.texto,df.sentimiento,random_state = 41)" 98 | ], 99 | "execution_count": null, 100 | "outputs": [] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "metadata": { 105 | "id": "9iAwVXaUryE5" 106 | }, 107 | "source": [ 108 | "X_train" 109 | ], 110 | "execution_count": null, 111 | "outputs": [] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "metadata": { 116 | "id": "clmJfHiUry8f" 117 | }, 118 | "source": [ 119 | "from sklearn.linear_model import LinearRegression\n", 120 | "from sklearn.linear_model import LogisticRegression" 121 | ], 122 | "execution_count": null, 123 | "outputs": [] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "metadata": { 128 | "id": "AzZEsFrvr29B" 129 | }, 130 | "source": [ 131 | "regression = LinearRegression()\n", 132 | "regression.fit(X_train,y_train)\n", 133 | "acc = regression.score(X_test,y_test)\n", 134 | "print(acc)" 135 | ], 136 | "execution_count": null, 137 | "outputs": [] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "metadata": { 142 | "id": 
"Tn_gHwp3suP0" 143 | }, 144 | "source": [ 145 | "df[2:3]" 146 | ], 147 | "execution_count": null, 148 | "outputs": [] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "metadata": { 153 | "id": "qayfItKAsw4J" 154 | }, 155 | "source": [ 156 | "df['sentiment'] = df['sentimiento'].replace([\"N\",\"P\"],[0,1])" 157 | ], 158 | "execution_count": null, 159 | "outputs": [] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "metadata": { 164 | "id": "580QmbeQsy_e" 165 | }, 166 | "source": [ 167 | "df.shape" 168 | ], 169 | "execution_count": null, 170 | "outputs": [] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "metadata": { 175 | "id": "-YeouQvKs0_E" 176 | }, 177 | "source": [ 178 | "df.head()" 179 | ], 180 | "execution_count": null, 181 | "outputs": [] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "metadata": { 186 | "id": "WL_hJo-Ate11" 187 | }, 188 | "source": [ 189 | "from sklearn.feature_extraction.text import CountVectorizer" 190 | ], 191 | "execution_count": null, 192 | "outputs": [] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "metadata": { 197 | "id": "Wp4RJV5ztg3R" 198 | }, 199 | "source": [ 200 | "texto = [\"Las propuestas son buenas\", \"Las propuestas son malas\"]\n", 201 | "\n", 202 | "vectorizer = CountVectorizer(lowercase=False)\n", 203 | "\n", 204 | "bag_of_words = vectorizer.fit_transform(texto)" 205 | ], 206 | "execution_count": null, 207 | "outputs": [] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "metadata": { 212 | "id": "c7siuIvcthlR" 213 | }, 214 | "source": [ 215 | "print(bag_of_words.shape)" 216 | ], 217 | "execution_count": null, 218 | "outputs": [] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "metadata": { 223 | "id": "5bxaOPNEtj0d" 224 | }, 225 | "source": [ 226 | "vectorizer.get_feature_names()" 227 | ], 228 | "execution_count": null, 229 | "outputs": [] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "metadata": { 234 | "id": "n-YN70C_tmyC" 235 | }, 236 | "source": [ 237 | "matrix = 
pd.DataFrame.sparse.from_spmatrix(bag_of_words, columns = vectorizer.get_feature_names())" 238 | ], 239 | "execution_count": null, 240 | "outputs": [] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "metadata": { 245 | "id": "2KlXL09QtpWb" 246 | }, 247 | "source": [ 248 | "matrix" 249 | ], 250 | "execution_count": null, 251 | "outputs": [] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "metadata": { 256 | "id": "WDrvTZEJvk8g" 257 | }, 258 | "source": [ 259 | "vectorizer2 = CountVectorizer(lowercase=False, max_features = 100)\n", 260 | "\n", 261 | "bag_of_words2 = vectorizer2.fit_transform(df.texto)" 262 | ], 263 | "execution_count": null, 264 | "outputs": [] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "metadata": { 269 | "id": "Nk0Wz4yvvnBQ" 270 | }, 271 | "source": [ 272 | "print(bag_of_words2.shape)" 273 | ], 274 | "execution_count": null, 275 | "outputs": [] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "metadata": { 280 | "id": "JPMrZQbwvpU-" 281 | }, 282 | "source": [ 283 | "X_train, X_test, y_train,y_test = train_test_split(bag_of_words2,df.sentiment,random_state = 41)" 284 | ], 285 | "execution_count": null, 286 | "outputs": [] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "metadata": { 291 | "id": "e7RCEk-2vrgX" 292 | }, 293 | "source": [ 294 | "regression = LinearRegression()\n", 295 | "regression.fit(X_train,y_train)\n", 296 | "acc = regression.score(X_test,y_test)\n", 297 | "print(acc)" 298 | ], 299 | "execution_count": null, 300 | "outputs": [] 301 | }, 302 | { 303 | "cell_type": "code", 304 | "metadata": { 305 | "id": "z4OAc1opvuFd" 306 | }, 307 | "source": [ 308 | "regression = LogisticRegression()\n", 309 | "regression.fit(X_train,y_train)\n", 310 | "acc = regression.score(X_test,y_test)\n", 311 | "print(acc)" 312 | ], 313 | "execution_count": null, 314 | "outputs": [] 315 | }, 316 | { 317 | "cell_type": "code", 318 | "metadata": { 319 | "id": "WSk1i52HxTm7" 320 | }, 321 | "source": [ 322 | "def clasificador(texto, columna_texto, 
categoria):\n", 323 | " vectorizer = CountVectorizer(lowercase=False, max_features = 400)\n", 324 | " bag_of_words = vectorizer.fit_transform(texto[columna_texto])\n", 325 | " X_train, X_test, y_train,y_test = train_test_split(bag_of_words,texto[categoria],random_state = 41)\n", 326 | " regression = LogisticRegression()\n", 327 | " regression.fit(X_train,y_train)\n", 328 | " return regression.score(X_test,y_test)" 329 | ], 330 | "execution_count": null, 331 | "outputs": [] 332 | }, 333 | { 334 | "cell_type": "code", 335 | "metadata": { 336 | "id": "SWRq_PrBxXBA" 337 | }, 338 | "source": [ 339 | "clasificador(df, \"texto\", \"sentiment\")" 340 | ], 341 | "execution_count": null, 342 | "outputs": [] 343 | }, 344 | { 345 | "cell_type": "code", 346 | "metadata": { 347 | "id": "PLApD0-ax9S2" 348 | }, 349 | "source": [ 350 | "from wordcloud import WordCloud" 351 | ], 352 | "execution_count": null, 353 | "outputs": [] 354 | }, 355 | { 356 | "cell_type": "code", 357 | "metadata": { 358 | "id": "Akv_WP8Jx_wQ" 359 | }, 360 | "source": [ 361 | "palabras = ' '.join([palabras for palabras in df['texto']])" 362 | ], 363 | "execution_count": null, 364 | "outputs": [] 365 | }, 366 | { 367 | "cell_type": "code", 368 | "metadata": { 369 | "id": "ohhipZYbyCBs" 370 | }, 371 | "source": [ 372 | "palabras" 373 | ], 374 | "execution_count": null, 375 | "outputs": [] 376 | }, 377 | { 378 | "cell_type": "code", 379 | "metadata": { 380 | "id": "RS0fdGKdyEL2" 381 | }, 382 | "source": [ 383 | "len(palabras)" 384 | ], 385 | "execution_count": null, 386 | "outputs": [] 387 | }, 388 | { 389 | "cell_type": "code", 390 | "metadata": { 391 | "id": "W18Mv1_gyHPU" 392 | }, 393 | "source": [ 394 | "import matplotlib.pyplot as plt\n", 395 | "%matplotlib inline \n", 396 | "\n", 397 | "def plot_cloud(wordcloud):\n", 398 | " plt.figure(figsize=(10,5))\n", 399 | " plt.imshow(wordcloud, interpolation = 'bilinear')\n", 400 | " plt.axis('off')" 401 | ], 402 | "execution_count": null, 403 | "outputs": [] 404 | 
}, 405 | { 406 | "cell_type": "code", 407 | "metadata": { 408 | "id": "EaKziQPzyKS9" 409 | }, 410 | "source": [ 411 | "wordcloud = WordCloud(width=800, height=500, collocations=False).generate(palabras)\n", 412 | "plot_cloud(wordcloud)" 413 | ], 414 | "execution_count": null, 415 | "outputs": [] 416 | } 417 | ] 418 | } -------------------------------------------------------------------------------- /aula3.4/aula3_4.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "aula3.4.ipynb", 7 | "provenance": [], 8 | "collapsed_sections": [] 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | }, 14 | "language_info": { 15 | "name": "python" 16 | } 17 | }, 18 | "cells": [ 19 | { 20 | "cell_type": "code", 21 | "metadata": { 22 | "id": "dr-kjMPUrfCn" 23 | }, 24 | "source": [ 25 | "import pandas as pd\n", 26 | "pd.set_option('display.max_rows', None)\n", 27 | "pd.set_option('display.max_columns', None)\n", 28 | "pd.set_option('display.width', None)\n", 29 | "pd.set_option('display.max_colwidth', None)\n", 30 | "import warnings\n", 31 | "warnings.filterwarnings(\"ignore\")" 32 | ], 33 | "execution_count": null, 34 | "outputs": [] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "metadata": { 39 | "id": "M8NSaAocriEo" 40 | }, 41 | "source": [ 42 | "df = pd.read_csv('/content/sample_data/colombian_elections.csv')" 43 | ], 44 | "execution_count": null, 45 | "outputs": [] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "metadata": { 50 | "id": "_HGAVlSSri8M" 51 | }, 52 | "source": [ 53 | "df = df[['tweetText','polarity_value']].rename(columns={\"tweetText\":\"texto\",\"polarity_value\":\"sentimiento\"})" 54 | ], 55 | "execution_count": null, 56 | "outputs": [] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "metadata": { 61 | "id": "AVilnlMOrlnb" 62 | }, 63 | "source": [ 64 | "df.head()" 65 | ], 66 | "execution_count": null, 
67 | "outputs": [] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "metadata": { 72 | "id": "4ik-aoQQrpO_" 73 | }, 74 | "source": [ 75 | "df['sentimiento'].value_counts()" 76 | ], 77 | "execution_count": null, 78 | "outputs": [] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "metadata": { 83 | "id": "yqzEp7d7rtci" 84 | }, 85 | "source": [ 86 | "from sklearn.model_selection import train_test_split" 87 | ], 88 | "execution_count": null, 89 | "outputs": [] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "metadata": { 94 | "id": "p5TcTZ3Jrv4X" 95 | }, 96 | "source": [ 97 | "X_train, X_test, y_train,y_test = train_test_split(df.texto,df.sentimiento,random_state = 41)" 98 | ], 99 | "execution_count": null, 100 | "outputs": [] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "metadata": { 105 | "id": "9iAwVXaUryE5" 106 | }, 107 | "source": [ 108 | "X_train" 109 | ], 110 | "execution_count": null, 111 | "outputs": [] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "metadata": { 116 | "id": "clmJfHiUry8f" 117 | }, 118 | "source": [ 119 | "from sklearn.linear_model import LinearRegression\n", 120 | "from sklearn.linear_model import LogisticRegression" 121 | ], 122 | "execution_count": null, 123 | "outputs": [] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "metadata": { 128 | "id": "AzZEsFrvr29B" 129 | }, 130 | "source": [ 131 | "regression = LinearRegression()\n", 132 | "regression.fit(X_train,y_train)\n", 133 | "acc = regression.score(X_test,y_test)\n", 134 | "print(acc)" 135 | ], 136 | "execution_count": null, 137 | "outputs": [] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "metadata": { 142 | "id": "Tn_gHwp3suP0" 143 | }, 144 | "source": [ 145 | "df[2:3]" 146 | ], 147 | "execution_count": null, 148 | "outputs": [] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "metadata": { 153 | "id": "qayfItKAsw4J" 154 | }, 155 | "source": [ 156 | "df['sentiment'] = df['sentimiento'].replace([\"N\",\"P\"],[0,1])" 157 | ], 158 | "execution_count": null, 159 | "outputs": [] 160 
| }, 161 | { 162 | "cell_type": "code", 163 | "metadata": { 164 | "id": "580QmbeQsy_e" 165 | }, 166 | "source": [ 167 | "df.shape" 168 | ], 169 | "execution_count": null, 170 | "outputs": [] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "metadata": { 175 | "id": "-YeouQvKs0_E" 176 | }, 177 | "source": [ 178 | "df.head()" 179 | ], 180 | "execution_count": null, 181 | "outputs": [] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "metadata": { 186 | "id": "WL_hJo-Ate11" 187 | }, 188 | "source": [ 189 | "from sklearn.feature_extraction.text import CountVectorizer" 190 | ], 191 | "execution_count": null, 192 | "outputs": [] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "metadata": { 197 | "id": "Wp4RJV5ztg3R" 198 | }, 199 | "source": [ 200 | "texto = [\"Las propuestas son buenas\", \"Las propuestas son malas\"]\n", 201 | "\n", 202 | "vectorizer = CountVectorizer(lowercase=False)\n", 203 | "\n", 204 | "bag_of_words = vectorizer.fit_transform(texto)" 205 | ], 206 | "execution_count": null, 207 | "outputs": [] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "metadata": { 212 | "id": "c7siuIvcthlR" 213 | }, 214 | "source": [ 215 | "print(bag_of_words.shape)" 216 | ], 217 | "execution_count": null, 218 | "outputs": [] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "metadata": { 223 | "id": "5bxaOPNEtj0d" 224 | }, 225 | "source": [ 226 | "vectorizer.get_feature_names()" 227 | ], 228 | "execution_count": null, 229 | "outputs": [] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "metadata": { 234 | "id": "n-YN70C_tmyC" 235 | }, 236 | "source": [ 237 | "matrix = pd.DataFrame.sparse.from_spmatrix(bag_of_words, columns = vectorizer.get_feature_names())" 238 | ], 239 | "execution_count": null, 240 | "outputs": [] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "metadata": { 245 | "id": "2KlXL09QtpWb" 246 | }, 247 | "source": [ 248 | "matrix" 249 | ], 250 | "execution_count": null, 251 | "outputs": [] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "metadata": { 
256 | "id": "WDrvTZEJvk8g" 257 | }, 258 | "source": [ 259 | "vectorizer2 = CountVectorizer(lowercase=False, max_features = 100)\n", 260 | "\n", 261 | "bag_of_words2 = vectorizer2.fit_transform(df.texto)" 262 | ], 263 | "execution_count": null, 264 | "outputs": [] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "metadata": { 269 | "id": "Nk0Wz4yvvnBQ" 270 | }, 271 | "source": [ 272 | "print(bag_of_words2.shape)" 273 | ], 274 | "execution_count": null, 275 | "outputs": [] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "metadata": { 280 | "id": "JPMrZQbwvpU-" 281 | }, 282 | "source": [ 283 | "X_train, X_test, y_train,y_test = train_test_split(bag_of_words2,df.sentiment,random_state = 41)" 284 | ], 285 | "execution_count": null, 286 | "outputs": [] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "metadata": { 291 | "id": "e7RCEk-2vrgX" 292 | }, 293 | "source": [ 294 | "regression = LinearRegression()\n", 295 | "regression.fit(X_train,y_train)\n", 296 | "acc = regression.score(X_test,y_test)\n", 297 | "print(acc)" 298 | ], 299 | "execution_count": null, 300 | "outputs": [] 301 | }, 302 | { 303 | "cell_type": "code", 304 | "metadata": { 305 | "id": "z4OAc1opvuFd" 306 | }, 307 | "source": [ 308 | "regression = LogisticRegression()\n", 309 | "regression.fit(X_train,y_train)\n", 310 | "acc = regression.score(X_test,y_test)\n", 311 | "print(acc)" 312 | ], 313 | "execution_count": null, 314 | "outputs": [] 315 | }, 316 | { 317 | "cell_type": "code", 318 | "metadata": { 319 | "id": "WSk1i52HxTm7" 320 | }, 321 | "source": [ 322 | "def clasificador(texto, columna_texto, categoria):\n", 323 | " vectorizer = CountVectorizer(lowercase=False, max_features = 400)\n", 324 | " bag_of_words = vectorizer.fit_transform(texto[columna_texto])\n", 325 | " X_train, X_test, y_train,y_test = train_test_split(bag_of_words,texto[categoria],random_state = 41)\n", 326 | " regression = LogisticRegression()\n", 327 | " regression.fit(X_train,y_train)\n", 328 | " return 
regression.score(X_test,y_test)" 329 | ], 330 | "execution_count": null, 331 | "outputs": [] 332 | }, 333 | { 334 | "cell_type": "code", 335 | "metadata": { 336 | "id": "SWRq_PrBxXBA" 337 | }, 338 | "source": [ 339 | "clasificador(df, \"texto\", \"sentiment\")" 340 | ], 341 | "execution_count": null, 342 | "outputs": [] 343 | }, 344 | { 345 | "cell_type": "code", 346 | "metadata": { 347 | "id": "PLApD0-ax9S2" 348 | }, 349 | "source": [ 350 | "from wordcloud import WordCloud" 351 | ], 352 | "execution_count": null, 353 | "outputs": [] 354 | }, 355 | { 356 | "cell_type": "code", 357 | "metadata": { 358 | "id": "Akv_WP8Jx_wQ" 359 | }, 360 | "source": [ 361 | "palabras = ' '.join([palabras for palabras in df['texto']])" 362 | ], 363 | "execution_count": null, 364 | "outputs": [] 365 | }, 366 | { 367 | "cell_type": "code", 368 | "metadata": { 369 | "id": "ohhipZYbyCBs" 370 | }, 371 | "source": [ 372 | "palabras" 373 | ], 374 | "execution_count": null, 375 | "outputs": [] 376 | }, 377 | { 378 | "cell_type": "code", 379 | "metadata": { 380 | "id": "RS0fdGKdyEL2" 381 | }, 382 | "source": [ 383 | "len(palabras)" 384 | ], 385 | "execution_count": null, 386 | "outputs": [] 387 | }, 388 | { 389 | "cell_type": "code", 390 | "metadata": { 391 | "id": "W18Mv1_gyHPU" 392 | }, 393 | "source": [ 394 | "import matplotlib.pyplot as plt\n", 395 | "%matplotlib inline \n", 396 | "\n", 397 | "def plot_cloud(wordcloud):\n", 398 | " plt.figure(figsize=(10,5))\n", 399 | " plt.imshow(wordcloud, interpolation = 'bilinear')\n", 400 | " plt.axis('off')" 401 | ], 402 | "execution_count": null, 403 | "outputs": [] 404 | }, 405 | { 406 | "cell_type": "code", 407 | "metadata": { 408 | "id": "EaKziQPzyKS9" 409 | }, 410 | "source": [ 411 | "wordcloud = WordCloud(width=800, height=500, collocations=False).generate(palabras)\n", 412 | "plot_cloud(wordcloud)" 413 | ], 414 | "execution_count": null, 415 | "outputs": [] 416 | }, 417 | { 418 | "cell_type": "code", 419 | "metadata": { 420 | "id": 
"8GV4mEBK2N1u" 421 | }, 422 | "source": [ 423 | "df_pos = df.query(\"sentimiento == 'P'\")" 424 | ], 425 | "execution_count": null, 426 | "outputs": [] 427 | }, 428 | { 429 | "cell_type": "code", 430 | "metadata": { 431 | "id": "KG8ydxdZ2OYh" 432 | }, 433 | "source": [ 434 | "df_pos.head()" 435 | ], 436 | "execution_count": null, 437 | "outputs": [] 438 | }, 439 | { 440 | "cell_type": "code", 441 | "metadata": { 442 | "id": "GdU3bMPg2Qnl" 443 | }, 444 | "source": [ 445 | "def plot_cloud_neg(texto):  # texto: DataFrame with 'texto' and 'sentimiento' columns\n", 446 | " df_neg = texto.query(\"sentimiento == 'N'\")  # filter the DataFrame passed in, not the global df\n", 447 | " palabras = ' '.join([palabras for palabras in df_neg['texto']])\n", 448 | " wordcloud = WordCloud(width=800, height=500, collocations=False).generate(palabras)\n", 449 | " plt.figure(figsize=(10,5))\n", 450 | " plt.imshow(wordcloud, interpolation = 'bilinear')\n", 451 | " plt.axis('off')" 452 | ], 453 | "execution_count": null, 454 | "outputs": [] 455 | }, 456 | { 457 | "cell_type": "code", 458 | "metadata": { 459 | "id": "FfoFWgou2TS6" 460 | }, 461 | "source": [ 462 | "def plot_cloud_pos(texto):  # texto: DataFrame with 'texto' and 'sentimiento' columns\n", 463 | " df_pos = texto.query(\"sentimiento == 'P'\")  # filter the DataFrame passed in, not the global df\n", 464 | " palabras = ' '.join([palabras for palabras in df_pos['texto']])\n", 465 | " wordcloud = WordCloud(width=800, height=500, collocations=False).generate(palabras)\n", 466 | " plt.figure(figsize=(10,5))\n", 467 | " plt.imshow(wordcloud, interpolation = 'bilinear')\n", 468 | " plt.axis('off')" 469 | ], 470 | "execution_count": null, 471 | "outputs": [] 472 | }, 473 | { 474 | "cell_type": "code", 475 | "metadata": { 476 | "id": "kx_cEVie34YC" 477 | }, 478 | "source": [ 479 | "plot_cloud_neg(df)" 480 | ], 481 | "execution_count": null, 482 | "outputs": [] 483 | }, 484 | { 485 | "cell_type": "code", 486 | "metadata": { 487 | "id": "DGQCXBe_36xK" 488 | }, 489 | "source": [ 490 | "plot_cloud_pos(df)" 491 | ], 492 | "execution_count": null, 493 | "outputs": [] 494 | } 495 | ] 496 | } 
-------------------------------------------------------------------------------- /aula4.1/aula4_1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "aula4.1.ipynb", 7 | "provenance": [], 8 | "collapsed_sections": [] 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | }, 14 | "language_info": { 15 | "name": "python" 16 | } 17 | }, 18 | "cells": [ 19 | { 20 | "cell_type": "code", 21 | "metadata": { 22 | "id": "dr-kjMPUrfCn" 23 | }, 24 | "source": [ 25 | "import pandas as pd\n", 26 | "pd.set_option('display.max_rows', None)\n", 27 | "pd.set_option('display.max_columns', None)\n", 28 | "pd.set_option('display.width', None)\n", 29 | "pd.set_option('display.max_colwidth', None)\n", 30 | "import warnings\n", 31 | "warnings.filterwarnings(\"ignore\")" 32 | ], 33 | "execution_count": null, 34 | "outputs": [] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "metadata": { 39 | "id": "M8NSaAocriEo" 40 | }, 41 | "source": [ 42 | "df = pd.read_csv('/content/sample_data/colombian_elections.csv')" 43 | ], 44 | "execution_count": null, 45 | "outputs": [] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "metadata": { 50 | "id": "_HGAVlSSri8M" 51 | }, 52 | "source": [ 53 | "df = df[['tweetText','polarity_value']].rename(columns={\"tweetText\":\"texto\",\"polarity_value\":\"sentimiento\"})" 54 | ], 55 | "execution_count": null, 56 | "outputs": [] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "metadata": { 61 | "id": "AVilnlMOrlnb" 62 | }, 63 | "source": [ 64 | "df.head()" 65 | ], 66 | "execution_count": null, 67 | "outputs": [] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "metadata": { 72 | "id": "4ik-aoQQrpO_" 73 | }, 74 | "source": [ 75 | "df['sentimiento'].value_counts()" 76 | ], 77 | "execution_count": null, 78 | "outputs": [] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "metadata": { 83 | "id": "yqzEp7d7rtci" 84 | }, 
85 | "source": [ 86 | "from sklearn.model_selection import train_test_split" 87 | ], 88 | "execution_count": null, 89 | "outputs": [] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "metadata": { 94 | "id": "p5TcTZ3Jrv4X" 95 | }, 96 | "source": [ 97 | "X_train, X_test, y_train,y_test = train_test_split(df.texto,df.sentimiento,random_state = 41)" 98 | ], 99 | "execution_count": null, 100 | "outputs": [] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "metadata": { 105 | "id": "9iAwVXaUryE5" 106 | }, 107 | "source": [ 108 | "X_train" 109 | ], 110 | "execution_count": null, 111 | "outputs": [] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "metadata": { 116 | "id": "clmJfHiUry8f" 117 | }, 118 | "source": [ 119 | "from sklearn.linear_model import LinearRegression\n", 120 | "from sklearn.linear_model import LogisticRegression" 121 | ], 122 | "execution_count": null, 123 | "outputs": [] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "metadata": { 128 | "id": "AzZEsFrvr29B" 129 | }, 130 | "source": [ 131 | "regression = LinearRegression()\n", 132 | "regression.fit(X_train,y_train)\n", 133 | "acc = regression.score(X_test,y_test)\n", 134 | "print(acc)" 135 | ], 136 | "execution_count": null, 137 | "outputs": [] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "metadata": { 142 | "id": "Tn_gHwp3suP0" 143 | }, 144 | "source": [ 145 | "df[2:3]" 146 | ], 147 | "execution_count": null, 148 | "outputs": [] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "metadata": { 153 | "id": "qayfItKAsw4J" 154 | }, 155 | "source": [ 156 | "df['sentiment'] = df['sentimiento'].replace([\"N\",\"P\"],[0,1])" 157 | ], 158 | "execution_count": null, 159 | "outputs": [] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "metadata": { 164 | "id": "580QmbeQsy_e" 165 | }, 166 | "source": [ 167 | "df.shape" 168 | ], 169 | "execution_count": null, 170 | "outputs": [] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "metadata": { 175 | "id": "-YeouQvKs0_E" 176 | }, 177 | "source": [ 178 | 
"df.head()" 179 | ], 180 | "execution_count": null, 181 | "outputs": [] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "metadata": { 186 | "id": "WL_hJo-Ate11" 187 | }, 188 | "source": [ 189 | "from sklearn.feature_extraction.text import CountVectorizer" 190 | ], 191 | "execution_count": null, 192 | "outputs": [] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "metadata": { 197 | "id": "Wp4RJV5ztg3R" 198 | }, 199 | "source": [ 200 | "texto = [\"Las propuestas son buenas\", \"Las propuestas son malas\"]\n", 201 | "\n", 202 | "vectorizer = CountVectorizer(lowercase=False)\n", 203 | "\n", 204 | "bag_of_words = vectorizer.fit_transform(texto)" 205 | ], 206 | "execution_count": null, 207 | "outputs": [] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "metadata": { 212 | "id": "c7siuIvcthlR" 213 | }, 214 | "source": [ 215 | "print(bag_of_words.shape)" 216 | ], 217 | "execution_count": null, 218 | "outputs": [] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "metadata": { 223 | "id": "5bxaOPNEtj0d" 224 | }, 225 | "source": [ 226 | "vectorizer.get_feature_names()" 227 | ], 228 | "execution_count": null, 229 | "outputs": [] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "metadata": { 234 | "id": "n-YN70C_tmyC" 235 | }, 236 | "source": [ 237 | "matrix = pd.DataFrame.sparse.from_spmatrix(bag_of_words, columns = vectorizer.get_feature_names())" 238 | ], 239 | "execution_count": null, 240 | "outputs": [] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "metadata": { 245 | "id": "2KlXL09QtpWb" 246 | }, 247 | "source": [ 248 | "matrix" 249 | ], 250 | "execution_count": null, 251 | "outputs": [] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "metadata": { 256 | "id": "WDrvTZEJvk8g" 257 | }, 258 | "source": [ 259 | "vectorizer2 = CountVectorizer(lowercase=False, max_features = 100)\n", 260 | "\n", 261 | "bag_of_words2 = vectorizer2.fit_transform(df.texto)" 262 | ], 263 | "execution_count": null, 264 | "outputs": [] 265 | }, 266 | { 267 | "cell_type": "code", 268 | 
"metadata": { 269 | "id": "Nk0Wz4yvvnBQ" 270 | }, 271 | "source": [ 272 | "print(bag_of_words2.shape)" 273 | ], 274 | "execution_count": null, 275 | "outputs": [] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "metadata": { 280 | "id": "JPMrZQbwvpU-" 281 | }, 282 | "source": [ 283 | "X_train, X_test, y_train,y_test = train_test_split(bag_of_words2,df.sentiment,random_state = 41)" 284 | ], 285 | "execution_count": null, 286 | "outputs": [] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "metadata": { 291 | "id": "e7RCEk-2vrgX" 292 | }, 293 | "source": [ 294 | "regression = LinearRegression()\n", 295 | "regression.fit(X_train,y_train)\n", 296 | "acc = regression.score(X_test,y_test)\n", 297 | "print(acc)" 298 | ], 299 | "execution_count": null, 300 | "outputs": [] 301 | }, 302 | { 303 | "cell_type": "code", 304 | "metadata": { 305 | "id": "z4OAc1opvuFd" 306 | }, 307 | "source": [ 308 | "regression = LogisticRegression()\n", 309 | "regression.fit(X_train,y_train)\n", 310 | "acc = regression.score(X_test,y_test)\n", 311 | "print(acc)" 312 | ], 313 | "execution_count": null, 314 | "outputs": [] 315 | }, 316 | { 317 | "cell_type": "code", 318 | "metadata": { 319 | "id": "WSk1i52HxTm7" 320 | }, 321 | "source": [ 322 | "def clasificador(texto, columna_texto, categoria):\n", 323 | " vectorizer = CountVectorizer(lowercase=False, max_features = 400)\n", 324 | " bag_of_words = vectorizer.fit_transform(texto[columna_texto])\n", 325 | " X_train, X_test, y_train,y_test = train_test_split(bag_of_words,texto[categoria],random_state = 41)\n", 326 | " regression = LogisticRegression()\n", 327 | " regression.fit(X_train,y_train)\n", 328 | " return regression.score(X_test,y_test)" 329 | ], 330 | "execution_count": null, 331 | "outputs": [] 332 | }, 333 | { 334 | "cell_type": "code", 335 | "metadata": { 336 | "id": "SWRq_PrBxXBA" 337 | }, 338 | "source": [ 339 | "clasificador(df, \"texto\", \"sentiment\")" 340 | ], 341 | "execution_count": null, 342 | "outputs": [] 343 | }, 344 | { 
345 | "cell_type": "code", 346 | "metadata": { 347 | "id": "PLApD0-ax9S2" 348 | }, 349 | "source": [ 350 | "from wordcloud import WordCloud" 351 | ], 352 | "execution_count": null, 353 | "outputs": [] 354 | }, 355 | { 356 | "cell_type": "code", 357 | "metadata": { 358 | "id": "Akv_WP8Jx_wQ" 359 | }, 360 | "source": [ 361 | "palabras = ' '.join([palabras for palabras in df['texto']])" 362 | ], 363 | "execution_count": null, 364 | "outputs": [] 365 | }, 366 | { 367 | "cell_type": "code", 368 | "metadata": { 369 | "id": "ohhipZYbyCBs" 370 | }, 371 | "source": [ 372 | "palabras" 373 | ], 374 | "execution_count": null, 375 | "outputs": [] 376 | }, 377 | { 378 | "cell_type": "code", 379 | "metadata": { 380 | "id": "RS0fdGKdyEL2" 381 | }, 382 | "source": [ 383 | "len(palabras)" 384 | ], 385 | "execution_count": null, 386 | "outputs": [] 387 | }, 388 | { 389 | "cell_type": "code", 390 | "metadata": { 391 | "id": "W18Mv1_gyHPU" 392 | }, 393 | "source": [ 394 | "import matplotlib.pyplot as plt\n", 395 | "%matplotlib inline \n", 396 | "\n", 397 | "def plot_cloud(wordcloud):\n", 398 | " plt.figure(figsize=(10,5))\n", 399 | " plt.imshow(wordcloud, interpolation = 'bilinear')\n", 400 | " plt.axis('off')" 401 | ], 402 | "execution_count": null, 403 | "outputs": [] 404 | }, 405 | { 406 | "cell_type": "code", 407 | "metadata": { 408 | "id": "EaKziQPzyKS9" 409 | }, 410 | "source": [ 411 | "wordcloud = WordCloud(width=800, height=500, collocations=False).generate(palabras)\n", 412 | "plot_cloud(wordcloud)" 413 | ], 414 | "execution_count": null, 415 | "outputs": [] 416 | }, 417 | { 418 | "cell_type": "code", 419 | "metadata": { 420 | "id": "8GV4mEBK2N1u" 421 | }, 422 | "source": [ 423 | "df_pos = df.query(\"sentimiento == 'P'\")" 424 | ], 425 | "execution_count": null, 426 | "outputs": [] 427 | }, 428 | { 429 | "cell_type": "code", 430 | "metadata": { 431 | "id": "KG8ydxdZ2OYh" 432 | }, 433 | "source": [ 434 | "df_pos.head()" 435 | ], 436 | "execution_count": null, 437 | "outputs": 
[] 438 | }, 439 | { 440 | "cell_type": "code", 441 | "metadata": { 442 | "id": "GdU3bMPg2Qnl" 443 | }, 444 | "source": [ 445 | "def plot_cloud_neg(texto):  # word cloud of the negative ('N') rows of the DataFrame `texto`\n", 446 | " df_neg = texto.query(\"sentimiento == 'N'\")  # filter the argument, not the global df\n", 447 | " palabras = ' '.join(df_neg['texto'])  # one corpus string for WordCloud\n", 448 | " wordcloud = WordCloud(width=800, height=500, collocations=False).generate(palabras)\n", 449 | " plt.figure(figsize=(10,5))\n", 450 | " plt.imshow(wordcloud, interpolation = 'bilinear')\n", 451 | " plt.axis('off')" 452 | ], 453 | "execution_count": null, 454 | "outputs": [] 455 | }, 456 | { 457 | "cell_type": "code", 458 | "metadata": { 459 | "id": "FfoFWgou2TS6" 460 | }, 461 | "source": [ 462 | "def plot_cloud_pos(texto):  # word cloud of the positive ('P') rows of the DataFrame `texto`\n", 463 | " df_pos = texto.query(\"sentimiento == 'P'\")  # filter the argument, not the global df\n", 464 | " palabras = ' '.join(df_pos['texto'])  # one corpus string for WordCloud\n", 465 | " wordcloud = WordCloud(width=800, height=500, collocations=False).generate(palabras)\n", 466 | " plt.figure(figsize=(10,5))\n", 467 | " plt.imshow(wordcloud, interpolation = 'bilinear')\n", 468 | " plt.axis('off')" 469 | ], 470 | "execution_count": null, 471 | "outputs": [] 472 | }, 473 | { 474 | "cell_type": "code", 475 | "metadata": { 476 | "id": "kx_cEVie34YC" 477 | }, 478 | "source": [ 479 | "plot_cloud_neg(df)" 480 | ], 481 | "execution_count": null, 482 | "outputs": [] 483 | }, 484 | { 485 | "cell_type": "code", 486 | "metadata": { 487 | "id": "DGQCXBe_36xK" 488 | }, 489 | "source": [ 490 | "plot_cloud_pos(df)" 491 | ], 492 | "execution_count": null, 493 | "outputs": [] 494 | }, 495 | { 496 | "cell_type": "code", 497 | "metadata": { 498 | "id": "gs4YMc8z43T6" 499 | }, 500 | "source": [ 501 | "import nltk \n", 502 | "\n", 503 | "frase =['el candidato es bueno', 'el candidato es malo']\n", 504 | "\n", 505 | "frequencia = nltk.FreqDist(frase)" 506 | ], 507 | "execution_count": null, 508 | "outputs": [] 509 | }, 510 | { 511 | "cell_type": "code", 512 | "metadata": { 513 | "id": "n83k7XwW4-Hb" 514 | }, 515 |
"source": [ 516 | "frequencia" 517 | ], 518 | "execution_count": null, 519 | "outputs": [] 520 | } 521 | ] 522 | } -------------------------------------------------------------------------------- /aula4.2/aula4_2.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "aula4.2.ipynb", 7 | "provenance": [], 8 | "collapsed_sections": [] 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | }, 14 | "language_info": { 15 | "name": "python" 16 | } 17 | }, 18 | "cells": [ 19 | { 20 | "cell_type": "code", 21 | "metadata": { 22 | "id": "dr-kjMPUrfCn" 23 | }, 24 | "source": [ 25 | "import pandas as pd\n", 26 | "pd.set_option('display.max_rows', None)\n", 27 | "pd.set_option('display.max_columns', None)\n", 28 | "pd.set_option('display.width', None)\n", 29 | "pd.set_option('display.max_colwidth', None)\n", 30 | "import warnings\n", 31 | "warnings.filterwarnings(\"ignore\")" 32 | ], 33 | "execution_count": null, 34 | "outputs": [] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "metadata": { 39 | "id": "M8NSaAocriEo" 40 | }, 41 | "source": [ 42 | "df = pd.read_csv('/content/sample_data/colombian_elections.csv')" 43 | ], 44 | "execution_count": null, 45 | "outputs": [] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "metadata": { 50 | "id": "_HGAVlSSri8M" 51 | }, 52 | "source": [ 53 | "df = df[['tweetText','polarity_value']].rename(columns={\"tweetText\":\"texto\",\"polarity_value\":\"sentimiento\"})" 54 | ], 55 | "execution_count": null, 56 | "outputs": [] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "metadata": { 61 | "id": "AVilnlMOrlnb" 62 | }, 63 | "source": [ 64 | "df.head()" 65 | ], 66 | "execution_count": null, 67 | "outputs": [] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "metadata": { 72 | "id": "4ik-aoQQrpO_" 73 | }, 74 | "source": [ 75 | "df['sentimiento'].value_counts()" 76 | ], 77 | "execution_count": 
null, 78 | "outputs": [] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "metadata": { 83 | "id": "yqzEp7d7rtci" 84 | }, 85 | "source": [ 86 | "from sklearn.model_selection import train_test_split" 87 | ], 88 | "execution_count": null, 89 | "outputs": [] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "metadata": { 94 | "id": "p5TcTZ3Jrv4X" 95 | }, 96 | "source": [ 97 | "X_train, X_test, y_train,y_test = train_test_split(df.texto,df.sentimiento,random_state = 41)" 98 | ], 99 | "execution_count": null, 100 | "outputs": [] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "metadata": { 105 | "id": "9iAwVXaUryE5" 106 | }, 107 | "source": [ 108 | "X_train" 109 | ], 110 | "execution_count": null, 111 | "outputs": [] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "metadata": { 116 | "id": "clmJfHiUry8f" 117 | }, 118 | "source": [ 119 | "from sklearn.linear_model import LinearRegression\n", 120 | "from sklearn.linear_model import LogisticRegression" 121 | ], 122 | "execution_count": null, 123 | "outputs": [] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "metadata": { 128 | "id": "AzZEsFrvr29B" 129 | }, 130 | "source": [ 131 | "regression = LinearRegression()\n", 132 | "regression.fit(X_train,y_train)\n", 133 | "acc = regression.score(X_test,y_test)\n", 134 | "print(acc)" 135 | ], 136 | "execution_count": null, 137 | "outputs": [] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "metadata": { 142 | "id": "Tn_gHwp3suP0" 143 | }, 144 | "source": [ 145 | "df[2:3]" 146 | ], 147 | "execution_count": null, 148 | "outputs": [] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "metadata": { 153 | "id": "qayfItKAsw4J" 154 | }, 155 | "source": [ 156 | "df['sentiment'] = df['sentimiento'].replace([\"N\",\"P\"],[0,1])" 157 | ], 158 | "execution_count": null, 159 | "outputs": [] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "metadata": { 164 | "id": "580QmbeQsy_e" 165 | }, 166 | "source": [ 167 | "df.shape" 168 | ], 169 | "execution_count": null, 170 | "outputs": [] 171 | }, 
172 | { 173 | "cell_type": "code", 174 | "metadata": { 175 | "id": "-YeouQvKs0_E" 176 | }, 177 | "source": [ 178 | "df.head()" 179 | ], 180 | "execution_count": null, 181 | "outputs": [] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "metadata": { 186 | "id": "WL_hJo-Ate11" 187 | }, 188 | "source": [ 189 | "from sklearn.feature_extraction.text import CountVectorizer" 190 | ], 191 | "execution_count": null, 192 | "outputs": [] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "metadata": { 197 | "id": "Wp4RJV5ztg3R" 198 | }, 199 | "source": [ 200 | "texto = [\"Las propuestas son buenas\", \"Las propuestas son malas\"]\n", 201 | "\n", 202 | "vectorizer = CountVectorizer(lowercase=False)\n", 203 | "\n", 204 | "bag_of_words = vectorizer.fit_transform(texto)" 205 | ], 206 | "execution_count": null, 207 | "outputs": [] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "metadata": { 212 | "id": "c7siuIvcthlR" 213 | }, 214 | "source": [ 215 | "print(bag_of_words.shape)" 216 | ], 217 | "execution_count": null, 218 | "outputs": [] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "metadata": { 223 | "id": "5bxaOPNEtj0d" 224 | }, 225 | "source": [ 226 | "vectorizer.get_feature_names_out()  # get_feature_names() was removed in scikit-learn 1.2" 227 | ], 228 | "execution_count": null, 229 | "outputs": [] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "metadata": { 234 | "id": "n-YN70C_tmyC" 235 | }, 236 | "source": [ 237 | "matrix = pd.DataFrame.sparse.from_spmatrix(bag_of_words, columns = vectorizer.get_feature_names_out())" 238 | ], 239 | "execution_count": null, 240 | "outputs": [] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "metadata": { 245 | "id": "2KlXL09QtpWb" 246 | }, 247 | "source": [ 248 | "matrix" 249 | ], 250 | "execution_count": null, 251 | "outputs": [] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "metadata": { 256 | "id": "WDrvTZEJvk8g" 257 | }, 258 | "source": [ 259 | "vectorizer2 = CountVectorizer(lowercase=False, max_features = 100)\n", 260 | "\n", 261 | "bag_of_words2 =
vectorizer2.fit_transform(df.texto)" 262 | ], 263 | "execution_count": null, 264 | "outputs": [] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "metadata": { 269 | "id": "Nk0Wz4yvvnBQ" 270 | }, 271 | "source": [ 272 | "print(bag_of_words2.shape)" 273 | ], 274 | "execution_count": null, 275 | "outputs": [] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "metadata": { 280 | "id": "JPMrZQbwvpU-" 281 | }, 282 | "source": [ 283 | "X_train, X_test, y_train,y_test = train_test_split(bag_of_words2,df.sentiment,random_state = 41)" 284 | ], 285 | "execution_count": null, 286 | "outputs": [] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "metadata": { 291 | "id": "e7RCEk-2vrgX" 292 | }, 293 | "source": [ 294 | "regression = LinearRegression()\n", 295 | "regression.fit(X_train,y_train)\n", 296 | "acc = regression.score(X_test,y_test)\n", 297 | "print(acc)" 298 | ], 299 | "execution_count": null, 300 | "outputs": [] 301 | }, 302 | { 303 | "cell_type": "code", 304 | "metadata": { 305 | "id": "z4OAc1opvuFd" 306 | }, 307 | "source": [ 308 | "regression = LogisticRegression()\n", 309 | "regression.fit(X_train,y_train)\n", 310 | "acc = regression.score(X_test,y_test)\n", 311 | "print(acc)" 312 | ], 313 | "execution_count": null, 314 | "outputs": [] 315 | }, 316 | { 317 | "cell_type": "code", 318 | "metadata": { 319 | "id": "WSk1i52HxTm7" 320 | }, 321 | "source": [ 322 | "def clasificador(texto, columna_texto, categoria):\n", 323 | " vectorizer = CountVectorizer(lowercase=False, max_features = 400)\n", 324 | " bag_of_words = vectorizer.fit_transform(texto[columna_texto])\n", 325 | " X_train, X_test, y_train,y_test = train_test_split(bag_of_words,texto[categoria],random_state = 41)\n", 326 | " regression = LogisticRegression()\n", 327 | " regression.fit(X_train,y_train)\n", 328 | " return regression.score(X_test,y_test)" 329 | ], 330 | "execution_count": null, 331 | "outputs": [] 332 | }, 333 | { 334 | "cell_type": "code", 335 | "metadata": { 336 | "id": "SWRq_PrBxXBA" 337 | 
}, 338 | "source": [ 339 | "clasificador(df, \"texto\", \"sentiment\")" 340 | ], 341 | "execution_count": null, 342 | "outputs": [] 343 | }, 344 | { 345 | "cell_type": "code", 346 | "metadata": { 347 | "id": "PLApD0-ax9S2" 348 | }, 349 | "source": [ 350 | "from wordcloud import WordCloud" 351 | ], 352 | "execution_count": null, 353 | "outputs": [] 354 | }, 355 | { 356 | "cell_type": "code", 357 | "metadata": { 358 | "id": "Akv_WP8Jx_wQ" 359 | }, 360 | "source": [ 361 | "palabras = ' '.join([palabras for palabras in df['texto']])" 362 | ], 363 | "execution_count": null, 364 | "outputs": [] 365 | }, 366 | { 367 | "cell_type": "code", 368 | "metadata": { 369 | "id": "ohhipZYbyCBs" 370 | }, 371 | "source": [ 372 | "palabras" 373 | ], 374 | "execution_count": null, 375 | "outputs": [] 376 | }, 377 | { 378 | "cell_type": "code", 379 | "metadata": { 380 | "id": "RS0fdGKdyEL2" 381 | }, 382 | "source": [ 383 | "len(palabras)" 384 | ], 385 | "execution_count": null, 386 | "outputs": [] 387 | }, 388 | { 389 | "cell_type": "code", 390 | "metadata": { 391 | "id": "W18Mv1_gyHPU" 392 | }, 393 | "source": [ 394 | "import matplotlib.pyplot as plt\n", 395 | "%matplotlib inline \n", 396 | "\n", 397 | "def plot_cloud(wordcloud):\n", 398 | " plt.figure(figsize=(10,5))\n", 399 | " plt.imshow(wordcloud, interpolation = 'bilinear')\n", 400 | " plt.axis('off')" 401 | ], 402 | "execution_count": null, 403 | "outputs": [] 404 | }, 405 | { 406 | "cell_type": "code", 407 | "metadata": { 408 | "id": "EaKziQPzyKS9" 409 | }, 410 | "source": [ 411 | "wordcloud = WordCloud(width=800, height=500, collocations=False).generate(palabras)\n", 412 | "plot_cloud(wordcloud)" 413 | ], 414 | "execution_count": null, 415 | "outputs": [] 416 | }, 417 | { 418 | "cell_type": "code", 419 | "metadata": { 420 | "id": "8GV4mEBK2N1u" 421 | }, 422 | "source": [ 423 | "df_pos = df.query(\"sentimiento == 'P'\")" 424 | ], 425 | "execution_count": null, 426 | "outputs": [] 427 | }, 428 | { 429 | "cell_type": "code", 430 | 
"metadata": { 431 | "id": "KG8ydxdZ2OYh" 432 | }, 433 | "source": [ 434 | "df_pos.head()" 435 | ], 436 | "execution_count": null, 437 | "outputs": [] 438 | }, 439 | { 440 | "cell_type": "code", 441 | "metadata": { 442 | "id": "GdU3bMPg2Qnl" 443 | }, 444 | "source": [ 445 | "def plot_cloud_neg(texto):  # word cloud of the negative ('N') rows of the DataFrame `texto`\n", 446 | " df_neg = texto.query(\"sentimiento == 'N'\")  # filter the argument, not the global df\n", 447 | " palabras = ' '.join(df_neg['texto'])  # one corpus string for WordCloud\n", 448 | " wordcloud = WordCloud(width=800, height=500, collocations=False).generate(palabras)\n", 449 | " plt.figure(figsize=(10,5))\n", 450 | " plt.imshow(wordcloud, interpolation = 'bilinear')\n", 451 | " plt.axis('off')" 452 | ], 453 | "execution_count": null, 454 | "outputs": [] 455 | }, 456 | { 457 | "cell_type": "code", 458 | "metadata": { 459 | "id": "FfoFWgou2TS6" 460 | }, 461 | "source": [ 462 | "def plot_cloud_pos(texto):  # word cloud of the positive ('P') rows of the DataFrame `texto`\n", 463 | " df_pos = texto.query(\"sentimiento == 'P'\")  # filter the argument, not the global df\n", 464 | " palabras = ' '.join(df_pos['texto'])  # one corpus string for WordCloud\n", 465 | " wordcloud = WordCloud(width=800, height=500, collocations=False).generate(palabras)\n", 466 | " plt.figure(figsize=(10,5))\n", 467 | " plt.imshow(wordcloud, interpolation = 'bilinear')\n", 468 | " plt.axis('off')" 469 | ], 470 | "execution_count": null, 471 | "outputs": [] 472 | }, 473 | { 474 | "cell_type": "code", 475 | "metadata": { 476 | "id": "kx_cEVie34YC" 477 | }, 478 | "source": [ 479 | "plot_cloud_neg(df)" 480 | ], 481 | "execution_count": null, 482 | "outputs": [] 483 | }, 484 | { 485 | "cell_type": "code", 486 | "metadata": { 487 | "id": "DGQCXBe_36xK" 488 | }, 489 | "source": [ 490 | "plot_cloud_pos(df)" 491 | ], 492 | "execution_count": null, 493 | "outputs": [] 494 | }, 495 | { 496 | "cell_type": "code", 497 | "metadata": { 498 | "id": "gs4YMc8z43T6" 499 | }, 500 | "source": [ 501 | "import nltk \n", 502 | "\n", 503 | "frase =['el candidato es bueno', 'el candidato es malo']\n", 504 | "\n", 505 | "frequencia = nltk.FreqDist(frase)" 506 | ], 507 |
"execution_count": null, 508 | "outputs": [] 509 | }, 510 | { 511 | "cell_type": "code", 512 | "metadata": { 513 | "id": "n83k7XwW4-Hb" 514 | }, 515 | "source": [ 516 | "frequencia" 517 | ], 518 | "execution_count": null, 519 | "outputs": [] 520 | }, 521 | { 522 | "cell_type": "code", 523 | "metadata": { 524 | "id": "ne-BxXgz57Dr" 525 | }, 526 | "source": [ 527 | "from nltk import tokenize" 528 | ], 529 | "execution_count": null, 530 | "outputs": [] 531 | }, 532 | { 533 | "cell_type": "code", 534 | "metadata": { 535 | "id": "_eSYNSIN59ej" 536 | }, 537 | "source": [ 538 | "frase2 = 'el candidato quiere aprender LNP!'\n", 539 | "\n", 540 | "token_espacio = tokenize.WhitespaceTokenizer()\n", 541 | "\n", 542 | "frase_token = token_espacio.tokenize(frase2)" 543 | ], 544 | "execution_count": null, 545 | "outputs": [] 546 | }, 547 | { 548 | "cell_type": "code", 549 | "metadata": { 550 | "id": "9J2cg7wO6Ayu" 551 | }, 552 | "source": [ 553 | "print(frase_token)" 554 | ], 555 | "execution_count": null, 556 | "outputs": [] 557 | } 558 | ] 559 | } -------------------------------------------------------------------------------- /aula4.3/aula4_3.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "aula4.3.ipynb", 7 | "provenance": [], 8 | "collapsed_sections": [] 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | }, 14 | "language_info": { 15 | "name": "python" 16 | } 17 | }, 18 | "cells": [ 19 | { 20 | "cell_type": "code", 21 | "metadata": { 22 | "id": "dr-kjMPUrfCn" 23 | }, 24 | "source": [ 25 | "import pandas as pd\n", 26 | "pd.set_option('display.max_rows', None)\n", 27 | "pd.set_option('display.max_columns', None)\n", 28 | "pd.set_option('display.width', None)\n", 29 | "pd.set_option('display.max_colwidth', None)\n", 30 | "import warnings\n", 31 | "warnings.filterwarnings(\"ignore\")" 32 | ], 33 | 
"execution_count": null, 34 | "outputs": [] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "metadata": { 39 | "id": "M8NSaAocriEo" 40 | }, 41 | "source": [ 42 | "df = pd.read_csv('/content/sample_data/colombian_elections.csv')" 43 | ], 44 | "execution_count": null, 45 | "outputs": [] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "metadata": { 50 | "id": "_HGAVlSSri8M" 51 | }, 52 | "source": [ 53 | "df = df[['tweetText','polarity_value']].rename(columns={\"tweetText\":\"texto\",\"polarity_value\":\"sentimiento\"})" 54 | ], 55 | "execution_count": null, 56 | "outputs": [] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "metadata": { 61 | "id": "AVilnlMOrlnb" 62 | }, 63 | "source": [ 64 | "df.head()" 65 | ], 66 | "execution_count": null, 67 | "outputs": [] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "metadata": { 72 | "id": "4ik-aoQQrpO_" 73 | }, 74 | "source": [ 75 | "df['sentimiento'].value_counts()" 76 | ], 77 | "execution_count": null, 78 | "outputs": [] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "metadata": { 83 | "id": "yqzEp7d7rtci" 84 | }, 85 | "source": [ 86 | "from sklearn.model_selection import train_test_split" 87 | ], 88 | "execution_count": null, 89 | "outputs": [] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "metadata": { 94 | "id": "p5TcTZ3Jrv4X" 95 | }, 96 | "source": [ 97 | "X_train, X_test, y_train,y_test = train_test_split(df.texto,df.sentimiento,random_state = 41)" 98 | ], 99 | "execution_count": null, 100 | "outputs": [] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "metadata": { 105 | "id": "9iAwVXaUryE5" 106 | }, 107 | "source": [ 108 | "X_train" 109 | ], 110 | "execution_count": null, 111 | "outputs": [] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "metadata": { 116 | "id": "clmJfHiUry8f" 117 | }, 118 | "source": [ 119 | "from sklearn.linear_model import LinearRegression\n", 120 | "from sklearn.linear_model import LogisticRegression" 121 | ], 122 | "execution_count": null, 123 | "outputs": [] 124 | }, 125 | { 126 | "cell_type": 
"code", 127 | "metadata": { 128 | "id": "AzZEsFrvr29B" 129 | }, 130 | "source": [ 131 | "regression = LinearRegression()\n", 132 | "regression.fit(X_train,y_train)\n", 133 | "acc = regression.score(X_test,y_test)\n", 134 | "print(acc)" 135 | ], 136 | "execution_count": null, 137 | "outputs": [] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "metadata": { 142 | "id": "Tn_gHwp3suP0" 143 | }, 144 | "source": [ 145 | "df[2:3]" 146 | ], 147 | "execution_count": null, 148 | "outputs": [] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "metadata": { 153 | "id": "qayfItKAsw4J" 154 | }, 155 | "source": [ 156 | "df['sentiment'] = df['sentimiento'].replace([\"N\",\"P\"],[0,1])" 157 | ], 158 | "execution_count": null, 159 | "outputs": [] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "metadata": { 164 | "id": "580QmbeQsy_e" 165 | }, 166 | "source": [ 167 | "df.shape" 168 | ], 169 | "execution_count": null, 170 | "outputs": [] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "metadata": { 175 | "id": "-YeouQvKs0_E" 176 | }, 177 | "source": [ 178 | "df.head()" 179 | ], 180 | "execution_count": null, 181 | "outputs": [] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "metadata": { 186 | "id": "WL_hJo-Ate11" 187 | }, 188 | "source": [ 189 | "from sklearn.feature_extraction.text import CountVectorizer" 190 | ], 191 | "execution_count": null, 192 | "outputs": [] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "metadata": { 197 | "id": "Wp4RJV5ztg3R" 198 | }, 199 | "source": [ 200 | "texto = [\"Las propuestas son buenas\", \"Las propuestas son malas\"]\n", 201 | "\n", 202 | "vectorizer = CountVectorizer(lowercase=False)\n", 203 | "\n", 204 | "bag_of_words = vectorizer.fit_transform(texto)" 205 | ], 206 | "execution_count": null, 207 | "outputs": [] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "metadata": { 212 | "id": "c7siuIvcthlR" 213 | }, 214 | "source": [ 215 | "print(bag_of_words.shape)" 216 | ], 217 | "execution_count": null, 218 | "outputs": [] 219 | }, 220 | 
{ 221 | "cell_type": "code", 222 | "metadata": { 223 | "id": "5bxaOPNEtj0d" 224 | }, 225 | "source": [ 226 | "vectorizer.get_feature_names_out()  # get_feature_names() was removed in scikit-learn 1.2" 227 | ], 228 | "execution_count": null, 229 | "outputs": [] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "metadata": { 234 | "id": "n-YN70C_tmyC" 235 | }, 236 | "source": [ 237 | "matrix = pd.DataFrame.sparse.from_spmatrix(bag_of_words, columns = vectorizer.get_feature_names_out())" 238 | ], 239 | "execution_count": null, 240 | "outputs": [] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "metadata": { 245 | "id": "2KlXL09QtpWb" 246 | }, 247 | "source": [ 248 | "matrix" 249 | ], 250 | "execution_count": null, 251 | "outputs": [] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "metadata": { 256 | "id": "WDrvTZEJvk8g" 257 | }, 258 | "source": [ 259 | "vectorizer2 = CountVectorizer(lowercase=False, max_features = 100)\n", 260 | "\n", 261 | "bag_of_words2 = vectorizer2.fit_transform(df.texto)" 262 | ], 263 | "execution_count": null, 264 | "outputs": [] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "metadata": { 269 | "id": "Nk0Wz4yvvnBQ" 270 | }, 271 | "source": [ 272 | "print(bag_of_words2.shape)" 273 | ], 274 | "execution_count": null, 275 | "outputs": [] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "metadata": { 280 | "id": "JPMrZQbwvpU-" 281 | }, 282 | "source": [ 283 | "X_train, X_test, y_train,y_test = train_test_split(bag_of_words2,df.sentiment,random_state = 41)" 284 | ], 285 | "execution_count": null, 286 | "outputs": [] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "metadata": { 291 | "id": "e7RCEk-2vrgX" 292 | }, 293 | "source": [ 294 | "regression = LinearRegression()\n", 295 | "regression.fit(X_train,y_train)\n", 296 | "acc = regression.score(X_test,y_test)\n", 297 | "print(acc)" 298 | ], 299 | "execution_count": null, 300 | "outputs": [] 301 | }, 302 | { 303 | "cell_type": "code", 304 | "metadata": { 305 | "id": "z4OAc1opvuFd" 306 | }, 307 | "source": [ 308 | "regression = LogisticRegression()\n",
309 | "regression.fit(X_train,y_train)\n", 310 | "acc = regression.score(X_test,y_test)\n", 311 | "print(acc)" 312 | ], 313 | "execution_count": null, 314 | "outputs": [] 315 | }, 316 | { 317 | "cell_type": "code", 318 | "metadata": { 319 | "id": "WSk1i52HxTm7" 320 | }, 321 | "source": [ 322 | "def clasificador(texto, columna_texto, categoria):\n", 323 | " vectorizer = CountVectorizer(lowercase=False, max_features = 400)\n", 324 | " bag_of_words = vectorizer.fit_transform(texto[columna_texto])\n", 325 | " X_train, X_test, y_train,y_test = train_test_split(bag_of_words,texto[categoria],random_state = 41)\n", 326 | " regression = LogisticRegression()\n", 327 | " regression.fit(X_train,y_train)\n", 328 | " return regression.score(X_test,y_test)" 329 | ], 330 | "execution_count": null, 331 | "outputs": [] 332 | }, 333 | { 334 | "cell_type": "code", 335 | "metadata": { 336 | "id": "SWRq_PrBxXBA" 337 | }, 338 | "source": [ 339 | "clasificador(df, \"texto\", \"sentiment\")" 340 | ], 341 | "execution_count": null, 342 | "outputs": [] 343 | }, 344 | { 345 | "cell_type": "code", 346 | "metadata": { 347 | "id": "PLApD0-ax9S2" 348 | }, 349 | "source": [ 350 | "from wordcloud import WordCloud" 351 | ], 352 | "execution_count": null, 353 | "outputs": [] 354 | }, 355 | { 356 | "cell_type": "code", 357 | "metadata": { 358 | "id": "Akv_WP8Jx_wQ" 359 | }, 360 | "source": [ 361 | "palabras = ' '.join([palabras for palabras in df['texto']])" 362 | ], 363 | "execution_count": null, 364 | "outputs": [] 365 | }, 366 | { 367 | "cell_type": "code", 368 | "metadata": { 369 | "id": "ohhipZYbyCBs" 370 | }, 371 | "source": [ 372 | "palabras" 373 | ], 374 | "execution_count": null, 375 | "outputs": [] 376 | }, 377 | { 378 | "cell_type": "code", 379 | "metadata": { 380 | "id": "RS0fdGKdyEL2" 381 | }, 382 | "source": [ 383 | "len(palabras)" 384 | ], 385 | "execution_count": null, 386 | "outputs": [] 387 | }, 388 | { 389 | "cell_type": "code", 390 | "metadata": { 391 | "id": "W18Mv1_gyHPU" 392 | 
}, 393 | "source": [ 394 | "import matplotlib.pyplot as plt\n", 395 | "%matplotlib inline \n", 396 | "\n", 397 | "def plot_cloud(wordcloud):\n", 398 | " plt.figure(figsize=(10,5))\n", 399 | " plt.imshow(wordcloud, interpolation = 'bilinear')\n", 400 | " plt.axis('off')" 401 | ], 402 | "execution_count": null, 403 | "outputs": [] 404 | }, 405 | { 406 | "cell_type": "code", 407 | "metadata": { 408 | "id": "EaKziQPzyKS9" 409 | }, 410 | "source": [ 411 | "wordcloud = WordCloud(width=800, height=500, collocations=False).generate(palabras)\n", 412 | "plot_cloud(wordcloud)" 413 | ], 414 | "execution_count": null, 415 | "outputs": [] 416 | }, 417 | { 418 | "cell_type": "code", 419 | "metadata": { 420 | "id": "8GV4mEBK2N1u" 421 | }, 422 | "source": [ 423 | "df_pos = df.query(\"sentimiento == 'P'\")" 424 | ], 425 | "execution_count": null, 426 | "outputs": [] 427 | }, 428 | { 429 | "cell_type": "code", 430 | "metadata": { 431 | "id": "KG8ydxdZ2OYh" 432 | }, 433 | "source": [ 434 | "df_pos.head()" 435 | ], 436 | "execution_count": null, 437 | "outputs": [] 438 | }, 439 | { 440 | "cell_type": "code", 441 | "metadata": { 442 | "id": "GdU3bMPg2Qnl" 443 | }, 444 | "source": [ 445 | "def plot_cloud_neg(texto):  # word cloud of the negative ('N') rows of the DataFrame `texto`\n", 446 | " df_neg = texto.query(\"sentimiento == 'N'\")  # filter the argument, not the global df\n", 447 | " palabras = ' '.join(df_neg['texto'])  # one corpus string for WordCloud\n", 448 | " wordcloud = WordCloud(width=800, height=500, collocations=False).generate(palabras)\n", 449 | " plt.figure(figsize=(10,5))\n", 450 | " plt.imshow(wordcloud, interpolation = 'bilinear')\n", 451 | " plt.axis('off')" 452 | ], 453 | "execution_count": null, 454 | "outputs": [] 455 | }, 456 | { 457 | "cell_type": "code", 458 | "metadata": { 459 | "id": "FfoFWgou2TS6" 460 | }, 461 | "source": [ 462 | "def plot_cloud_pos(texto):  # word cloud of the positive ('P') rows of the DataFrame `texto`\n", 463 | " df_pos = texto.query(\"sentimiento == 'P'\")  # filter the argument, not the global df\n", 464 | " palabras = ' '.join(df_pos['texto'])  # one corpus string for WordCloud\n", 465 | " wordcloud = WordCloud(width=800, height=500,
collocations=False).generate(palabras)\n", 466 | " plt.figure(figsize=(10,5))\n", 467 | " plt.imshow(wordcloud, interpolation = 'bilinear')\n", 468 | " plt.axis('off')" 469 | ], 470 | "execution_count": null, 471 | "outputs": [] 472 | }, 473 | { 474 | "cell_type": "code", 475 | "metadata": { 476 | "id": "kx_cEVie34YC" 477 | }, 478 | "source": [ 479 | "plot_cloud_neg(df)" 480 | ], 481 | "execution_count": null, 482 | "outputs": [] 483 | }, 484 | { 485 | "cell_type": "code", 486 | "metadata": { 487 | "id": "DGQCXBe_36xK" 488 | }, 489 | "source": [ 490 | "plot_cloud_pos(df)" 491 | ], 492 | "execution_count": null, 493 | "outputs": [] 494 | }, 495 | { 496 | "cell_type": "code", 497 | "metadata": { 498 | "id": "gs4YMc8z43T6" 499 | }, 500 | "source": [ 501 | "import nltk \n", 502 | "\n", 503 | "frase =['el candidato es bueno', 'el candidato es malo']\n", 504 | "\n", 505 | "frequencia = nltk.FreqDist(frase)" 506 | ], 507 | "execution_count": null, 508 | "outputs": [] 509 | }, 510 | { 511 | "cell_type": "code", 512 | "metadata": { 513 | "id": "n83k7XwW4-Hb" 514 | }, 515 | "source": [ 516 | "frequencia" 517 | ], 518 | "execution_count": null, 519 | "outputs": [] 520 | }, 521 | { 522 | "cell_type": "code", 523 | "metadata": { 524 | "id": "ne-BxXgz57Dr" 525 | }, 526 | "source": [ 527 | "from nltk import tokenize" 528 | ], 529 | "execution_count": null, 530 | "outputs": [] 531 | }, 532 | { 533 | "cell_type": "code", 534 | "metadata": { 535 | "id": "_eSYNSIN59ej" 536 | }, 537 | "source": [ 538 | "frase2 = 'el candidato quiere aprender LNP!'\n", 539 | "\n", 540 | "token_espacio = tokenize.WhitespaceTokenizer()\n", 541 | "\n", 542 | "frase_token = token_espacio.tokenize(frase2)" 543 | ], 544 | "execution_count": null, 545 | "outputs": [] 546 | }, 547 | { 548 | "cell_type": "code", 549 | "metadata": { 550 | "id": "9J2cg7wO6Ayu" 551 | }, 552 | "source": [ 553 | "print(frase_token)" 554 | ], 555 | "execution_count": null, 556 | "outputs": [] 557 | }, 558 | { 559 | "cell_type":
"code", 560 | "metadata": { 561 | "id": "9XTPQNph6c8y" 562 | }, 563 | "source": [ 564 | "frase_token = token_espacio.tokenize(palabras)\n", 565 | "\n", 566 | "frequencia = nltk.FreqDist(frase_token)\n", 567 | "\n", 568 | "df_frequencia = pd.DataFrame({'palabra':list(frequencia.keys()), 'frequencia':list(frequencia.values())})" 569 | ], 570 | "execution_count": null, 571 | "outputs": [] 572 | }, 573 | { 574 | "cell_type": "code", 575 | "metadata": { 576 | "id": "mSTKKEOE6g3b" 577 | }, 578 | "source": [ 579 | "df_frequencia.head()" 580 | ], 581 | "execution_count": null, 582 | "outputs": [] 583 | } 584 | ] 585 | } -------------------------------------------------------------------------------- /aula5.1/aula5_1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "aula5.1.ipynb", 7 | "provenance": [], 8 | "collapsed_sections": [] 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | }, 14 | "language_info": { 15 | "name": "python" 16 | } 17 | }, 18 | "cells": [ 19 | { 20 | "cell_type": "code", 21 | "metadata": { 22 | "id": "dr-kjMPUrfCn" 23 | }, 24 | "source": [ 25 | "import pandas as pd\n", 26 | "pd.set_option('display.max_rows', None)\n", 27 | "pd.set_option('display.max_columns', None)\n", 28 | "pd.set_option('display.width', None)\n", 29 | "pd.set_option('display.max_colwidth', None)\n", 30 | "import warnings\n", 31 | "warnings.filterwarnings(\"ignore\")" 32 | ], 33 | "execution_count": null, 34 | "outputs": [] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "metadata": { 39 | "id": "M8NSaAocriEo" 40 | }, 41 | "source": [ 42 | "df = pd.read_csv('/content/sample_data/colombian_elections.csv')" 43 | ], 44 | "execution_count": null, 45 | "outputs": [] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "metadata": { 50 | "id": "_HGAVlSSri8M" 51 | }, 52 | "source": [ 53 | "df = 
df[['tweetText','polarity_value']].rename(columns={\"tweetText\":\"texto\",\"polarity_value\":\"sentimiento\"})" 54 | ], 55 | "execution_count": null, 56 | "outputs": [] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "metadata": { 61 | "id": "AVilnlMOrlnb" 62 | }, 63 | "source": [ 64 | "df.head()" 65 | ], 66 | "execution_count": null, 67 | "outputs": [] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "metadata": { 72 | "id": "4ik-aoQQrpO_" 73 | }, 74 | "source": [ 75 | "df['sentimiento'].value_counts()" 76 | ], 77 | "execution_count": null, 78 | "outputs": [] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "metadata": { 83 | "id": "yqzEp7d7rtci" 84 | }, 85 | "source": [ 86 | "from sklearn.model_selection import train_test_split" 87 | ], 88 | "execution_count": null, 89 | "outputs": [] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "metadata": { 94 | "id": "p5TcTZ3Jrv4X" 95 | }, 96 | "source": [ 97 | "X_train, X_test, y_train,y_test = train_test_split(df.texto,df.sentimiento,random_state = 41)" 98 | ], 99 | "execution_count": null, 100 | "outputs": [] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "metadata": { 105 | "id": "9iAwVXaUryE5" 106 | }, 107 | "source": [ 108 | "X_train" 109 | ], 110 | "execution_count": null, 111 | "outputs": [] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "metadata": { 116 | "id": "clmJfHiUry8f" 117 | }, 118 | "source": [ 119 | "from sklearn.linear_model import LinearRegression\n", 120 | "from sklearn.linear_model import LogisticRegression" 121 | ], 122 | "execution_count": null, 123 | "outputs": [] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "metadata": { 128 | "id": "AzZEsFrvr29B" 129 | }, 130 | "source": [ 131 | "regression = LinearRegression()\n", 132 | "regression.fit(X_train,y_train)\n", 133 | "acc = regression.score(X_test,y_test)\n", 134 | "print(acc)" 135 | ], 136 | "execution_count": null, 137 | "outputs": [] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "metadata": { 142 | "id": "Tn_gHwp3suP0" 143 | }, 144 | 
"source": [ 145 | "df[2:3]" 146 | ], 147 | "execution_count": null, 148 | "outputs": [] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "metadata": { 153 | "id": "qayfItKAsw4J" 154 | }, 155 | "source": [ 156 | "df['sentiment'] = df['sentimiento'].replace([\"N\",\"P\"],[0,1])" 157 | ], 158 | "execution_count": null, 159 | "outputs": [] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "metadata": { 164 | "id": "580QmbeQsy_e" 165 | }, 166 | "source": [ 167 | "df.shape" 168 | ], 169 | "execution_count": null, 170 | "outputs": [] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "metadata": { 175 | "id": "-YeouQvKs0_E" 176 | }, 177 | "source": [ 178 | "df.head()" 179 | ], 180 | "execution_count": null, 181 | "outputs": [] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "metadata": { 186 | "id": "WL_hJo-Ate11" 187 | }, 188 | "source": [ 189 | "from sklearn.feature_extraction.text import CountVectorizer" 190 | ], 191 | "execution_count": null, 192 | "outputs": [] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "metadata": { 197 | "id": "Wp4RJV5ztg3R" 198 | }, 199 | "source": [ 200 | "texto = [\"Las propuestas son buenas\", \"Las propuestas son malas\"]\n", 201 | "\n", 202 | "vectorizer = CountVectorizer(lowercase=False)\n", 203 | "\n", 204 | "bag_of_words = vectorizer.fit_transform(texto)" 205 | ], 206 | "execution_count": null, 207 | "outputs": [] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "metadata": { 212 | "id": "c7siuIvcthlR" 213 | }, 214 | "source": [ 215 | "print(bag_of_words.shape)" 216 | ], 217 | "execution_count": null, 218 | "outputs": [] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "metadata": { 223 | "id": "5bxaOPNEtj0d" 224 | }, 225 | "source": [ 226 | "vectorizer.get_feature_names()" 227 | ], 228 | "execution_count": null, 229 | "outputs": [] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "metadata": { 234 | "id": "n-YN70C_tmyC" 235 | }, 236 | "source": [ 237 | "matrix = pd.DataFrame.sparse.from_spmatrix(bag_of_words, columns = 
vectorizer.get_feature_names())" 238 | ], 239 | "execution_count": null, 240 | "outputs": [] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "metadata": { 245 | "id": "2KlXL09QtpWb" 246 | }, 247 | "source": [ 248 | "matrix" 249 | ], 250 | "execution_count": null, 251 | "outputs": [] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "metadata": { 256 | "id": "WDrvTZEJvk8g" 257 | }, 258 | "source": [ 259 | "vectorizer2 = CountVectorizer(lowercase=False, max_features = 100)\n", 260 | "\n", 261 | "bag_of_words2 = vectorizer2.fit_transform(df.texto)" 262 | ], 263 | "execution_count": null, 264 | "outputs": [] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "metadata": { 269 | "id": "Nk0Wz4yvvnBQ" 270 | }, 271 | "source": [ 272 | "print(bag_of_words2.shape)" 273 | ], 274 | "execution_count": null, 275 | "outputs": [] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "metadata": { 280 | "id": "JPMrZQbwvpU-" 281 | }, 282 | "source": [ 283 | "X_train, X_test, y_train,y_test = train_test_split(bag_of_words2,df.sentiment,random_state = 41)" 284 | ], 285 | "execution_count": null, 286 | "outputs": [] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "metadata": { 291 | "id": "e7RCEk-2vrgX" 292 | }, 293 | "source": [ 294 | "regression = LinearRegression()\n", 295 | "regression.fit(X_train,y_train)\n", 296 | "acc = regression.score(X_test,y_test)\n", 297 | "print(acc)" 298 | ], 299 | "execution_count": null, 300 | "outputs": [] 301 | }, 302 | { 303 | "cell_type": "code", 304 | "metadata": { 305 | "id": "z4OAc1opvuFd" 306 | }, 307 | "source": [ 308 | "regression = LogisticRegression()\n", 309 | "regression.fit(X_train,y_train)\n", 310 | "acc = regression.score(X_test,y_test)\n", 311 | "print(acc)" 312 | ], 313 | "execution_count": null, 314 | "outputs": [] 315 | }, 316 | { 317 | "cell_type": "code", 318 | "metadata": { 319 | "id": "WSk1i52HxTm7" 320 | }, 321 | "source": [ 322 | "def clasificador(texto, columna_texto, categoria):\n", 323 | " vectorizer = 
CountVectorizer(lowercase=False, max_features = 400)\n", 324 | " bag_of_words = vectorizer.fit_transform(texto[columna_texto])\n", 325 | " X_train, X_test, y_train,y_test = train_test_split(bag_of_words,texto[categoria],random_state = 41)\n", 326 | " regression = LogisticRegression()\n", 327 | " regression.fit(X_train,y_train)\n", 328 | " return regression.score(X_test,y_test)" 329 | ], 330 | "execution_count": null, 331 | "outputs": [] 332 | }, 333 | { 334 | "cell_type": "code", 335 | "metadata": { 336 | "id": "SWRq_PrBxXBA" 337 | }, 338 | "source": [ 339 | "clasificador(df, \"texto\", \"sentiment\")" 340 | ], 341 | "execution_count": null, 342 | "outputs": [] 343 | }, 344 | { 345 | "cell_type": "code", 346 | "metadata": { 347 | "id": "PLApD0-ax9S2" 348 | }, 349 | "source": [ 350 | "from wordcloud import WordCloud" 351 | ], 352 | "execution_count": null, 353 | "outputs": [] 354 | }, 355 | { 356 | "cell_type": "code", 357 | "metadata": { 358 | "id": "Akv_WP8Jx_wQ" 359 | }, 360 | "source": [ 361 | "palabras = ' '.join([palabras for palabras in df['texto']])" 362 | ], 363 | "execution_count": null, 364 | "outputs": [] 365 | }, 366 | { 367 | "cell_type": "code", 368 | "metadata": { 369 | "id": "ohhipZYbyCBs" 370 | }, 371 | "source": [ 372 | "palabras" 373 | ], 374 | "execution_count": null, 375 | "outputs": [] 376 | }, 377 | { 378 | "cell_type": "code", 379 | "metadata": { 380 | "id": "RS0fdGKdyEL2" 381 | }, 382 | "source": [ 383 | "len(palabras)" 384 | ], 385 | "execution_count": null, 386 | "outputs": [] 387 | }, 388 | { 389 | "cell_type": "code", 390 | "metadata": { 391 | "id": "W18Mv1_gyHPU" 392 | }, 393 | "source": [ 394 | "import matplotlib.pyplot as plt\n", 395 | "%matplotlib inline \n", 396 | "\n", 397 | "def plot_cloud(wordcloud):\n", 398 | " plt.figure(figsize=(10,5))\n", 399 | " plt.imshow(wordcloud, interpolation = 'bilinear')\n", 400 | " plt.axis('off')" 401 | ], 402 | "execution_count": null, 403 | "outputs": [] 404 | }, 405 | { 406 | "cell_type": 
"code", 407 | "metadata": { 408 | "id": "EaKziQPzyKS9" 409 | }, 410 | "source": [ 411 | "wordcloud = WordCloud(width=800, height=500, collocations=False).generate(palabras)\n", 412 | "plot_cloud(wordcloud)" 413 | ], 414 | "execution_count": null, 415 | "outputs": [] 416 | }, 417 | { 418 | "cell_type": "code", 419 | "metadata": { 420 | "id": "8GV4mEBK2N1u" 421 | }, 422 | "source": [ 423 | "df_pos = df.query(\"sentimiento == 'P'\")" 424 | ], 425 | "execution_count": null, 426 | "outputs": [] 427 | }, 428 | { 429 | "cell_type": "code", 430 | "metadata": { 431 | "id": "KG8ydxdZ2OYh" 432 | }, 433 | "source": [ 434 | "df_pos.head()" 435 | ], 436 | "execution_count": null, 437 | "outputs": [] 438 | }, 439 | { 440 | "cell_type": "code", 441 | "metadata": { 442 | "id": "GdU3bMPg2Qnl" 443 | }, 444 | "source": [ 445 | "def plot_cloud_neg(texto):\n", 446 | " df_neg = df.query(\"sentimiento == 'N'\")\n", 447 | " palabras = ' '.join([palabras for palabras in df_neg['texto']])\n", 448 | " wordcloud = WordCloud(width=800, height=500, collocations=False).generate(palabras)\n", 449 | " plt.figure(figsize=(10,5))\n", 450 | " plt.imshow(wordcloud, interpolation = 'bilinear')\n", 451 | " plt.axis('off')" 452 | ], 453 | "execution_count": null, 454 | "outputs": [] 455 | }, 456 | { 457 | "cell_type": "code", 458 | "metadata": { 459 | "id": "FfoFWgou2TS6" 460 | }, 461 | "source": [ 462 | "def plot_cloud_pos(texto):\n", 463 | " df_pos = df.query(\"sentimiento == 'P'\")\n", 464 | " palabras = ' '.join([palabras for palabras in df_pos['texto']])\n", 465 | " wordcloud = WordCloud(width=800, height=500, collocations=False).generate(palabras)\n", 466 | " plt.figure(figsize=(10,5))\n", 467 | " plt.imshow(wordcloud, interpolation = 'bilinear')\n", 468 | " plt.axis('off')" 469 | ], 470 | "execution_count": null, 471 | "outputs": [] 472 | }, 473 | { 474 | "cell_type": "code", 475 | "metadata": { 476 | "id": "kx_cEVie34YC" 477 | }, 478 | "source": [ 479 | "plot_cloud_neg(df)" 480 | ], 481 | 
"execution_count": null, 482 | "outputs": [] 483 | }, 484 | { 485 | "cell_type": "code", 486 | "metadata": { 487 | "id": "DGQCXBe_36xK" 488 | }, 489 | "source": [ 490 | "plot_cloud_pos(df)" 491 | ], 492 | "execution_count": null, 493 | "outputs": [] 494 | }, 495 | { 496 | "cell_type": "code", 497 | "metadata": { 498 | "id": "gs4YMc8z43T6" 499 | }, 500 | "source": [ 501 | "import nltk \n", 502 | "\n", 503 | "frase =['el candidato es bueno', 'el candidato es malo']\n", 504 | "\n", 505 | "frequencia = nltk.FreqDist(frase)" 506 | ], 507 | "execution_count": null, 508 | "outputs": [] 509 | }, 510 | { 511 | "cell_type": "code", 512 | "metadata": { 513 | "id": "n83k7XwW4-Hb" 514 | }, 515 | "source": [ 516 | "frequencia" 517 | ], 518 | "execution_count": null, 519 | "outputs": [] 520 | }, 521 | { 522 | "cell_type": "code", 523 | "metadata": { 524 | "id": "ne-BxXgz57Dr" 525 | }, 526 | "source": [ 527 | "from nltk import tokenize" 528 | ], 529 | "execution_count": null, 530 | "outputs": [] 531 | }, 532 | { 533 | "cell_type": "code", 534 | "metadata": { 535 | "id": "_eSYNSIN59ej" 536 | }, 537 | "source": [ 538 | "frase2 = 'el candidato quiere aprender LNP!'\n", 539 | "\n", 540 | "token_espacio = tokenize.WhitespaceTokenizer()\n", 541 | "\n", 542 | "frase_token = token_espacio.tokenize(frase2)" 543 | ], 544 | "execution_count": null, 545 | "outputs": [] 546 | }, 547 | { 548 | "cell_type": "code", 549 | "metadata": { 550 | "id": "9J2cg7wO6Ayu" 551 | }, 552 | "source": [ 553 | "print(frase_token)" 554 | ], 555 | "execution_count": null, 556 | "outputs": [] 557 | }, 558 | { 559 | "cell_type": "code", 560 | "metadata": { 561 | "id": "9XTPQNph6c8y" 562 | }, 563 | "source": [ 564 | "frase_token = token_espacio.tokenize(palabras)\n", 565 | "\n", 566 | "frequencia = nltk.FreqDist(frase_token)\n", 567 | "\n", 568 | "df_frequencia = pd.DataFrame({'palabra':list(frequencia.keys()), 'frequencia':list(frequencia.values())})" 569 | ], 570 | "execution_count": null, 571 | "outputs": [] 572 | 
}, 573 | { 574 | "cell_type": "code", 575 | "metadata": { 576 | "id": "mSTKKEOE6g3b" 577 | }, 578 | "source": [ 579 | "df_frequencia.head()" 580 | ], 581 | "execution_count": null, 582 | "outputs": [] 583 | }, 584 | { 585 | "cell_type": "code", 586 | "metadata": { 587 | "id": "mIXGvBxx7Rzt" 588 | }, 589 | "source": [ 590 | "def histo_barras(texto, columna_texto, cantidad): \n", 591 | " \n", 592 | " plt.figure(figsize=(12,8))\n", 593 | "\n", 594 | " palabras = ' '.join([palabras for palabras in df['texto']])\n", 595 | "\n", 596 | " frase_token = token_espacio.tokenize(palabras)\n", 597 | "\n", 598 | " frequencia = nltk.FreqDist(frase_token)\n", 599 | "\n", 600 | " df_frequencia = pd.DataFrame({'palabra':list(frequencia.keys()), 'frequencia':list(frequencia.values())}) \n", 601 | "\n", 602 | " df_frequencia = df_frequencia.nlargest(columns='frequencia', n = cantidad)\n", 603 | "\n", 604 | " ax = sns.barplot(data=df_frequencia, x='palabra', y='frequencia', color='gray')\n", 605 | "\n", 606 | " ax.set(ylabel = 'conteo')\n", 607 | "\n", 608 | " plt.show()" 609 | ], 610 | "execution_count": null, 611 | "outputs": [] 612 | }, 613 | { 614 | "cell_type": "code", 615 | "metadata": { 616 | "id": "vopAPhqw7U1O" 617 | }, 618 | "source": [ 619 | "histo_barras(df, \"texto\", 30)" 620 | ], 621 | "execution_count": null, 622 | "outputs": [] 623 | } 624 | ] 625 | } -------------------------------------------------------------------------------- /aula5.2/aula5_2.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "aula5.2.ipynb", 7 | "provenance": [], 8 | "collapsed_sections": [] 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | }, 14 | "language_info": { 15 | "name": "python" 16 | } 17 | }, 18 | "cells": [ 19 | { 20 | "cell_type": "code", 21 | "metadata": { 22 | "id": "dr-kjMPUrfCn" 23 | }, 24 | "source": [ 25 | 
"import pandas as pd\n", 26 | "pd.set_option('display.max_rows', None)\n", 27 | "pd.set_option('display.max_columns', None)\n", 28 | "pd.set_option('display.width', None)\n", 29 | "pd.set_option('display.max_colwidth', None)\n", 30 | "import warnings\n", 31 | "warnings.filterwarnings(\"ignore\")" 32 | ], 33 | "execution_count": null, 34 | "outputs": [] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "metadata": { 39 | "id": "M8NSaAocriEo" 40 | }, 41 | "source": [ 42 | "df = pd.read_csv('/content/sample_data/colombian_elections.csv')" 43 | ], 44 | "execution_count": null, 45 | "outputs": [] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "metadata": { 50 | "id": "_HGAVlSSri8M" 51 | }, 52 | "source": [ 53 | "df = df[['tweetText','polarity_value']].rename(columns={\"tweetText\":\"texto\",\"polarity_value\":\"sentimiento\"})" 54 | ], 55 | "execution_count": null, 56 | "outputs": [] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "metadata": { 61 | "id": "AVilnlMOrlnb" 62 | }, 63 | "source": [ 64 | "df.head()" 65 | ], 66 | "execution_count": null, 67 | "outputs": [] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "metadata": { 72 | "id": "4ik-aoQQrpO_" 73 | }, 74 | "source": [ 75 | "df['sentimiento'].value_counts()" 76 | ], 77 | "execution_count": null, 78 | "outputs": [] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "metadata": { 83 | "id": "yqzEp7d7rtci" 84 | }, 85 | "source": [ 86 | "from sklearn.model_selection import train_test_split" 87 | ], 88 | "execution_count": null, 89 | "outputs": [] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "metadata": { 94 | "id": "p5TcTZ3Jrv4X" 95 | }, 96 | "source": [ 97 | "X_train, X_test, y_train,y_test = train_test_split(df.texto,df.sentimiento,random_state = 41)" 98 | ], 99 | "execution_count": null, 100 | "outputs": [] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "metadata": { 105 | "id": "9iAwVXaUryE5" 106 | }, 107 | "source": [ 108 | "X_train" 109 | ], 110 | "execution_count": null, 111 | "outputs": [] 112 | }, 113 | { 114 | 
"cell_type": "code", 115 | "metadata": { 116 | "id": "clmJfHiUry8f" 117 | }, 118 | "source": [ 119 | "from sklearn.linear_model import LinearRegression\n", 120 | "from sklearn.linear_model import LogisticRegression" 121 | ], 122 | "execution_count": null, 123 | "outputs": [] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "metadata": { 128 | "id": "AzZEsFrvr29B" 129 | }, 130 | "source": [ 131 | "regression = LinearRegression()\n", 132 | "regression.fit(X_train,y_train)\n", 133 | "acc = regression.score(X_test,y_test)\n", 134 | "print(acc)" 135 | ], 136 | "execution_count": null, 137 | "outputs": [] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "metadata": { 142 | "id": "Tn_gHwp3suP0" 143 | }, 144 | "source": [ 145 | "df[2:3]" 146 | ], 147 | "execution_count": null, 148 | "outputs": [] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "metadata": { 153 | "id": "qayfItKAsw4J" 154 | }, 155 | "source": [ 156 | "df['sentiment'] = df['sentimiento'].replace([\"N\",\"P\"],[0,1])" 157 | ], 158 | "execution_count": null, 159 | "outputs": [] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "metadata": { 164 | "id": "580QmbeQsy_e" 165 | }, 166 | "source": [ 167 | "df.shape" 168 | ], 169 | "execution_count": null, 170 | "outputs": [] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "metadata": { 175 | "id": "-YeouQvKs0_E" 176 | }, 177 | "source": [ 178 | "df.head()" 179 | ], 180 | "execution_count": null, 181 | "outputs": [] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "metadata": { 186 | "id": "WL_hJo-Ate11" 187 | }, 188 | "source": [ 189 | "from sklearn.feature_extraction.text import CountVectorizer" 190 | ], 191 | "execution_count": null, 192 | "outputs": [] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "metadata": { 197 | "id": "Wp4RJV5ztg3R" 198 | }, 199 | "source": [ 200 | "texto = [\"Las propuestas son buenas\", \"Las propuestas son malas\"]\n", 201 | "\n", 202 | "vectorizer = CountVectorizer(lowercase=False)\n", 203 | "\n", 204 | "bag_of_words = 
vectorizer.fit_transform(texto)" 205 | ], 206 | "execution_count": null, 207 | "outputs": [] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "metadata": { 212 | "id": "c7siuIvcthlR" 213 | }, 214 | "source": [ 215 | "print(bag_of_words.shape)" 216 | ], 217 | "execution_count": null, 218 | "outputs": [] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "metadata": { 223 | "id": "5bxaOPNEtj0d" 224 | }, 225 | "source": [ 226 | "vectorizer.get_feature_names()" 227 | ], 228 | "execution_count": null, 229 | "outputs": [] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "metadata": { 234 | "id": "n-YN70C_tmyC" 235 | }, 236 | "source": [ 237 | "matrix = pd.DataFrame.sparse.from_spmatrix(bag_of_words, columns = vectorizer.get_feature_names())" 238 | ], 239 | "execution_count": null, 240 | "outputs": [] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "metadata": { 245 | "id": "2KlXL09QtpWb" 246 | }, 247 | "source": [ 248 | "matrix" 249 | ], 250 | "execution_count": null, 251 | "outputs": [] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "metadata": { 256 | "id": "WDrvTZEJvk8g" 257 | }, 258 | "source": [ 259 | "vectorizer2 = CountVectorizer(lowercase=False, max_features = 100)\n", 260 | "\n", 261 | "bag_of_words2 = vectorizer2.fit_transform(df.texto)" 262 | ], 263 | "execution_count": null, 264 | "outputs": [] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "metadata": { 269 | "id": "Nk0Wz4yvvnBQ" 270 | }, 271 | "source": [ 272 | "print(bag_of_words2.shape)" 273 | ], 274 | "execution_count": null, 275 | "outputs": [] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "metadata": { 280 | "id": "JPMrZQbwvpU-" 281 | }, 282 | "source": [ 283 | "X_train, X_test, y_train,y_test = train_test_split(bag_of_words2,df.sentiment,random_state = 41)" 284 | ], 285 | "execution_count": null, 286 | "outputs": [] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "metadata": { 291 | "id": "e7RCEk-2vrgX" 292 | }, 293 | "source": [ 294 | "regression = LinearRegression()\n", 295 | 
"regression.fit(X_train,y_train)\n", 296 | "acc = regression.score(X_test,y_test)\n", 297 | "print(acc)" 298 | ], 299 | "execution_count": null, 300 | "outputs": [] 301 | }, 302 | { 303 | "cell_type": "code", 304 | "metadata": { 305 | "id": "z4OAc1opvuFd" 306 | }, 307 | "source": [ 308 | "regression = LogisticRegression()\n", 309 | "regression.fit(X_train,y_train)\n", 310 | "acc = regression.score(X_test,y_test)\n", 311 | "print(acc)" 312 | ], 313 | "execution_count": null, 314 | "outputs": [] 315 | }, 316 | { 317 | "cell_type": "code", 318 | "metadata": { 319 | "id": "WSk1i52HxTm7" 320 | }, 321 | "source": [ 322 | "def clasificador(texto, columna_texto, categoria):\n", 323 | " vectorizer = CountVectorizer(lowercase=False, max_features = 400)\n", 324 | " bag_of_words = vectorizer.fit_transform(texto[columna_texto])\n", 325 | " X_train, X_test, y_train,y_test = train_test_split(bag_of_words,texto[categoria],random_state = 41)\n", 326 | " regression = LogisticRegression()\n", 327 | " regression.fit(X_train,y_train)\n", 328 | " return regression.score(X_test,y_test)" 329 | ], 330 | "execution_count": null, 331 | "outputs": [] 332 | }, 333 | { 334 | "cell_type": "code", 335 | "metadata": { 336 | "id": "SWRq_PrBxXBA" 337 | }, 338 | "source": [ 339 | "clasificador(df, \"texto\", \"sentiment\")" 340 | ], 341 | "execution_count": null, 342 | "outputs": [] 343 | }, 344 | { 345 | "cell_type": "code", 346 | "metadata": { 347 | "id": "PLApD0-ax9S2" 348 | }, 349 | "source": [ 350 | "from wordcloud import WordCloud" 351 | ], 352 | "execution_count": null, 353 | "outputs": [] 354 | }, 355 | { 356 | "cell_type": "code", 357 | "metadata": { 358 | "id": "Akv_WP8Jx_wQ" 359 | }, 360 | "source": [ 361 | "palabras = ' '.join([palabras for palabras in df['texto']])" 362 | ], 363 | "execution_count": null, 364 | "outputs": [] 365 | }, 366 | { 367 | "cell_type": "code", 368 | "metadata": { 369 | "id": "ohhipZYbyCBs" 370 | }, 371 | "source": [ 372 | "palabras" 373 | ], 374 | 
"execution_count": null, 375 | "outputs": [] 376 | }, 377 | { 378 | "cell_type": "code", 379 | "metadata": { 380 | "id": "RS0fdGKdyEL2" 381 | }, 382 | "source": [ 383 | "len(palabras)" 384 | ], 385 | "execution_count": null, 386 | "outputs": [] 387 | }, 388 | { 389 | "cell_type": "code", 390 | "metadata": { 391 | "id": "W18Mv1_gyHPU" 392 | }, 393 | "source": [ 394 | "import matplotlib.pyplot as plt\n", 395 | "%matplotlib inline \n", 396 | "\n", 397 | "def plot_cloud(wordcloud):\n", 398 | " plt.figure(figsize=(10,5))\n", 399 | " plt.imshow(wordcloud, interpolation = 'bilinear')\n", 400 | " plt.axis('off')" 401 | ], 402 | "execution_count": null, 403 | "outputs": [] 404 | }, 405 | { 406 | "cell_type": "code", 407 | "metadata": { 408 | "id": "EaKziQPzyKS9" 409 | }, 410 | "source": [ 411 | "wordcloud = WordCloud(width=800, height=500, collocations=False).generate(palabras)\n", 412 | "plot_cloud(wordcloud)" 413 | ], 414 | "execution_count": null, 415 | "outputs": [] 416 | }, 417 | { 418 | "cell_type": "code", 419 | "metadata": { 420 | "id": "8GV4mEBK2N1u" 421 | }, 422 | "source": [ 423 | "df_pos = df.query(\"sentimiento == 'P'\")" 424 | ], 425 | "execution_count": null, 426 | "outputs": [] 427 | }, 428 | { 429 | "cell_type": "code", 430 | "metadata": { 431 | "id": "KG8ydxdZ2OYh" 432 | }, 433 | "source": [ 434 | "df_pos.head()" 435 | ], 436 | "execution_count": null, 437 | "outputs": [] 438 | }, 439 | { 440 | "cell_type": "code", 441 | "metadata": { 442 | "id": "GdU3bMPg2Qnl" 443 | }, 444 | "source": [ 445 | "def plot_cloud_neg(texto):\n", 446 | " df_neg = df.query(\"sentimiento == 'N'\")\n", 447 | " palabras = ' '.join([palabras for palabras in df_neg['texto']])\n", 448 | " wordcloud = WordCloud(width=800, height=500, collocations=False).generate(palabras)\n", 449 | " plt.figure(figsize=(10,5))\n", 450 | " plt.imshow(wordcloud, interpolation = 'bilinear')\n", 451 | " plt.axis('off')" 452 | ], 453 | "execution_count": null, 454 | "outputs": [] 455 | }, 456 | { 457 | 
"cell_type": "code", 458 | "metadata": { 459 | "id": "FfoFWgou2TS6" 460 | }, 461 | "source": [ 462 | "def plot_cloud_pos(texto):\n", 463 | " df_pos = df.query(\"sentimiento == 'P'\")\n", 464 | " palabras = ' '.join([palabras for palabras in df_pos['texto']])\n", 465 | " wordcloud = WordCloud(width=800, height=500, collocations=False).generate(palabras)\n", 466 | " plt.figure(figsize=(10,5))\n", 467 | " plt.imshow(wordcloud, interpolation = 'bilinear')\n", 468 | " plt.axis('off')" 469 | ], 470 | "execution_count": null, 471 | "outputs": [] 472 | }, 473 | { 474 | "cell_type": "code", 475 | "metadata": { 476 | "id": "kx_cEVie34YC" 477 | }, 478 | "source": [ 479 | "plot_cloud_neg(df)" 480 | ], 481 | "execution_count": null, 482 | "outputs": [] 483 | }, 484 | { 485 | "cell_type": "code", 486 | "metadata": { 487 | "id": "DGQCXBe_36xK" 488 | }, 489 | "source": [ 490 | "plot_cloud_pos(df)" 491 | ], 492 | "execution_count": null, 493 | "outputs": [] 494 | }, 495 | { 496 | "cell_type": "code", 497 | "metadata": { 498 | "id": "gs4YMc8z43T6" 499 | }, 500 | "source": [ 501 | "import nltk \n", 502 | "\n", 503 | "frase =['el candidato es bueno', 'el candidato es malo']\n", 504 | "\n", 505 | "frequencia = nltk.FreqDist(frase)" 506 | ], 507 | "execution_count": null, 508 | "outputs": [] 509 | }, 510 | { 511 | "cell_type": "code", 512 | "metadata": { 513 | "id": "n83k7XwW4-Hb" 514 | }, 515 | "source": [ 516 | "frequencia" 517 | ], 518 | "execution_count": null, 519 | "outputs": [] 520 | }, 521 | { 522 | "cell_type": "code", 523 | "metadata": { 524 | "id": "ne-BxXgz57Dr" 525 | }, 526 | "source": [ 527 | "from nltk import tokenize" 528 | ], 529 | "execution_count": null, 530 | "outputs": [] 531 | }, 532 | { 533 | "cell_type": "code", 534 | "metadata": { 535 | "id": "_eSYNSIN59ej" 536 | }, 537 | "source": [ 538 | "frase2 = 'el candidato quiere aprender LNP!'\n", 539 | "\n", 540 | "token_espacio = tokenize.WhitespaceTokenizer()\n", 541 | "\n", 542 | "frase_token = 
token_espacio.tokenize(frase2)" 543 | ], 544 | "execution_count": null, 545 | "outputs": [] 546 | }, 547 | { 548 | "cell_type": "code", 549 | "metadata": { 550 | "id": "9J2cg7wO6Ayu" 551 | }, 552 | "source": [ 553 | "print(frase_token)" 554 | ], 555 | "execution_count": null, 556 | "outputs": [] 557 | }, 558 | { 559 | "cell_type": "code", 560 | "metadata": { 561 | "id": "9XTPQNph6c8y" 562 | }, 563 | "source": [ 564 | "frase_token = token_espacio.tokenize(palabras)\n", 565 | "\n", 566 | "frequencia = nltk.FreqDist(frase_token)\n", 567 | "\n", 568 | "df_frequencia = pd.DataFrame({'palabra':list(frequencia.keys()), 'frequencia':list(frequencia.values())})" 569 | ], 570 | "execution_count": null, 571 | "outputs": [] 572 | }, 573 | { 574 | "cell_type": "code", 575 | "metadata": { 576 | "id": "mSTKKEOE6g3b" 577 | }, 578 | "source": [ 579 | "df_frequencia.head()" 580 | ], 581 | "execution_count": null, 582 | "outputs": [] 583 | }, 584 | { 585 | "cell_type": "code", 586 | "metadata": { 587 | "id": "mIXGvBxx7Rzt" 588 | }, 589 | "source": [ 590 | "def histo_barras(texto, columna_texto, cantidad): \n", 591 | " \n", 592 | " plt.figure(figsize=(12,8))\n", 593 | "\n", 594 | " palabras = ' '.join([palabras for palabras in df['texto']])\n", 595 | "\n", 596 | " frase_token = token_espacio.tokenize(palabras)\n", 597 | "\n", 598 | " frequencia = nltk.FreqDist(frase_token)\n", 599 | "\n", 600 | " df_frequencia = pd.DataFrame({'palabra':list(frequencia.keys()), 'frequencia':list(frequencia.values())}) \n", 601 | "\n", 602 | " df_frequencia = df_frequencia.nlargest(columns='frequencia', n = cantidad)\n", 603 | "\n", 604 | " ax = sns.barplot(data=df_frequencia, x='palabra', y='frequencia', color='gray')\n", 605 | "\n", 606 | " ax.set(ylabel = 'conteo')\n", 607 | "\n", 608 | " plt.show()" 609 | ], 610 | "execution_count": null, 611 | "outputs": [] 612 | }, 613 | { 614 | "cell_type": "code", 615 | "metadata": { 616 | "id": "vopAPhqw7U1O" 617 | }, 618 | "source": [ 619 | "histo_barras(df, 
\"texto\", 30)" 620 | ], 621 | "execution_count": null, 622 | "outputs": [] 623 | } 624 | ] 625 | } -------------------------------------------------------------------------------- /aula5.3/aula5_3.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "aula5.3.ipynb", 7 | "provenance": [], 8 | "collapsed_sections": [] 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | }, 14 | "language_info": { 15 | "name": "python" 16 | } 17 | }, 18 | "cells": [ 19 | { 20 | "cell_type": "code", 21 | "metadata": { 22 | "id": "dr-kjMPUrfCn" 23 | }, 24 | "source": [ 25 | "import pandas as pd\n", 26 | "pd.set_option('display.max_rows', None)\n", 27 | "pd.set_option('display.max_columns', None)\n", 28 | "pd.set_option('display.width', None)\n", 29 | "pd.set_option('display.max_colwidth', None)\n", 30 | "import warnings\n", 31 | "warnings.filterwarnings(\"ignore\")" 32 | ], 33 | "execution_count": null, 34 | "outputs": [] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "metadata": { 39 | "id": "M8NSaAocriEo" 40 | }, 41 | "source": [ 42 | "df = pd.read_csv('/content/sample_data/colombian_elections.csv')" 43 | ], 44 | "execution_count": null, 45 | "outputs": [] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "metadata": { 50 | "id": "_HGAVlSSri8M" 51 | }, 52 | "source": [ 53 | "df = df[['tweetText','polarity_value']].rename(columns={\"tweetText\":\"texto\",\"polarity_value\":\"sentimiento\"})" 54 | ], 55 | "execution_count": null, 56 | "outputs": [] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "metadata": { 61 | "id": "AVilnlMOrlnb" 62 | }, 63 | "source": [ 64 | "df.head()" 65 | ], 66 | "execution_count": null, 67 | "outputs": [] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "metadata": { 72 | "id": "4ik-aoQQrpO_" 73 | }, 74 | "source": [ 75 | "df['sentimiento'].value_counts()" 76 | ], 77 | "execution_count": null, 78 | 
"outputs": [] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "metadata": { 83 | "id": "yqzEp7d7rtci" 84 | }, 85 | "source": [ 86 | "from sklearn.model_selection import train_test_split" 87 | ], 88 | "execution_count": null, 89 | "outputs": [] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "metadata": { 94 | "id": "p5TcTZ3Jrv4X" 95 | }, 96 | "source": [ 97 | "X_train, X_test, y_train,y_test = train_test_split(df.texto,df.sentimiento,random_state = 41)" 98 | ], 99 | "execution_count": null, 100 | "outputs": [] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "metadata": { 105 | "id": "9iAwVXaUryE5" 106 | }, 107 | "source": [ 108 | "X_train" 109 | ], 110 | "execution_count": null, 111 | "outputs": [] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "metadata": { 116 | "id": "clmJfHiUry8f" 117 | }, 118 | "source": [ 119 | "from sklearn.linear_model import LinearRegression\n", 120 | "from sklearn.linear_model import LogisticRegression" 121 | ], 122 | "execution_count": null, 123 | "outputs": [] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "metadata": { 128 | "id": "AzZEsFrvr29B" 129 | }, 130 | "source": [ 131 | "regression = LinearRegression()\n", 132 | "regression.fit(X_train,y_train)\n", 133 | "acc = regression.score(X_test,y_test)\n", 134 | "print(acc)" 135 | ], 136 | "execution_count": null, 137 | "outputs": [] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "metadata": { 142 | "id": "Tn_gHwp3suP0" 143 | }, 144 | "source": [ 145 | "df[2:3]" 146 | ], 147 | "execution_count": null, 148 | "outputs": [] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "metadata": { 153 | "id": "qayfItKAsw4J" 154 | }, 155 | "source": [ 156 | "df['sentiment'] = df['sentimiento'].replace([\"N\",\"P\"],[0,1])" 157 | ], 158 | "execution_count": null, 159 | "outputs": [] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "metadata": { 164 | "id": "580QmbeQsy_e" 165 | }, 166 | "source": [ 167 | "df.shape" 168 | ], 169 | "execution_count": null, 170 | "outputs": [] 171 | }, 172 | { 173 | 
"cell_type": "code", 174 | "metadata": { 175 | "id": "-YeouQvKs0_E" 176 | }, 177 | "source": [ 178 | "df.head()" 179 | ], 180 | "execution_count": null, 181 | "outputs": [] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "metadata": { 186 | "id": "WL_hJo-Ate11" 187 | }, 188 | "source": [ 189 | "from sklearn.feature_extraction.text import CountVectorizer" 190 | ], 191 | "execution_count": null, 192 | "outputs": [] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "metadata": { 197 | "id": "Wp4RJV5ztg3R" 198 | }, 199 | "source": [ 200 | "texto = [\"Las propuestas son buenas\", \"Las propuestas son malas\"]\n", 201 | "\n", 202 | "vectorizer = CountVectorizer(lowercase=False)\n", 203 | "\n", 204 | "bag_of_words = vectorizer.fit_transform(texto)" 205 | ], 206 | "execution_count": null, 207 | "outputs": [] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "metadata": { 212 | "id": "c7siuIvcthlR" 213 | }, 214 | "source": [ 215 | "print(bag_of_words.shape)" 216 | ], 217 | "execution_count": null, 218 | "outputs": [] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "metadata": { 223 | "id": "5bxaOPNEtj0d" 224 | }, 225 | "source": [ 226 | "vectorizer.get_feature_names()" 227 | ], 228 | "execution_count": null, 229 | "outputs": [] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "metadata": { 234 | "id": "n-YN70C_tmyC" 235 | }, 236 | "source": [ 237 | "matrix = pd.DataFrame.sparse.from_spmatrix(bag_of_words, columns = vectorizer.get_feature_names())" 238 | ], 239 | "execution_count": null, 240 | "outputs": [] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "metadata": { 245 | "id": "2KlXL09QtpWb" 246 | }, 247 | "source": [ 248 | "matrix" 249 | ], 250 | "execution_count": null, 251 | "outputs": [] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "metadata": { 256 | "id": "WDrvTZEJvk8g" 257 | }, 258 | "source": [ 259 | "vectorizer2 = CountVectorizer(lowercase=False, max_features = 100)\n", 260 | "\n", 261 | "bag_of_words2 = vectorizer2.fit_transform(df.texto)" 262 | ], 
263 | "execution_count": null, 264 | "outputs": [] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "metadata": { 269 | "id": "Nk0Wz4yvvnBQ" 270 | }, 271 | "source": [ 272 | "print(bag_of_words2.shape)" 273 | ], 274 | "execution_count": null, 275 | "outputs": [] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "metadata": { 280 | "id": "JPMrZQbwvpU-" 281 | }, 282 | "source": [ 283 | "X_train, X_test, y_train,y_test = train_test_split(bag_of_words2,df.sentiment,random_state = 41)" 284 | ], 285 | "execution_count": null, 286 | "outputs": [] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "metadata": { 291 | "id": "e7RCEk-2vrgX" 292 | }, 293 | "source": [ 294 | "regression = LinearRegression()\n", 295 | "regression.fit(X_train,y_train)\n", 296 | "acc = regression.score(X_test,y_test)\n", 297 | "print(acc)" 298 | ], 299 | "execution_count": null, 300 | "outputs": [] 301 | }, 302 | { 303 | "cell_type": "code", 304 | "metadata": { 305 | "id": "z4OAc1opvuFd" 306 | }, 307 | "source": [ 308 | "regression = LogisticRegression()\n", 309 | "regression.fit(X_train,y_train)\n", 310 | "acc = regression.score(X_test,y_test)\n", 311 | "print(acc)" 312 | ], 313 | "execution_count": null, 314 | "outputs": [] 315 | }, 316 | { 317 | "cell_type": "code", 318 | "metadata": { 319 | "id": "WSk1i52HxTm7" 320 | }, 321 | "source": [ 322 | "def clasificador(texto, columna_texto, categoria):\n", 323 | " vectorizer = CountVectorizer(lowercase=False, max_features = 400)\n", 324 | " bag_of_words = vectorizer.fit_transform(texto[columna_texto])\n", 325 | " X_train, X_test, y_train,y_test = train_test_split(bag_of_words,texto[categoria],random_state = 41)\n", 326 | " regression = LogisticRegression()\n", 327 | " regression.fit(X_train,y_train)\n", 328 | " return regression.score(X_test,y_test)" 329 | ], 330 | "execution_count": null, 331 | "outputs": [] 332 | }, 333 | { 334 | "cell_type": "code", 335 | "metadata": { 336 | "id": "SWRq_PrBxXBA" 337 | }, 338 | "source": [ 339 | "clasificador(df, 
\"texto\", \"sentiment\")" 340 | ], 341 | "execution_count": null, 342 | "outputs": [] 343 | }, 344 | { 345 | "cell_type": "code", 346 | "metadata": { 347 | "id": "PLApD0-ax9S2" 348 | }, 349 | "source": [ 350 | "from wordcloud import WordCloud" 351 | ], 352 | "execution_count": null, 353 | "outputs": [] 354 | }, 355 | { 356 | "cell_type": "code", 357 | "metadata": { 358 | "id": "Akv_WP8Jx_wQ" 359 | }, 360 | "source": [ 361 | "palabras = ' '.join([palabras for palabras in df['texto']])" 362 | ], 363 | "execution_count": null, 364 | "outputs": [] 365 | }, 366 | { 367 | "cell_type": "code", 368 | "metadata": { 369 | "id": "ohhipZYbyCBs" 370 | }, 371 | "source": [ 372 | "palabras" 373 | ], 374 | "execution_count": null, 375 | "outputs": [] 376 | }, 377 | { 378 | "cell_type": "code", 379 | "metadata": { 380 | "id": "RS0fdGKdyEL2" 381 | }, 382 | "source": [ 383 | "len(palabras)" 384 | ], 385 | "execution_count": null, 386 | "outputs": [] 387 | }, 388 | { 389 | "cell_type": "code", 390 | "metadata": { 391 | "id": "W18Mv1_gyHPU" 392 | }, 393 | "source": [ 394 | "import matplotlib.pyplot as plt\n", 395 | "%matplotlib inline \n", 396 | "\n", 397 | "def plot_cloud(wordcloud):\n", 398 | " plt.figure(figsize=(10,5))\n", 399 | " plt.imshow(wordcloud, interpolation = 'bilinear')\n", 400 | " plt.axis('off')" 401 | ], 402 | "execution_count": null, 403 | "outputs": [] 404 | }, 405 | { 406 | "cell_type": "code", 407 | "metadata": { 408 | "id": "EaKziQPzyKS9" 409 | }, 410 | "source": [ 411 | "wordcloud = WordCloud(width=800, height=500, collocations=False).generate(palabras)\n", 412 | "plot_cloud(wordcloud)" 413 | ], 414 | "execution_count": null, 415 | "outputs": [] 416 | }, 417 | { 418 | "cell_type": "code", 419 | "metadata": { 420 | "id": "8GV4mEBK2N1u" 421 | }, 422 | "source": [ 423 | "df_pos = df.query(\"sentimiento == 'P'\")" 424 | ], 425 | "execution_count": null, 426 | "outputs": [] 427 | }, 428 | { 429 | "cell_type": "code", 430 | "metadata": { 431 | "id": "KG8ydxdZ2OYh" 432 
| }, 433 | "source": [ 434 | "df_pos.head()" 435 | ], 436 | "execution_count": null, 437 | "outputs": [] 438 | }, 439 | { 440 | "cell_type": "code", 441 | "metadata": { 442 | "id": "GdU3bMPg2Qnl" 443 | }, 444 | "source": [ 445 | "def plot_cloud_neg(texto): # word cloud built from the rows of DataFrame `texto` whose sentimiento is 'N'\n", 446 | " df_neg = texto.query(\"sentimiento == 'N'\") # filter the argument instead of the global df\n", 447 | " palabras = ' '.join([palabras for palabras in df_neg['texto']])\n", 448 | " wordcloud = WordCloud(width=800, height=500, collocations=False).generate(palabras)\n", 449 | " plt.figure(figsize=(10,5))\n", 450 | " plt.imshow(wordcloud, interpolation = 'bilinear')\n", 451 | " plt.axis('off')" 452 | ], 453 | "execution_count": null, 454 | "outputs": [] 455 | }, 456 | { 457 | "cell_type": "code", 458 | "metadata": { 459 | "id": "FfoFWgou2TS6" 460 | }, 461 | "source": [ 462 | "def plot_cloud_pos(texto): # word cloud built from the rows of DataFrame `texto` whose sentimiento is 'P'\n", 463 | " df_pos = texto.query(\"sentimiento == 'P'\") # filter the argument instead of the global df\n", 464 | " palabras = ' '.join([palabras for palabras in df_pos['texto']])\n", 465 | " wordcloud = WordCloud(width=800, height=500, collocations=False).generate(palabras)\n", 466 | " plt.figure(figsize=(10,5))\n", 467 | " plt.imshow(wordcloud, interpolation = 'bilinear')\n", 468 | " plt.axis('off')" 469 | ], 470 | "execution_count": null, 471 | "outputs": [] 472 | }, 473 | { 474 | "cell_type": "code", 475 | "metadata": { 476 | "id": "kx_cEVie34YC" 477 | }, 478 | "source": [ 479 | "plot_cloud_neg(df)" 480 | ], 481 | "execution_count": null, 482 | "outputs": [] 483 | }, 484 | { 485 | "cell_type": "code", 486 | "metadata": { 487 | "id": "DGQCXBe_36xK" 488 | }, 489 | "source": [ 490 | "plot_cloud_pos(df)" 491 | ], 492 | "execution_count": null, 493 | "outputs": [] 494 | }, 495 | { 496 | "cell_type": "code", 497 | "metadata": { 498 | "id": "gs4YMc8z43T6" 499 | }, 500 | "source": [ 501 | "import nltk \n", 502 | "\n", 503 | "frase =['el candidato es bueno', 'el candidato es malo']\n", 504 | "\n", 505 | "frequencia = nltk.FreqDist(frase)" 506 | ], 507 | "execution_count": null, 508 | "outputs": [] 509 | }, 510 
| { 511 | "cell_type": "code", 512 | "metadata": { 513 | "id": "n83k7XwW4-Hb" 514 | }, 515 | "source": [ 516 | "frequencia" 517 | ], 518 | "execution_count": null, 519 | "outputs": [] 520 | }, 521 | { 522 | "cell_type": "code", 523 | "metadata": { 524 | "id": "ne-BxXgz57Dr" 525 | }, 526 | "source": [ 527 | "from nltk import tokenize" 528 | ], 529 | "execution_count": null, 530 | "outputs": [] 531 | }, 532 | { 533 | "cell_type": "code", 534 | "metadata": { 535 | "id": "_eSYNSIN59ej" 536 | }, 537 | "source": [ 538 | "frase2 = 'el candidato quiere aprender LNP!'\n", 539 | "\n", 540 | "token_espacio = tokenize.WhitespaceTokenizer()\n", 541 | "\n", 542 | "frase_token = token_espacio.tokenize(frase2)" 543 | ], 544 | "execution_count": null, 545 | "outputs": [] 546 | }, 547 | { 548 | "cell_type": "code", 549 | "metadata": { 550 | "id": "9J2cg7wO6Ayu" 551 | }, 552 | "source": [ 553 | "print(frase_token)" 554 | ], 555 | "execution_count": null, 556 | "outputs": [] 557 | }, 558 | { 559 | "cell_type": "code", 560 | "metadata": { 561 | "id": "9XTPQNph6c8y" 562 | }, 563 | "source": [ 564 | "frase_token = token_espacio.tokenize(palabras)\n", 565 | "\n", 566 | "frequencia = nltk.FreqDist(frase_token)\n", 567 | "\n", 568 | "df_frequencia = pd.DataFrame({'palabra':list(frequencia.keys()), 'frequencia':list(frequencia.values())})" 569 | ], 570 | "execution_count": null, 571 | "outputs": [] 572 | }, 573 | { 574 | "cell_type": "code", 575 | "metadata": { 576 | "id": "mSTKKEOE6g3b" 577 | }, 578 | "source": [ 579 | "df_frequencia.head()" 580 | ], 581 | "execution_count": null, 582 | "outputs": [] 583 | }, 584 | { 585 | "cell_type": "code", 586 | "metadata": { 587 | "id": "mIXGvBxx7Rzt" 588 | }, 589 | "source": [ 590 | "def histo_barras(texto, columna_texto, cantidad): # bar chart of the `cantidad` most frequent whitespace-separated tokens in texto[columna_texto]\n", 591 | " import seaborn as sns # local import so `sns` is defined even if no earlier cell imported seaborn\n", 592 | " plt.figure(figsize=(12,8))\n", 593 | "\n", 594 | " palabras = ' '.join([palabras for palabras in texto[columna_texto]]) # read the requested column of the argument, not the global df\n", 595 | "\n", 596 | " frase_token = token_espacio.tokenize(palabras)\n", 597 | 
"\n", 598 | " frequencia = nltk.FreqDist(frase_token)\n", 599 | "\n", 600 | " df_frequencia = pd.DataFrame({'palabra':list(frequencia.keys()), 'frequencia':list(frequencia.values())}) \n", 601 | "\n", 602 | " df_frequencia = df_frequencia.nlargest(columns='frequencia', n = cantidad)\n", 603 | "\n", 604 | " ax = sns.barplot(data=df_frequencia, x='palabra', y='frequencia', color='gray')\n", 605 | "\n", 606 | " ax.set(ylabel = 'conteo')\n", 607 | "\n", 608 | " plt.show()" 609 | ], 610 | "execution_count": null, 611 | "outputs": [] 612 | }, 613 | { 614 | "cell_type": "code", 615 | "metadata": { 616 | "id": "vopAPhqw7U1O" 617 | }, 618 | "source": [ 619 | "histo_barras(df, \"texto\", 30)" 620 | ], 621 | "execution_count": null, 622 | "outputs": [] 623 | }, 624 | { 625 | "cell_type": "code", 626 | "metadata": { 627 | "id": "s7YZ-z3J70VP" 628 | }, 629 | "source": [ 630 | "from nltk.corpus import stopwords" 631 | ], 632 | "execution_count": null, 633 | "outputs": [] 634 | }, 635 | { 636 | "cell_type": "code", 637 | "metadata": { 638 | "id": "2xAQ00BT8J-s" 639 | }, 640 | "source": [ 641 | "irrelevantes = nltk.corpus.stopwords.words('spanish')\n", 642 | "\n", 643 | "frase_process = list()\n", 644 | "\n", 645 | "for opinion in df.texto:\n", 646 | " nueva_frase = list()\n", 647 | " palabras_texto = token_espacio.tokenize(opinion)\n", 648 | " for palabra in palabras_texto:\n", 649 | " if palabra not in irrelevantes:\n", 650 | " nueva_frase.append(palabra)\n", 651 | " frase_process.append(' '.join(nueva_frase))\n", 652 | "\n", 653 | "df['tratamiento_1'] = frase_process " 654 | ], 655 | "execution_count": null, 656 | "outputs": [] 657 | }, 658 | { 659 | "cell_type": "code", 660 | "metadata": { 661 | "id": "xOdZpy7i8Mki" 662 | }, 663 | "source": [ 664 | "df.head()" 665 | ], 666 | "execution_count": null, 667 | "outputs": [] 668 | }, 669 | { 670 | "cell_type": "code", 671 | "metadata": { 672 | "id": "9gS5WohD8PlL" 673 | }, 674 | "source": [ 675 | "clasificador(df, \"tratamiento_1\", 
\"sentiment\")" 676 | ], 677 | "execution_count": null, 678 | "outputs": [] 679 | } 680 | ] 681 | } -------------------------------------------------------------------------------- /aula5.4/aula5_4.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "aula5.4.ipynb", 7 | "provenance": [], 8 | "collapsed_sections": [] 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | }, 14 | "language_info": { 15 | "name": "python" 16 | } 17 | }, 18 | "cells": [ 19 | { 20 | "cell_type": "code", 21 | "metadata": { 22 | "id": "dr-kjMPUrfCn" 23 | }, 24 | "source": [ 25 | "import pandas as pd\n", 26 | "pd.set_option('display.max_rows', None)\n", 27 | "pd.set_option('display.max_columns', None)\n", 28 | "pd.set_option('display.width', None)\n", 29 | "pd.set_option('display.max_colwidth', None)\n", 30 | "import warnings\n", 31 | "warnings.filterwarnings(\"ignore\")" 32 | ], 33 | "execution_count": null, 34 | "outputs": [] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "metadata": { 39 | "id": "M8NSaAocriEo" 40 | }, 41 | "source": [ 42 | "df = pd.read_csv('/content/sample_data/colombian_elections.csv')" 43 | ], 44 | "execution_count": null, 45 | "outputs": [] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "metadata": { 50 | "id": "_HGAVlSSri8M" 51 | }, 52 | "source": [ 53 | "df = df[['tweetText','polarity_value']].rename(columns={\"tweetText\":\"texto\",\"polarity_value\":\"sentimiento\"})" 54 | ], 55 | "execution_count": null, 56 | "outputs": [] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "metadata": { 61 | "id": "AVilnlMOrlnb" 62 | }, 63 | "source": [ 64 | "df.head()" 65 | ], 66 | "execution_count": null, 67 | "outputs": [] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "metadata": { 72 | "id": "4ik-aoQQrpO_" 73 | }, 74 | "source": [ 75 | "df['sentimiento'].value_counts()" 76 | ], 77 | "execution_count": null, 78 | 
"outputs": [] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "metadata": { 83 | "id": "yqzEp7d7rtci" 84 | }, 85 | "source": [ 86 | "from sklearn.model_selection import train_test_split" 87 | ], 88 | "execution_count": null, 89 | "outputs": [] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "metadata": { 94 | "id": "p5TcTZ3Jrv4X" 95 | }, 96 | "source": [ 97 | "X_train, X_test, y_train,y_test = train_test_split(df.texto,df.sentimiento,random_state = 41)" 98 | ], 99 | "execution_count": null, 100 | "outputs": [] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "metadata": { 105 | "id": "9iAwVXaUryE5" 106 | }, 107 | "source": [ 108 | "X_train" 109 | ], 110 | "execution_count": null, 111 | "outputs": [] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "metadata": { 116 | "id": "clmJfHiUry8f" 117 | }, 118 | "source": [ 119 | "from sklearn.linear_model import LinearRegression\n", 120 | "from sklearn.linear_model import LogisticRegression" 121 | ], 122 | "execution_count": null, 123 | "outputs": [] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "metadata": { 128 | "id": "AzZEsFrvr29B" 129 | }, 130 | "source": [ 131 | "regression = LinearRegression()\n", 132 | "regression.fit(X_train,y_train)\n", 133 | "acc = regression.score(X_test,y_test)\n", 134 | "print(acc)" 135 | ], 136 | "execution_count": null, 137 | "outputs": [] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "metadata": { 142 | "id": "Tn_gHwp3suP0" 143 | }, 144 | "source": [ 145 | "df[2:3]" 146 | ], 147 | "execution_count": null, 148 | "outputs": [] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "metadata": { 153 | "id": "qayfItKAsw4J" 154 | }, 155 | "source": [ 156 | "df['sentiment'] = df['sentimiento'].replace([\"N\",\"P\"],[0,1])" 157 | ], 158 | "execution_count": null, 159 | "outputs": [] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "metadata": { 164 | "id": "580QmbeQsy_e" 165 | }, 166 | "source": [ 167 | "df.shape" 168 | ], 169 | "execution_count": null, 170 | "outputs": [] 171 | }, 172 | { 173 | 
"cell_type": "code", 174 | "metadata": { 175 | "id": "-YeouQvKs0_E" 176 | }, 177 | "source": [ 178 | "df.head()" 179 | ], 180 | "execution_count": null, 181 | "outputs": [] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "metadata": { 186 | "id": "WL_hJo-Ate11" 187 | }, 188 | "source": [ 189 | "from sklearn.feature_extraction.text import CountVectorizer" 190 | ], 191 | "execution_count": null, 192 | "outputs": [] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "metadata": { 197 | "id": "Wp4RJV5ztg3R" 198 | }, 199 | "source": [ 200 | "texto = [\"Las propuestas son buenas\", \"Las propuestas son malas\"]\n", 201 | "\n", 202 | "vectorizer = CountVectorizer(lowercase=False)\n", 203 | "\n", 204 | "bag_of_words = vectorizer.fit_transform(texto)" 205 | ], 206 | "execution_count": null, 207 | "outputs": [] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "metadata": { 212 | "id": "c7siuIvcthlR" 213 | }, 214 | "source": [ 215 | "print(bag_of_words.shape)" 216 | ], 217 | "execution_count": null, 218 | "outputs": [] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "metadata": { 223 | "id": "5bxaOPNEtj0d" 224 | }, 225 | "source": [ 226 | "vectorizer.get_feature_names()" 227 | ], 228 | "execution_count": null, 229 | "outputs": [] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "metadata": { 234 | "id": "n-YN70C_tmyC" 235 | }, 236 | "source": [ 237 | "matrix = pd.DataFrame.sparse.from_spmatrix(bag_of_words, columns = vectorizer.get_feature_names())" 238 | ], 239 | "execution_count": null, 240 | "outputs": [] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "metadata": { 245 | "id": "2KlXL09QtpWb" 246 | }, 247 | "source": [ 248 | "matrix" 249 | ], 250 | "execution_count": null, 251 | "outputs": [] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "metadata": { 256 | "id": "WDrvTZEJvk8g" 257 | }, 258 | "source": [ 259 | "vectorizer2 = CountVectorizer(lowercase=False, max_features = 100)\n", 260 | "\n", 261 | "bag_of_words2 = vectorizer2.fit_transform(df.texto)" 262 | ], 
263 | "execution_count": null, 264 | "outputs": [] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "metadata": { 269 | "id": "Nk0Wz4yvvnBQ" 270 | }, 271 | "source": [ 272 | "print(bag_of_words2.shape)" 273 | ], 274 | "execution_count": null, 275 | "outputs": [] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "metadata": { 280 | "id": "JPMrZQbwvpU-" 281 | }, 282 | "source": [ 283 | "X_train, X_test, y_train,y_test = train_test_split(bag_of_words2,df.sentiment,random_state = 41)" 284 | ], 285 | "execution_count": null, 286 | "outputs": [] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "metadata": { 291 | "id": "e7RCEk-2vrgX" 292 | }, 293 | "source": [ 294 | "regression = LinearRegression()\n", 295 | "regression.fit(X_train,y_train)\n", 296 | "acc = regression.score(X_test,y_test)\n", 297 | "print(acc)" 298 | ], 299 | "execution_count": null, 300 | "outputs": [] 301 | }, 302 | { 303 | "cell_type": "code", 304 | "metadata": { 305 | "id": "z4OAc1opvuFd" 306 | }, 307 | "source": [ 308 | "regression = LogisticRegression()\n", 309 | "regression.fit(X_train,y_train)\n", 310 | "acc = regression.score(X_test,y_test)\n", 311 | "print(acc)" 312 | ], 313 | "execution_count": null, 314 | "outputs": [] 315 | }, 316 | { 317 | "cell_type": "code", 318 | "metadata": { 319 | "id": "WSk1i52HxTm7" 320 | }, 321 | "source": [ 322 | "def clasificador(texto, columna_texto, categoria):\n", 323 | " vectorizer = CountVectorizer(lowercase=False, max_features = 400)\n", 324 | " bag_of_words = vectorizer.fit_transform(texto[columna_texto])\n", 325 | " X_train, X_test, y_train,y_test = train_test_split(bag_of_words,texto[categoria],random_state = 41)\n", 326 | " regression = LogisticRegression()\n", 327 | " regression.fit(X_train,y_train)\n", 328 | " return regression.score(X_test,y_test)" 329 | ], 330 | "execution_count": null, 331 | "outputs": [] 332 | }, 333 | { 334 | "cell_type": "code", 335 | "metadata": { 336 | "id": "SWRq_PrBxXBA" 337 | }, 338 | "source": [ 339 | "clasificador(df, 
\"texto\", \"sentiment\")" 340 | ], 341 | "execution_count": null, 342 | "outputs": [] 343 | }, 344 | { 345 | "cell_type": "code", 346 | "metadata": { 347 | "id": "PLApD0-ax9S2" 348 | }, 349 | "source": [ 350 | "from wordcloud import WordCloud" 351 | ], 352 | "execution_count": null, 353 | "outputs": [] 354 | }, 355 | { 356 | "cell_type": "code", 357 | "metadata": { 358 | "id": "Akv_WP8Jx_wQ" 359 | }, 360 | "source": [ 361 | "palabras = ' '.join([palabras for palabras in df['texto']])" 362 | ], 363 | "execution_count": null, 364 | "outputs": [] 365 | }, 366 | { 367 | "cell_type": "code", 368 | "metadata": { 369 | "id": "ohhipZYbyCBs" 370 | }, 371 | "source": [ 372 | "palabras" 373 | ], 374 | "execution_count": null, 375 | "outputs": [] 376 | }, 377 | { 378 | "cell_type": "code", 379 | "metadata": { 380 | "id": "RS0fdGKdyEL2" 381 | }, 382 | "source": [ 383 | "len(palabras)" 384 | ], 385 | "execution_count": null, 386 | "outputs": [] 387 | }, 388 | { 389 | "cell_type": "code", 390 | "metadata": { 391 | "id": "W18Mv1_gyHPU" 392 | }, 393 | "source": [ 394 | "import matplotlib.pyplot as plt\n", 395 | "%matplotlib inline \n", 396 | "\n", 397 | "def plot_cloud(wordcloud):\n", 398 | " plt.figure(figsize=(10,5))\n", 399 | " plt.imshow(wordcloud, interpolation = 'bilinear')\n", 400 | " plt.axis('off')" 401 | ], 402 | "execution_count": null, 403 | "outputs": [] 404 | }, 405 | { 406 | "cell_type": "code", 407 | "metadata": { 408 | "id": "EaKziQPzyKS9" 409 | }, 410 | "source": [ 411 | "wordcloud = WordCloud(width=800, height=500, collocations=False).generate(palabras)\n", 412 | "plot_cloud(wordcloud)" 413 | ], 414 | "execution_count": null, 415 | "outputs": [] 416 | }, 417 | { 418 | "cell_type": "code", 419 | "metadata": { 420 | "id": "8GV4mEBK2N1u" 421 | }, 422 | "source": [ 423 | "df_pos = df.query(\"sentimiento == 'P'\")" 424 | ], 425 | "execution_count": null, 426 | "outputs": [] 427 | }, 428 | { 429 | "cell_type": "code", 430 | "metadata": { 431 | "id": "KG8ydxdZ2OYh" 432 
| }, 433 | "source": [ 434 | "df_pos.head()" 435 | ], 436 | "execution_count": null, 437 | "outputs": [] 438 | }, 439 | { 440 | "cell_type": "code", 441 | "metadata": { 442 | "id": "GdU3bMPg2Qnl" 443 | }, 444 | "source": [ 445 | "def plot_cloud_neg(texto): # word cloud built from the rows of DataFrame `texto` whose sentimiento is 'N'\n", 446 | " df_neg = texto.query(\"sentimiento == 'N'\") # filter the argument instead of the global df\n", 447 | " palabras = ' '.join([palabras for palabras in df_neg['texto']])\n", 448 | " wordcloud = WordCloud(width=800, height=500, collocations=False).generate(palabras)\n", 449 | " plt.figure(figsize=(10,5))\n", 450 | " plt.imshow(wordcloud, interpolation = 'bilinear')\n", 451 | " plt.axis('off')" 452 | ], 453 | "execution_count": null, 454 | "outputs": [] 455 | }, 456 | { 457 | "cell_type": "code", 458 | "metadata": { 459 | "id": "FfoFWgou2TS6" 460 | }, 461 | "source": [ 462 | "def plot_cloud_pos(texto): # word cloud built from the rows of DataFrame `texto` whose sentimiento is 'P'\n", 463 | " df_pos = texto.query(\"sentimiento == 'P'\") # filter the argument instead of the global df\n", 464 | " palabras = ' '.join([palabras for palabras in df_pos['texto']])\n", 465 | " wordcloud = WordCloud(width=800, height=500, collocations=False).generate(palabras)\n", 466 | " plt.figure(figsize=(10,5))\n", 467 | " plt.imshow(wordcloud, interpolation = 'bilinear')\n", 468 | " plt.axis('off')" 469 | ], 470 | "execution_count": null, 471 | "outputs": [] 472 | }, 473 | { 474 | "cell_type": "code", 475 | "metadata": { 476 | "id": "kx_cEVie34YC" 477 | }, 478 | "source": [ 479 | "plot_cloud_neg(df)" 480 | ], 481 | "execution_count": null, 482 | "outputs": [] 483 | }, 484 | { 485 | "cell_type": "code", 486 | "metadata": { 487 | "id": "DGQCXBe_36xK" 488 | }, 489 | "source": [ 490 | "plot_cloud_pos(df)" 491 | ], 492 | "execution_count": null, 493 | "outputs": [] 494 | }, 495 | { 496 | "cell_type": "code", 497 | "metadata": { 498 | "id": "gs4YMc8z43T6" 499 | }, 500 | "source": [ 501 | "import nltk \n", 502 | "\n", 503 | "frase =['el candidato es bueno', 'el candidato es malo']\n", 504 | "\n", 505 | "frequencia = nltk.FreqDist(frase)" 506 | ], 507 | "execution_count": null, 508 | "outputs": [] 509 | }, 510 
| { 511 | "cell_type": "code", 512 | "metadata": { 513 | "id": "n83k7XwW4-Hb" 514 | }, 515 | "source": [ 516 | "frequencia" 517 | ], 518 | "execution_count": null, 519 | "outputs": [] 520 | }, 521 | { 522 | "cell_type": "code", 523 | "metadata": { 524 | "id": "ne-BxXgz57Dr" 525 | }, 526 | "source": [ 527 | "from nltk import tokenize" 528 | ], 529 | "execution_count": null, 530 | "outputs": [] 531 | }, 532 | { 533 | "cell_type": "code", 534 | "metadata": { 535 | "id": "_eSYNSIN59ej" 536 | }, 537 | "source": [ 538 | "frase2 = 'el candidato quiere aprender LNP!'\n", 539 | "\n", 540 | "token_espacio = tokenize.WhitespaceTokenizer()\n", 541 | "\n", 542 | "frase_token = token_espacio.tokenize(frase2)" 543 | ], 544 | "execution_count": null, 545 | "outputs": [] 546 | }, 547 | { 548 | "cell_type": "code", 549 | "metadata": { 550 | "id": "9J2cg7wO6Ayu" 551 | }, 552 | "source": [ 553 | "print(frase_token)" 554 | ], 555 | "execution_count": null, 556 | "outputs": [] 557 | }, 558 | { 559 | "cell_type": "code", 560 | "metadata": { 561 | "id": "9XTPQNph6c8y" 562 | }, 563 | "source": [ 564 | "frase_token = token_espacio.tokenize(palabras)\n", 565 | "\n", 566 | "frequencia = nltk.FreqDist(frase_token)\n", 567 | "\n", 568 | "df_frequencia = pd.DataFrame({'palabra':list(frequencia.keys()), 'frequencia':list(frequencia.values())})" 569 | ], 570 | "execution_count": null, 571 | "outputs": [] 572 | }, 573 | { 574 | "cell_type": "code", 575 | "metadata": { 576 | "id": "mSTKKEOE6g3b" 577 | }, 578 | "source": [ 579 | "df_frequencia.head()" 580 | ], 581 | "execution_count": null, 582 | "outputs": [] 583 | }, 584 | { 585 | "cell_type": "code", 586 | "metadata": { 587 | "id": "mIXGvBxx7Rzt" 588 | }, 589 | "source": [ 590 | "def histo_barras(texto, columna_texto, cantidad): # bar chart of the `cantidad` most frequent whitespace-separated tokens in texto[columna_texto]\n", 591 | " import seaborn as sns # local import so `sns` is defined even if no earlier cell imported seaborn\n", 592 | " plt.figure(figsize=(12,8))\n", 593 | "\n", 594 | " palabras = ' '.join([palabras for palabras in texto[columna_texto]]) # read the requested column of the argument, not the global df\n", 595 | "\n", 596 | " frase_token = token_espacio.tokenize(palabras)\n", 597 | 
"\n", 598 | " frequencia = nltk.FreqDist(frase_token)\n", 599 | "\n", 600 | " df_frequencia = pd.DataFrame({'palabra':list(frequencia.keys()), 'frequencia':list(frequencia.values())}) \n", 601 | "\n", 602 | " df_frequencia = df_frequencia.nlargest(columns='frequencia', n = cantidad)\n", 603 | "\n", 604 | " ax = sns.barplot(data=df_frequencia, x='palabra', y='frequencia', color='gray')\n", 605 | "\n", 606 | " ax.set(ylabel = 'conteo')\n", 607 | "\n", 608 | " plt.show()" 609 | ], 610 | "execution_count": null, 611 | "outputs": [] 612 | }, 613 | { 614 | "cell_type": "code", 615 | "metadata": { 616 | "id": "vopAPhqw7U1O" 617 | }, 618 | "source": [ 619 | "histo_barras(df, \"texto\", 30)" 620 | ], 621 | "execution_count": null, 622 | "outputs": [] 623 | }, 624 | { 625 | "cell_type": "code", 626 | "metadata": { 627 | "id": "s7YZ-z3J70VP" 628 | }, 629 | "source": [ 630 | "from nltk.corpus import stopwords" 631 | ], 632 | "execution_count": null, 633 | "outputs": [] 634 | }, 635 | { 636 | "cell_type": "code", 637 | "metadata": { 638 | "id": "2xAQ00BT8J-s" 639 | }, 640 | "source": [ 641 | "irrelevantes = nltk.corpus.stopwords.words('spanish')\n", 642 | "\n", 643 | "frase_process = list()\n", 644 | "\n", 645 | "for opinion in df.texto:\n", 646 | " nueva_frase = list()\n", 647 | " palabras_texto = token_espacio.tokenize(opinion)\n", 648 | " for palabra in palabras_texto:\n", 649 | " if palabra not in irrelevantes:\n", 650 | " nueva_frase.append(palabra)\n", 651 | " frase_process.append(' '.join(nueva_frase))\n", 652 | "\n", 653 | "df['tratamiento_1'] = frase_process " 654 | ], 655 | "execution_count": null, 656 | "outputs": [] 657 | }, 658 | { 659 | "cell_type": "code", 660 | "metadata": { 661 | "id": "xOdZpy7i8Mki" 662 | }, 663 | "source": [ 664 | "df.head()" 665 | ], 666 | "execution_count": null, 667 | "outputs": [] 668 | }, 669 | { 670 | "cell_type": "code", 671 | "metadata": { 672 | "id": "9gS5WohD8PlL" 673 | }, 674 | "source": [ 675 | "clasificador(df, \"tratamiento_1\", 
\"sentiment\")" 676 | ], 677 | "execution_count": null, 678 | "outputs": [] 679 | } 680 | ] 681 | } --------------------------------------------------------------------------------