├── Session1 ├── concat year month day and product permutation.ipynb ├── df.clipboard_convert table to df.ipynb ├── dropna threshold percentage row and column.ipynb ├── filter by groupby.ipynb ├── get rid of Unnamed in df.ipynb ├── groupby agg multindex drop level.ipynb ├── insert and change column order in pandas.ipynb └── regex eliminate string and convert string to float.ipynb ├── Session2 ├── If_Reads_Numerical_Values_As_Object_-_pandas.read_csv.ipynb ├── Quick_One-Hot_Encoding_with_Pandas.ipynb ├── Quick_Web_Scraping_with_Pandas.ipynb ├── create others values in pandas column.ipynb ├── map function and assign numbers to category (factorize) and boolean.ipynb └── select columns by slicing pandas.ipynb ├── Session3 ├── Deal_with_zip_files.ipynb ├── Parquet_and_Pickle_instead_of_CSV.ipynb ├── count words in row.ipynb ├── pd.cut pd.qcut.ipynb ├── query_Dataframe.ipynb └── transform sum pandas column.ipynb ├── Session4 ├── load autotime for every cell runtime and select_dtypes.ipynb └── swifter apply fastest run.ipynb └── Session5 ├── pd.to_numeric errors coerce.ipynb └── random create dataframe.ipynb /Session1/concat year month day and product permutation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "id": "3d1a84ee", 7 | "metadata": {}, 8 | "outputs": [ 9 | { 10 | "data": { 11 | "text/html": [ 12 | "
\n", 13 | "\n", 26 | "\n", 27 | " \n", 28 | " \n", 29 | " \n", 30 | " \n", 31 | " \n", 32 | " \n", 33 | " \n", 34 | " \n", 35 | " \n", 36 | " \n", 37 | " \n", 38 | " \n", 39 | " \n", 40 | " \n", 41 | " \n", 42 | " \n", 43 | " \n", 44 | " \n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | "
yearmonthdaydata
2017-01-01201711-0.773084
2017-01-02201712-0.239255
2017-01-032017130.557760
2017-02-01201721-0.952520
2017-02-022017220.269137
2017-02-03201723-0.581972
2016-01-01201611-0.724717
2016-01-022016120.886955
2016-01-03201613-1.272199
2016-02-01201621-0.165526
2016-02-022016222.148330
2016-02-03201623-0.091643
\n", 123 | "
" 124 | ], 125 | "text/plain": [ 126 | " year month day data\n", 127 | "2017-01-01 2017 1 1 -0.773084\n", 128 | "2017-01-02 2017 1 2 -0.239255\n", 129 | "2017-01-03 2017 1 3 0.557760\n", 130 | "2017-02-01 2017 2 1 -0.952520\n", 131 | "2017-02-02 2017 2 2 0.269137\n", 132 | "2017-02-03 2017 2 3 -0.581972\n", 133 | "2016-01-01 2016 1 1 -0.724717\n", 134 | "2016-01-02 2016 1 2 0.886955\n", 135 | "2016-01-03 2016 1 3 -1.272199\n", 136 | "2016-02-01 2016 2 1 -0.165526\n", 137 | "2016-02-02 2016 2 2 2.148330\n", 138 | "2016-02-03 2016 2 3 -0.091643" 139 | ] 140 | }, 141 | "execution_count": 2, 142 | "metadata": {}, 143 | "output_type": "execute_result" 144 | } 145 | ], 146 | "source": [ 147 | "import numpy as np\n", 148 | "import pandas as pd\n", 149 | "\n", 150 | "from itertools import product\n", 151 | "\n", 152 | "datecols = ['year', 'month', 'day']\n", 153 | "\n", 154 | "df = pd.DataFrame(list(product([2017, 2016], [1, 2], [1, 2, 3])),\n", 155 | " columns=datecols)\n", 156 | "\n", 157 | "df['data'] = np.random.randn(len(df))\n", 158 | "\n", 159 | "df.index = pd.to_datetime(df[datecols])\n", 160 | "df" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": null, 166 | "id": "43f7309f", 167 | "metadata": {}, 168 | "outputs": [], 169 | "source": [] 170 | } 171 | ], 172 | "metadata": { 173 | "kernelspec": { 174 | "display_name": "Python 3 (ipykernel)", 175 | "language": "python", 176 | "name": "python3" 177 | }, 178 | "language_info": { 179 | "codemirror_mode": { 180 | "name": "ipython", 181 | "version": 3 182 | }, 183 | "file_extension": ".py", 184 | "mimetype": "text/x-python", 185 | "name": "python", 186 | "nbconvert_exporter": "python", 187 | "pygments_lexer": "ipython3", 188 | "version": "3.9.13" 189 | } 190 | }, 191 | "nbformat": 4, 192 | "nbformat_minor": 5 193 | } 194 | -------------------------------------------------------------------------------- /Session1/df.clipboard_convert table to df.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "06e38720", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "year\tmonth\tday\tdata\n", 11 | "2017-01-01\t2017\t1\t1\t-1.343786\n", 12 | "2017-01-02\t2017\t1\t2\t-0.314145\n", 13 | "2017-01-03\t2017\t1\t3\t0.639521\n", 14 | "2017-02-01\t2017\t2\t1\t0.355203\n", 15 | "2017-02-02\t2017\t2\t2\t-0.959005" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 1, 21 | "id": "9e2ad1f1", 22 | "metadata": {}, 23 | "outputs": [ 24 | { 25 | "data": { 26 | "text/html": [ 27 | "
\n", 28 | "\n", 41 | "\n", 42 | " \n", 43 | " \n", 44 | " \n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | "
yearmonthdaydata
2017-01-01201711-1.343786
2017-01-02201712-0.314145
2017-01-032017130.639521
2017-02-012017210.355203
2017-02-02201722-0.959005
\n", 89 | "
" 90 | ], 91 | "text/plain": [ 92 | " year month day data\n", 93 | "2017-01-01 2017 1 1 -1.343786\n", 94 | "2017-01-02 2017 1 2 -0.314145\n", 95 | "2017-01-03 2017 1 3 0.639521\n", 96 | "2017-02-01 2017 2 1 0.355203\n", 97 | "2017-02-02 2017 2 2 -0.959005" 98 | ] 99 | }, 100 | "execution_count": 1, 101 | "metadata": {}, 102 | "output_type": "execute_result" 103 | } 104 | ], 105 | "source": [ 106 | "import pandas as pd \n", 107 | "df = pd.read_clipboard()\n", 108 | "df" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "id": "cec8dab3", 115 | "metadata": {}, 116 | "outputs": [], 117 | "source": [] 118 | } 119 | ], 120 | "metadata": { 121 | "kernelspec": { 122 | "display_name": "Python 3 (ipykernel)", 123 | "language": "python", 124 | "name": "python3" 125 | }, 126 | "language_info": { 127 | "codemirror_mode": { 128 | "name": "ipython", 129 | "version": 3 130 | }, 131 | "file_extension": ".py", 132 | "mimetype": "text/x-python", 133 | "name": "python", 134 | "nbconvert_exporter": "python", 135 | "pygments_lexer": "ipython3", 136 | "version": "3.9.13" 137 | } 138 | }, 139 | "nbformat": 4, 140 | "nbformat_minor": 5 141 | } 142 | -------------------------------------------------------------------------------- /Session1/dropna threshold percentage row and column.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 10, 6 | "id": "36609d75", 7 | "metadata": {}, 8 | "outputs": [ 9 | { 10 | "data": { 11 | "text/html": [ 12 | "
\n", 13 | "\n", 26 | "\n", 27 | " \n", 28 | " \n", 29 | " \n", 30 | " \n", 31 | " \n", 32 | " \n", 33 | " \n", 34 | " \n", 35 | " \n", 36 | " \n", 37 | " \n", 38 | " \n", 39 | " \n", 40 | " \n", 41 | " \n", 42 | " \n", 43 | " \n", 44 | " \n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | "
ABC
02.02.03.0
16.02.0NaN
2NaNNaNNaN
39.05.0NaN
49.08.0NaN
\n", 68 | "
" 69 | ], 70 | "text/plain": [ 71 | " A B C\n", 72 | "0 2.0 2.0 3.0\n", 73 | "1 6.0 2.0 NaN\n", 74 | "2 NaN NaN NaN\n", 75 | "3 9.0 5.0 NaN\n", 76 | "4 9.0 8.0 NaN" 77 | ] 78 | }, 79 | "execution_count": 10, 80 | "metadata": {}, 81 | "output_type": "execute_result" 82 | } 83 | ], 84 | "source": [ 85 | "import pandas as pd\n", 86 | "import numpy as np\n", 87 | "df = pd.DataFrame(dict(A=[2,6,np.nan,9,9],\n", 88 | " B=[2,2,np.nan,5,8], \n", 89 | " C=[3,np.nan,np.nan,np.nan,np.nan]))\n", 90 | "df" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": 28, 96 | "id": "1955b2ae", 97 | "metadata": {}, 98 | "outputs": [ 99 | { 100 | "data": { 101 | "text/html": [ 102 | "
\n", 103 | "\n", 116 | "\n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | "
0
1
2
3
4
\n", 140 | "
" 141 | ], 142 | "text/plain": [ 143 | "Empty DataFrame\n", 144 | "Columns: []\n", 145 | "Index: [0, 1, 2, 3, 4]" 146 | ] 147 | }, 148 | "execution_count": 28, 149 | "metadata": {}, 150 | "output_type": "execute_result" 151 | } 152 | ], 153 | "source": [ 154 | "df.dropna(thresh=5, axis='columns')" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": 8, 160 | "id": "defe7bb4", 161 | "metadata": {}, 162 | "outputs": [ 163 | { 164 | "data": { 165 | "text/plain": [ 166 | "3" 167 | ] 168 | }, 169 | "execution_count": 8, 170 | "metadata": {}, 171 | "output_type": "execute_result" 172 | } 173 | ], 174 | "source": [ 175 | "df.shape[1]" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": 6, 181 | "id": "bc65a8ed", 182 | "metadata": {}, 183 | "outputs": [ 184 | { 185 | "data": { 186 | "text/html": [ 187 | "
\n", 188 | "\n", 201 | "\n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | "
ABC
02.02.03.0
\n", 219 | "
" 220 | ], 221 | "text/plain": [ 222 | " A B C\n", 223 | "0 2.0 2.0 3.0" 224 | ] 225 | }, 226 | "execution_count": 6, 227 | "metadata": {}, 228 | "output_type": "execute_result" 229 | } 230 | ], 231 | "source": [ 232 | "df.dropna(thresh=df.shape[1]*0.8, axis='rows')\n" 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": 9, 238 | "id": "d713a4f7", 239 | "metadata": {}, 240 | "outputs": [ 241 | { 242 | "data": { 243 | "text/plain": [ 244 | "5" 245 | ] 246 | }, 247 | "execution_count": 9, 248 | "metadata": {}, 249 | "output_type": "execute_result" 250 | } 251 | ], 252 | "source": [ 253 | "df.shape[0]" 254 | ] 255 | }, 256 | { 257 | "cell_type": "code", 258 | "execution_count": 24, 259 | "id": "6e9159be", 260 | "metadata": {}, 261 | "outputs": [ 262 | { 263 | "data": { 264 | "text/html": [ 265 | "
\n", 266 | "\n", 279 | "\n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | "
ABC
02.02.03.0
16.02.0NaN
2NaNNaNNaN
39.05.0NaN
49.08.0NaN
\n", 321 | "
" 322 | ], 323 | "text/plain": [ 324 | " A B C\n", 325 | "0 2.0 2.0 3.0\n", 326 | "1 6.0 2.0 NaN\n", 327 | "2 NaN NaN NaN\n", 328 | "3 9.0 5.0 NaN\n", 329 | "4 9.0 8.0 NaN" 330 | ] 331 | }, 332 | "execution_count": 24, 333 | "metadata": {}, 334 | "output_type": "execute_result" 335 | } 336 | ], 337 | "source": [ 338 | "df.dropna(thresh=1, axis='columns')" 339 | ] 340 | }, 341 | { 342 | "cell_type": "code", 343 | "execution_count": null, 344 | "id": "b249cdca", 345 | "metadata": {}, 346 | "outputs": [], 347 | "source": [ 348 | "import pandas as pd\n", 349 | "df = pd.DataFrame(dict(A=[2,6,3],\n", 350 | " B=[2,2,6], \n", 351 | " C=[3,2,3]))\n", 352 | "df.to_csv(\"datas.csv\",index=False)\n", 353 | "\n", 354 | "df = pd.read_csv(\"datas.csv\")\n", 355 | "df" 356 | ] 357 | }, 358 | { 359 | "cell_type": "code", 360 | "execution_count": null, 361 | "id": "2782b875", 362 | "metadata": {}, 363 | "outputs": [], 364 | "source": [ 365 | "df = pd.read_csv(\"datas.csv\", index_col=False)\n", 366 | "df" 367 | ] 368 | } 369 | ], 370 | "metadata": { 371 | "kernelspec": { 372 | "display_name": "Python 3 (ipykernel)", 373 | "language": "python", 374 | "name": "python3" 375 | }, 376 | "language_info": { 377 | "codemirror_mode": { 378 | "name": "ipython", 379 | "version": 3 380 | }, 381 | "file_extension": ".py", 382 | "mimetype": "text/x-python", 383 | "name": "python", 384 | "nbconvert_exporter": "python", 385 | "pygments_lexer": "ipython3", 386 | "version": "3.9.13" 387 | } 388 | }, 389 | "nbformat": 4, 390 | "nbformat_minor": 5 391 | } 392 | -------------------------------------------------------------------------------- /Session1/filter by groupby.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 6, 6 | "id": "f73bff90", 7 | "metadata": {}, 8 | "outputs": [ 9 | { 10 | "data": { 11 | "text/html": [ 12 | "
\n", 13 | "\n", 26 | "\n", 27 | " \n", 28 | " \n", 29 | " \n", 30 | " \n", 31 | " \n", 32 | " \n", 33 | " \n", 34 | " \n", 35 | " \n", 36 | " \n", 37 | " \n", 38 | " \n", 39 | " \n", 40 | " \n", 41 | " \n", 42 | " \n", 43 | " \n", 44 | " \n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | "
ABC
02.02.03.0
16.02.02.0
2NaNNaN3.0
39.05.06.0
49.05.0NaN
\n", 68 | "
" 69 | ], 70 | "text/plain": [ 71 | " A B C\n", 72 | "0 2.0 2.0 3.0\n", 73 | "1 6.0 2.0 2.0\n", 74 | "2 NaN NaN 3.0\n", 75 | "3 9.0 5.0 6.0\n", 76 | "4 9.0 5.0 NaN" 77 | ] 78 | }, 79 | "execution_count": 6, 80 | "metadata": {}, 81 | "output_type": "execute_result" 82 | } 83 | ], 84 | "source": [ 85 | "import pandas as pd\n", 86 | "import numpy as np\n", 87 | "df = pd.DataFrame(dict(A=[2,6,np.nan,9,9],\n", 88 | " B=[2,2,np.nan,5,5], \n", 89 | " C=[3,2,3,6,np.nan]))\n", 90 | "df" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": 7, 96 | "id": "b302cb0a", 97 | "metadata": {}, 98 | "outputs": [ 99 | { 100 | "data": { 101 | "text/html": [ 102 | "
\n", 103 | "\n", 116 | "\n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | "
ABC
39.05.06.0
49.05.0NaN
\n", 140 | "
" 141 | ], 142 | "text/plain": [ 143 | " A B C\n", 144 | "3 9.0 5.0 6.0\n", 145 | "4 9.0 5.0 NaN" 146 | ] 147 | }, 148 | "execution_count": 7, 149 | "metadata": {}, 150 | "output_type": "execute_result" 151 | } 152 | ], 153 | "source": [ 154 | "dfg = df.groupby([\"A\",\"B\"])\n", 155 | "dfg.get_group((9,5))" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": 8, 161 | "id": "1a3e5165", 162 | "metadata": {}, 163 | "outputs": [ 164 | { 165 | "data": { 166 | "text/html": [ 167 | "
\n", 168 | "\n", 181 | "\n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | "
ABC
39.05.06.0
49.05.0NaN
\n", 205 | "
" 206 | ], 207 | "text/plain": [ 208 | " A B C\n", 209 | "3 9.0 5.0 6.0\n", 210 | "4 9.0 5.0 NaN" 211 | ] 212 | }, 213 | "execution_count": 8, 214 | "metadata": {}, 215 | "output_type": "execute_result" 216 | } 217 | ], 218 | "source": [ 219 | "df[(df[\"A\"]==9)&(df[\"B\"]==5)]" 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": null, 225 | "id": "fff668db", 226 | "metadata": {}, 227 | "outputs": [], 228 | "source": [] 229 | } 230 | ], 231 | "metadata": { 232 | "kernelspec": { 233 | "display_name": "Python 3 (ipykernel)", 234 | "language": "python", 235 | "name": "python3" 236 | }, 237 | "language_info": { 238 | "codemirror_mode": { 239 | "name": "ipython", 240 | "version": 3 241 | }, 242 | "file_extension": ".py", 243 | "mimetype": "text/x-python", 244 | "name": "python", 245 | "nbconvert_exporter": "python", 246 | "pygments_lexer": "ipython3", 247 | "version": "3.9.13" 248 | } 249 | }, 250 | "nbformat": 4, 251 | "nbformat_minor": 5 252 | } 253 | -------------------------------------------------------------------------------- /Session1/get rid of Unnamed in df.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 4, 6 | "id": "e2491e65", 7 | "metadata": {}, 8 | "outputs": [ 9 | { 10 | "data": { 11 | "text/html": [ 12 | "
\n", 13 | "\n", 26 | "\n", 27 | " \n", 28 | " \n", 29 | " \n", 30 | " \n", 31 | " \n", 32 | " \n", 33 | " \n", 34 | " \n", 35 | " \n", 36 | " \n", 37 | " \n", 38 | " \n", 39 | " \n", 40 | " \n", 41 | " \n", 42 | " \n", 43 | " \n", 44 | " \n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | "
Unnamed: 0ABC
00223
11622
22363
\n", 60 | "
" 61 | ], 62 | "text/plain": [ 63 | " Unnamed: 0 A B C\n", 64 | "0 0 2 2 3\n", 65 | "1 1 6 2 2\n", 66 | "2 2 3 6 3" 67 | ] 68 | }, 69 | "execution_count": 4, 70 | "metadata": {}, 71 | "output_type": "execute_result" 72 | } 73 | ], 74 | "source": [ 75 | "import pandas as pd\n", 76 | "df = pd.DataFrame(dict(A=[2,6,3],\n", 77 | " B=[2,2,6], \n", 78 | " C=[3,2,3]))\n", 79 | "df.to_csv(\"datas.csv\")\n", 80 | "\n", 81 | "df = pd.read_csv(\"datas.csv\")\n", 82 | "df\n" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": 9, 88 | "id": "e2087034", 89 | "metadata": {}, 90 | "outputs": [ 91 | { 92 | "data": { 93 | "text/html": [ 94 | "
\n", 95 | "\n", 108 | "\n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | "
ABC
0223
1622
2363
\n", 138 | "
" 139 | ], 140 | "text/plain": [ 141 | " A B C\n", 142 | "0 2 2 3\n", 143 | "1 6 2 2\n", 144 | "2 3 6 3" 145 | ] 146 | }, 147 | "execution_count": 9, 148 | "metadata": {}, 149 | "output_type": "execute_result" 150 | } 151 | ], 152 | "source": [ 153 | "df = pd.read_csv(\"datas.csv\", index_col=0)\n", 154 | "df" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": null, 160 | "id": "49edc69c", 161 | "metadata": {}, 162 | "outputs": [], 163 | "source": [] 164 | } 165 | ], 166 | "metadata": { 167 | "kernelspec": { 168 | "display_name": "Python 3 (ipykernel)", 169 | "language": "python", 170 | "name": "python3" 171 | }, 172 | "language_info": { 173 | "codemirror_mode": { 174 | "name": "ipython", 175 | "version": 3 176 | }, 177 | "file_extension": ".py", 178 | "mimetype": "text/x-python", 179 | "name": "python", 180 | "nbconvert_exporter": "python", 181 | "pygments_lexer": "ipython3", 182 | "version": "3.9.13" 183 | } 184 | }, 185 | "nbformat": 4, 186 | "nbformat_minor": 5 187 | } 188 | -------------------------------------------------------------------------------- /Session1/groupby agg multindex drop level.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "d4e8e00b", 7 | "metadata": {}, 8 | "outputs": [ 9 | { 10 | "data": { 11 | "text/html": [ 12 | "
\n", 13 | "\n", 26 | "\n", 27 | " \n", 28 | " \n", 29 | " \n", 30 | " \n", 31 | " \n", 32 | " \n", 33 | " \n", 34 | " \n", 35 | " \n", 36 | " \n", 37 | " \n", 38 | " \n", 39 | " \n", 40 | " \n", 41 | " \n", 42 | " \n", 43 | " \n", 44 | " \n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | "
ABCD
015203050
120254060
\n", 53 | "
" 54 | ], 55 | "text/plain": [ 56 | " A B C D\n", 57 | "0 15 20 30 50\n", 58 | "1 20 25 40 60" 59 | ] 60 | }, 61 | "execution_count": 1, 62 | "metadata": {}, 63 | "output_type": "execute_result" 64 | } 65 | ], 66 | "source": [ 67 | "import pandas as pd\n", 68 | "d = {\"A\":[15, 20], \"B\":[20, 25], \"C\":[30 ,40], \"D\":[50, 60]}\n", 69 | "df = pd.DataFrame(d)\n", 70 | "df" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": 2, 76 | "id": "abc8e804", 77 | "metadata": {}, 78 | "outputs": [ 79 | { 80 | "name": "stdout", 81 | "output_type": "stream", 82 | "text": [ 83 | "The Problem relies on that we don't know the column name\n" 84 | ] 85 | }, 86 | { 87 | "data": { 88 | "text/html": [ 89 | "
\n", 90 | "\n", 103 | "\n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | "
meanmax
A
1520.020
2025.025
\n", 129 | "
" 130 | ], 131 | "text/plain": [ 132 | " mean max\n", 133 | "A \n", 134 | "15 20.0 20\n", 135 | "20 25.0 25" 136 | ] 137 | }, 138 | "execution_count": 2, 139 | "metadata": {}, 140 | "output_type": "execute_result" 141 | } 142 | ], 143 | "source": [ 144 | "print(\"The Problem relies on that we don't know the column name\")\n", 145 | "df.groupby(\"A\")[\"B\"].agg([\"mean\", \"max\"])" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": 3, 151 | "id": "b89eee03", 152 | "metadata": {}, 153 | "outputs": [ 154 | { 155 | "name": "stdout", 156 | "output_type": "stream", 157 | "text": [ 158 | "The Problem relies on that we have multiindex\n" 159 | ] 160 | }, 161 | { 162 | "data": { 163 | "text/html": [ 164 | "
\n", 165 | "\n", 182 | "\n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | "
B
meanmax
A
1520.020
2025.025
\n", 212 | "
" 213 | ], 214 | "text/plain": [ 215 | " B \n", 216 | " mean max\n", 217 | "A \n", 218 | "15 20.0 20\n", 219 | "20 25.0 25" 220 | ] 221 | }, 222 | "execution_count": 3, 223 | "metadata": {}, 224 | "output_type": "execute_result" 225 | } 226 | ], 227 | "source": [ 228 | "print(\"The Problem relies on that we have multiindex\")\n", 229 | "df.groupby(\"A\").agg({\"B\":[\"mean\", \"max\"]}) # .columns.droplevel()" 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": 4, 235 | "id": "c6b0212c", 236 | "metadata": {}, 237 | "outputs": [ 238 | { 239 | "data": { 240 | "text/html": [ 241 | "
\n", 242 | "\n", 255 | "\n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | "
A_meanA_max
A
1520.020
2025.025
\n", 281 | "
" 282 | ], 283 | "text/plain": [ 284 | " A_mean A_max\n", 285 | "A \n", 286 | "15 20.0 20\n", 287 | "20 25.0 25" 288 | ] 289 | }, 290 | "execution_count": 4, 291 | "metadata": {}, 292 | "output_type": "execute_result" 293 | } 294 | ], 295 | "source": [ 296 | "df.groupby(\"A\")[\"B\"].agg(A_mean = \"mean\", A_max = \"max\")" 297 | ] 298 | }, 299 | { 300 | "cell_type": "code", 301 | "execution_count": null, 302 | "id": "a2f7aa57", 303 | "metadata": {}, 304 | "outputs": [], 305 | "source": [] 306 | } 307 | ], 308 | "metadata": { 309 | "kernelspec": { 310 | "display_name": "Python 3 (ipykernel)", 311 | "language": "python", 312 | "name": "python3" 313 | }, 314 | "language_info": { 315 | "codemirror_mode": { 316 | "name": "ipython", 317 | "version": 3 318 | }, 319 | "file_extension": ".py", 320 | "mimetype": "text/x-python", 321 | "name": "python", 322 | "nbconvert_exporter": "python", 323 | "pygments_lexer": "ipython3", 324 | "version": "3.9.13" 325 | } 326 | }, 327 | "nbformat": 4, 328 | "nbformat_minor": 5 329 | } 330 | -------------------------------------------------------------------------------- /Session1/insert and change column order in pandas.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 22, 6 | "id": "894283de", 7 | "metadata": {}, 8 | "outputs": [ 9 | { 10 | "data": { 11 | "text/html": [ 12 | "
\n", 13 | "\n", 26 | "\n", 27 | " \n", 28 | " \n", 29 | " \n", 30 | " \n", 31 | " \n", 32 | " \n", 33 | " \n", 34 | " \n", 35 | " \n", 36 | " \n", 37 | " \n", 38 | " \n", 39 | " \n", 40 | " \n", 41 | " \n", 42 | " \n", 43 | " \n", 44 | " \n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | "
ABCD
015203050
120254060
\n", 53 | "
" 54 | ], 55 | "text/plain": [ 56 | " A B C D\n", 57 | "0 15 20 30 50\n", 58 | "1 20 25 40 60" 59 | ] 60 | }, 61 | "execution_count": 22, 62 | "metadata": {}, 63 | "output_type": "execute_result" 64 | } 65 | ], 66 | "source": [ 67 | "import pandas as pd\n", 68 | "d = {\"A\":[15, 20], \"B\":[20, 25], \"C\":[30 ,40], \"D\":[50, 60]}\n", 69 | "df = pd.DataFrame(d)\n", 70 | "df" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": 14, 76 | "id": "8ceaf614", 77 | "metadata": {}, 78 | "outputs": [ 79 | { 80 | "data": { 81 | "text/html": [ 82 | "
\n", 83 | "\n", 96 | "\n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | "
ABCC2D
01520306050
12025408060
\n", 126 | "
" 127 | ], 128 | "text/plain": [ 129 | " A B C C2 D\n", 130 | "0 15 20 30 60 50\n", 131 | "1 20 25 40 80 60" 132 | ] 133 | }, 134 | "execution_count": 14, 135 | "metadata": {}, 136 | "output_type": "execute_result" 137 | } 138 | ], 139 | "source": [ 140 | "# Using insert\n", 141 | "df.insert(3, \"C2\", df[\"C\"]*2)\n", 142 | "df" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": 8, 148 | "id": "c0bb34d2", 149 | "metadata": {}, 150 | "outputs": [ 151 | { 152 | "data": { 153 | "text/html": [ 154 | "
\n", 155 | "\n", 168 | "\n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | "
ABC2DC
01520605030
12025806040
\n", 198 | "
" 199 | ], 200 | "text/plain": [ 201 | " A B C2 D C\n", 202 | "0 15 20 60 50 30\n", 203 | "1 20 25 80 60 40" 204 | ] 205 | }, 206 | "execution_count": 8, 207 | "metadata": {}, 208 | "output_type": "execute_result" 209 | } 210 | ], 211 | "source": [ 212 | "col_num = 2\n", 213 | "df.iloc[:,[i for i in range(df.shape[1]) if i!=col_num] + [col_num]]" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": 9, 219 | "id": "8b005c59", 220 | "metadata": {}, 221 | "outputs": [ 222 | { 223 | "data": { 224 | "text/plain": [ 225 | "[0, 1, 3, 4, 2]" 226 | ] 227 | }, 228 | "execution_count": 9, 229 | "metadata": {}, 230 | "output_type": "execute_result" 231 | } 232 | ], 233 | "source": [ 234 | "[i for i in range(df.shape[1]) if i!=col_num] + [col_num]" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": null, 240 | "id": "757068d1", 241 | "metadata": {}, 242 | "outputs": [], 243 | "source": [] 244 | } 245 | ], 246 | "metadata": { 247 | "kernelspec": { 248 | "display_name": "Python 3 (ipykernel)", 249 | "language": "python", 250 | "name": "python3" 251 | }, 252 | "language_info": { 253 | "codemirror_mode": { 254 | "name": "ipython", 255 | "version": 3 256 | }, 257 | "file_extension": ".py", 258 | "mimetype": "text/x-python", 259 | "name": "python", 260 | "nbconvert_exporter": "python", 261 | "pygments_lexer": "ipython3", 262 | "version": "3.9.13" 263 | } 264 | }, 265 | "nbformat": 4, 266 | "nbformat_minor": 5 267 | } 268 | -------------------------------------------------------------------------------- /Session1/regex eliminate string and convert string to float.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 4, 6 | "id": "8f94fb2e", 7 | "metadata": {}, 8 | "outputs": [ 9 | { 10 | "data": { 11 | "text/html": [ 12 | "
\n", 13 | "\n", 26 | "\n", 27 | " \n", 28 | " \n", 29 | " \n", 30 | " \n", 31 | " \n", 32 | " \n", 33 | " \n", 34 | " \n", 35 | " \n", 36 | " \n", 37 | " \n", 38 | " \n", 39 | " \n", 40 | " \n", 41 | " \n", 42 | " \n", 43 | " \n", 44 | " \n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | "
customersales
0A1100
1B950.75
2C$$$-400
3D$1250.35*
\n", 57 | "
" 58 | ], 59 | "text/plain": [ 60 | " customer sales\n", 61 | "0 A 1100\n", 62 | "1 B 950.75\n", 63 | "2 C $$$-400\n", 64 | "3 D $1250.35*" 65 | ] 66 | }, 67 | "execution_count": 4, 68 | "metadata": {}, 69 | "output_type": "execute_result" 70 | } 71 | ], 72 | "source": [ 73 | "import pandas as pd \n", 74 | "d = {\"customer\": [\"A\", \"B\", \"C\", \"D\"], \"sales\":[1100, 950.75, \"$$$-400\", \"$1250.35*\"]}\n", 75 | "df = pd.DataFrame(d)\n", 76 | "df\n", 77 | "\n" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 42, 83 | "id": "4ecbe5ce", 84 | "metadata": {}, 85 | "outputs": [ 86 | { 87 | "data": { 88 | "text/plain": [ 89 | "0 \n", 90 | "1 \n", 91 | "2 \n", 92 | "3 \n", 93 | "Name: sales, dtype: object" 94 | ] 95 | }, 96 | "execution_count": 42, 97 | "metadata": {}, 98 | "output_type": "execute_result" 99 | } 100 | ], 101 | "source": [ 102 | "# Step 1: check the data types\n", 103 | "df[\"sales\"].apply(type)" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": 2, 109 | "id": "c01ba1a5", 110 | "metadata": {}, 111 | "outputs": [ 112 | { 113 | "data": { 114 | "text/html": [ 115 | "
\n", 116 | "\n", 129 | "\n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | "
customersales
0A1100.00
1B950.75
2C400.00
3D1250.35
\n", 160 | "
" 161 | ], 162 | "text/plain": [ 163 | " customer sales\n", 164 | "0 A 1100.00\n", 165 | "1 B 950.75\n", 166 | "2 C 400.00\n", 167 | "3 D 1250.35" 168 | ] 169 | }, 170 | "execution_count": 2, 171 | "metadata": {}, 172 | "output_type": "execute_result" 173 | } 174 | ], 175 | "source": [ 176 | "# Step 2: use regex\n", 177 | "df[\"sales\"] = df[\"sales\"].replace(\"[$\\-\\_\\*]\", \"\", regex = True).astype(\"float\")\n", 178 | "df" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": 5, 184 | "id": "ccfecc45", 185 | "metadata": {}, 186 | "outputs": [ 187 | { 188 | "data": { 189 | "text/plain": [ 190 | "0 NaN\n", 191 | "1 NaN\n", 192 | "2 -400.00\n", 193 | "3 1250.35\n", 194 | "Name: sales, dtype: float64" 195 | ] 196 | }, 197 | "execution_count": 5, 198 | "metadata": {}, 199 | "output_type": "execute_result" 200 | } 201 | ], 202 | "source": [ 203 | "df[\"sales\"].str.strip(\"$*\").astype(\"float\")" 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": null, 209 | "id": "62825311", 210 | "metadata": {}, 211 | "outputs": [], 212 | "source": [] 213 | } 214 | ], 215 | "metadata": { 216 | "kernelspec": { 217 | "display_name": "Python 3 (ipykernel)", 218 | "language": "python", 219 | "name": "python3" 220 | }, 221 | "language_info": { 222 | "codemirror_mode": { 223 | "name": "ipython", 224 | "version": 3 225 | }, 226 | "file_extension": ".py", 227 | "mimetype": "text/x-python", 228 | "name": "python", 229 | "nbconvert_exporter": "python", 230 | "pygments_lexer": "ipython3", 231 | "version": "3.9.13" 232 | } 233 | }, 234 | "nbformat": 4, 235 | "nbformat_minor": 5 236 | } 237 | -------------------------------------------------------------------------------- /Session2/If_Reads_Numerical_Values_As_Object_-_pandas.read_csv.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "51f838e1", 7 | "metadata": {}, 8 | 
"outputs": [], 9 | "source": [ 10 | "import pandas as pd" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "id": "5a5dadd3", 16 | "metadata": {}, 17 | "source": [ 18 | "example.csv\n", 19 | "\n", 20 | "0 | 100,000\n", 21 | "\n", 22 | "1 | 200,000\n", 23 | "\n", 24 | "2 | 300,000\n", 25 | "\n", 26 | "3 | 250,000\n", 27 | "\n", 28 | "If pandas reads " 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 2, 34 | "id": "3d1b7fdf", 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "df = pd.read_csv(\"example.csv\", index_col = 0, thousands = \",\")" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 3, 44 | "id": "f65d2009", 45 | "metadata": {}, 46 | "outputs": [ 47 | { 48 | "data": { 49 | "text/html": [ 50 | "
\n", 51 | "\n", 64 | "\n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | "
0
0100000
1200000
2300000
3250000
\n", 90 | "
" 91 | ], 92 | "text/plain": [ 93 | " 0\n", 94 | "0 100000\n", 95 | "1 200000\n", 96 | "2 300000\n", 97 | "3 250000" 98 | ] 99 | }, 100 | "execution_count": 3, 101 | "metadata": {}, 102 | "output_type": "execute_result" 103 | } 104 | ], 105 | "source": [ 106 | "df" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": 4, 112 | "id": "09b31e28", 113 | "metadata": {}, 114 | "outputs": [ 115 | { 116 | "data": { 117 | "text/plain": [ 118 | "0 int64\n", 119 | "dtype: object" 120 | ] 121 | }, 122 | "execution_count": 4, 123 | "metadata": {}, 124 | "output_type": "execute_result" 125 | } 126 | ], 127 | "source": [ 128 | "df.dtypes" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": null, 134 | "id": "e00a7d28", 135 | "metadata": {}, 136 | "outputs": [], 137 | "source": [] 138 | } 139 | ], 140 | "metadata": { 141 | "kernelspec": { 142 | "display_name": "Python 3 (ipykernel)", 143 | "language": "python", 144 | "name": "python3" 145 | }, 146 | "language_info": { 147 | "codemirror_mode": { 148 | "name": "ipython", 149 | "version": 3 150 | }, 151 | "file_extension": ".py", 152 | "mimetype": "text/x-python", 153 | "name": "python", 154 | "nbconvert_exporter": "python", 155 | "pygments_lexer": "ipython3", 156 | "version": "3.9.13" 157 | } 158 | }, 159 | "nbformat": 4, 160 | "nbformat_minor": 5 161 | } 162 | -------------------------------------------------------------------------------- /Session2/Quick_One-Hot_Encoding_with_Pandas.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "0364dbb2", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import pandas as pd" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 2, 16 | "id": "cbc7d9dc", 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "dummies = pd.DataFrame([\"A\", \"B\", \"A\", \"B\", \"C\"])" 21 | ] 22 | }, 
23 | { 24 | "cell_type": "code", 25 | "execution_count": 3, 26 | "id": "d73675e2", 27 | "metadata": {}, 28 | "outputs": [ 29 | { 30 | "data": { 31 | "text/html": [ 32 | "
\n", 33 | "\n", 46 | "\n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | "
0_A0_B0_C
0100
1010
2100
3010
4001
\n", 88 | "
" 89 | ], 90 | "text/plain": [ 91 | " 0_A 0_B 0_C\n", 92 | "0 1 0 0\n", 93 | "1 0 1 0\n", 94 | "2 1 0 0\n", 95 | "3 0 1 0\n", 96 | "4 0 0 1" 97 | ] 98 | }, 99 | "execution_count": 3, 100 | "metadata": {}, 101 | "output_type": "execute_result" 102 | } 103 | ], 104 | "source": [ 105 | "pd.get_dummies(dummies)" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": 4, 111 | "id": "18ede13c", 112 | "metadata": {}, 113 | "outputs": [ 114 | { 115 | "data": { 116 | "text/html": [ 117 | "
\n", 118 | "\n", 131 | "\n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | "
Dummy_ADummy_BDummy_C
0100
1010
2100
3010
4001
\n", 173 | "
" 174 | ], 175 | "text/plain": [ 176 | " Dummy_A Dummy_B Dummy_C\n", 177 | "0 1 0 0\n", 178 | "1 0 1 0\n", 179 | "2 1 0 0\n", 180 | "3 0 1 0\n", 181 | "4 0 0 1" 182 | ] 183 | }, 184 | "execution_count": 4, 185 | "metadata": {}, 186 | "output_type": "execute_result" 187 | } 188 | ], 189 | "source": [ 190 | "pd.get_dummies(dummies, prefix = \"Dummy\")" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": 5, 196 | "id": "bb9739c5", 197 | "metadata": {}, 198 | "outputs": [ 199 | { 200 | "data": { 201 | "text/html": [ 202 | "
\n", 203 | "\n", 216 | "\n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | "
Dummy_BDummy_C
000
110
200
310
401
\n", 252 | "
" 253 | ], 254 | "text/plain": [ 255 | " Dummy_B Dummy_C\n", 256 | "0 0 0\n", 257 | "1 1 0\n", 258 | "2 0 0\n", 259 | "3 1 0\n", 260 | "4 0 1" 261 | ] 262 | }, 263 | "execution_count": 5, 264 | "metadata": {}, 265 | "output_type": "execute_result" 266 | } 267 | ], 268 | "source": [ 269 | "pd.get_dummies(dummies, prefix = \"Dummy\", drop_first = True)" 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": null, 275 | "id": "774a487b", 276 | "metadata": {}, 277 | "outputs": [], 278 | "source": [] 279 | } 280 | ], 281 | "metadata": { 282 | "kernelspec": { 283 | "display_name": "Python 3 (ipykernel)", 284 | "language": "python", 285 | "name": "python3" 286 | }, 287 | "language_info": { 288 | "codemirror_mode": { 289 | "name": "ipython", 290 | "version": 3 291 | }, 292 | "file_extension": ".py", 293 | "mimetype": "text/x-python", 294 | "name": "python", 295 | "nbconvert_exporter": "python", 296 | "pygments_lexer": "ipython3", 297 | "version": "3.9.13" 298 | } 299 | }, 300 | "nbformat": 4, 301 | "nbformat_minor": 5 302 | } 303 | -------------------------------------------------------------------------------- /Session2/Quick_Web_Scraping_with_Pandas.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "a00fe8f5", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import pandas as pd" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 2, 16 | "id": "3a87ca23", 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "verbs = pd.read_html(\"https://www.wse.com.tr/blog/irregular-verbs-duzensiz-fiiller/\")" 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "id": "4e0fae35", 26 | "metadata": {}, 27 | "source": [ 28 | " -Errors-\n", 29 | "ImportError: html5lib not found, please install it\n", 30 | "pip install html5lib\n", 31 | "\n", 32 | "ValueError: No tables found\n", 33 | "\n", 34 | 
"urllib.error.HTTPError: HTTP Error 403: Forbidden" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "id": "ba595fe5", 40 | "metadata": {}, 41 | "source": [ 42 | "!pip install html5lib" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 3, 48 | "id": "ed61c3f9", 49 | "metadata": {}, 50 | "outputs": [ 51 | { 52 | "data": { 53 | "text/html": [ 54 | "
\n", 55 | "\n", 68 | "\n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | "
0123
0V1 – Base FormV2 – Past SimpleV3 – Past ParticipleTürkçe Anlamı
1awakeawokeawokenuyandırmak
2bewas, werebeenolmak
3beatbeatbeatenvurmak
4becomebecamebecomeolmak
...............
93understandunderstoodunderstoodanlamak
94wakewokewokenuyanmak
95wearworeworngiymek
96winwonwonkazanmak
97writewrotewrittenyazmak
\n", 158 | "

98 rows × 4 columns

\n", 159 | "
" 160 | ], 161 | "text/plain": [ 162 | " 0 1 2 3\n", 163 | "0 V1 – Base Form V2 – Past Simple V3 – Past Participle Türkçe Anlamı\n", 164 | "1 awake awoke awoken uyandırmak\n", 165 | "2 be was, were been olmak\n", 166 | "3 beat beat beaten vurmak\n", 167 | "4 become became become olmak\n", 168 | ".. ... ... ... ...\n", 169 | "93 understand understood understood anlamak\n", 170 | "94 wake woke woken uyanmak\n", 171 | "95 wear wore worn giymek\n", 172 | "96 win won won kazanmak\n", 173 | "97 write wrote written yazmak\n", 174 | "\n", 175 | "[98 rows x 4 columns]" 176 | ] 177 | }, 178 | "execution_count": 3, 179 | "metadata": {}, 180 | "output_type": "execute_result" 181 | } 182 | ], 183 | "source": [ 184 | "verbs[0]" 185 | ] 186 | } 187 | ], 188 | "metadata": { 189 | "kernelspec": { 190 | "display_name": "Python 3 (ipykernel)", 191 | "language": "python", 192 | "name": "python3" 193 | }, 194 | "language_info": { 195 | "codemirror_mode": { 196 | "name": "ipython", 197 | "version": 3 198 | }, 199 | "file_extension": ".py", 200 | "mimetype": "text/x-python", 201 | "name": "python", 202 | "nbconvert_exporter": "python", 203 | "pygments_lexer": "ipython3", 204 | "version": "3.9.13" 205 | } 206 | }, 207 | "nbformat": 4, 208 | "nbformat_minor": 5 209 | } 210 | -------------------------------------------------------------------------------- /Session2/create others values in pandas column.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "75e11a3a", 7 | "metadata": {}, 8 | "outputs": [ 9 | { 10 | "data": { 11 | "text/plain": [ 12 | "A 5\n", 13 | "Other 4\n", 14 | "B 2\n", 15 | "Name: genre, dtype: int64" 16 | ] 17 | }, 18 | "execution_count": 1, 19 | "metadata": {}, 20 | "output_type": "execute_result" 21 | } 22 | ], 23 | "source": [ 24 | "import pandas as pd\n", 25 | "\n", 26 | "d = {\"genre\": [\"A\", \"A\", \"A\", \"A\", \"A\", \"B\", \"B\", \"C\", 
\"D\", \"E\", \"F\"]}\n", 27 | "df = pd.DataFrame(d)\n", 28 | "df[\"genre\"].value_counts()\n", 29 | "\n", 30 | "\n", 31 | "liste = ['A','B']\n", 32 | "\n", 33 | "# Step 2: update the df\n", 34 | "df_updated = df.where(df[\"genre\"].isin(liste), other = \"Other\")\n", 35 | "df_updated[\"genre\"].value_counts()" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 2, 41 | "id": "f65c3a91", 42 | "metadata": {}, 43 | "outputs": [ 44 | { 45 | "data": { 46 | "text/html": [ 47 | "
\n", 48 | "\n", 61 | "\n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | "
genre
0A
1A
2A
3A
4A
5B
6B
7Other
8Other
9Other
10Other
\n", 115 | "
" 116 | ], 117 | "text/plain": [ 118 | " genre\n", 119 | "0 A\n", 120 | "1 A\n", 121 | "2 A\n", 122 | "3 A\n", 123 | "4 A\n", 124 | "5 B\n", 125 | "6 B\n", 126 | "7 Other\n", 127 | "8 Other\n", 128 | "9 Other\n", 129 | "10 Other" 130 | ] 131 | }, 132 | "execution_count": 2, 133 | "metadata": {}, 134 | "output_type": "execute_result" 135 | } 136 | ], 137 | "source": [ 138 | "df_updated" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": null, 144 | "id": "b6a35980", 145 | "metadata": {}, 146 | "outputs": [], 147 | "source": [] 148 | } 149 | ], 150 | "metadata": { 151 | "kernelspec": { 152 | "display_name": "Python 3 (ipykernel)", 153 | "language": "python", 154 | "name": "python3" 155 | }, 156 | "language_info": { 157 | "codemirror_mode": { 158 | "name": "ipython", 159 | "version": 3 160 | }, 161 | "file_extension": ".py", 162 | "mimetype": "text/x-python", 163 | "name": "python", 164 | "nbconvert_exporter": "python", 165 | "pygments_lexer": "ipython3", 166 | "version": "3.9.13" 167 | } 168 | }, 169 | "nbformat": 4, 170 | "nbformat_minor": 5 171 | } 172 | -------------------------------------------------------------------------------- /Session2/map function and assign numbers to category (factorize) and boolean.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "id": "5889ac14", 7 | "metadata": {}, 8 | "outputs": [ 9 | { 10 | "data": { 11 | "text/html": [ 12 | "
\n", 13 | "\n", 26 | "\n", 27 | " \n", 28 | " \n", 29 | " \n", 30 | " \n", 31 | " \n", 32 | " \n", 33 | " \n", 34 | " \n", 35 | " \n", 36 | " \n", 37 | " \n", 38 | " \n", 39 | " \n", 40 | " \n", 41 | " \n", 42 | " \n", 43 | " \n", 44 | " \n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | "
gendercoloragegender_mappedcolor_factorizedage_compared_boolean
0malered25M0False
1femalegreen30F1False
2maleblue15M2True
\n", 68 | "
" 69 | ], 70 | "text/plain": [ 71 | " gender color age gender_mapped color_factorized age_compared_boolean\n", 72 | "0 male red 25 M 0 False\n", 73 | "1 female green 30 F 1 False\n", 74 | "2 male blue 15 M 2 True" 75 | ] 76 | }, 77 | "execution_count": 2, 78 | "metadata": {}, 79 | "output_type": "execute_result" 80 | } 81 | ], 82 | "source": [ 83 | "import pandas as pd\n", 84 | "# Do some fast feature eng on the DF\n", 85 | "d = {\"gender\":[\"male\", \"female\", \"male\"], \"color\":[\"red\", \"green\", \"blue\"], \"age\":[25, 30, 15]}\n", 86 | "df = pd.DataFrame(d)\n", 87 | "df\n", 88 | "\n", 89 | "# Solution\n", 90 | "map_dict = {\"male\":\"M\", \"female\":\"F\"}\n", 91 | "df[\"gender_mapped\"] = df[\"gender\"].map(map_dict) # using dictionaries to map values\n", 92 | "\n", 93 | "df[\"color_factorized\"] = df[\"color\"].factorize()[0] # using factorize: returns a tuple of arrays (array([0, 1, 2]), Index(['red', 'green', 'blue'], dtype='object')) that's why we select [0]\n", 94 | "\n", 95 | "df[\"age_compared_boolean\"] = df[\"age\"] < 18 # return a True False boolean value\n", 96 | "\n", 97 | "df" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": null, 103 | "id": "cdeb5327", 104 | "metadata": {}, 105 | "outputs": [], 106 | "source": [] 107 | } 108 | ], 109 | "metadata": { 110 | "kernelspec": { 111 | "display_name": "Python 3 (ipykernel)", 112 | "language": "python", 113 | "name": "python3" 114 | }, 115 | "language_info": { 116 | "codemirror_mode": { 117 | "name": "ipython", 118 | "version": 3 119 | }, 120 | "file_extension": ".py", 121 | "mimetype": "text/x-python", 122 | "name": "python", 123 | "nbconvert_exporter": "python", 124 | "pygments_lexer": "ipython3", 125 | "version": "3.9.13" 126 | } 127 | }, 128 | "nbformat": 4, 129 | "nbformat_minor": 5 130 | } 131 | -------------------------------------------------------------------------------- /Session2/select columns by slicing pandas.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "cf777aaa", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import pandas as pd" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": null, 16 | "id": "a6f12180", 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "\tA\tB\tC\tD\tE\tF\tG\n", 21 | "0\t14\t16\t14\t2\t4\t11\t16\n", 22 | "1\t9\t14\t18\t18\t7\t5\t16\n", 23 | "2\t8\t18\t3\t19\t13\t18\t17\n", 24 | "3\t13\t13\t12\t4\t12\t1\t10\n", 25 | "4\t2\t7\t5\t2\t7\t2\t10\n", 26 | "5\t16\t5\t10\t3\t13\t16\t15\n", 27 | "6\t10\t13\t17\t9\t16\t12\t9\n", 28 | "7\t9\t16\t6\t19\t2\t11\t13\n", 29 | "8\t2\t3\t13\t13\t7\t14\t5\n", 30 | "9\t17\t8\t10\t18\t17\t8\t4\n", 31 | "10\t19\t14\t15\t1\t7\t8\t19\n", 32 | "11\t12\t16\t8\t19\t4\t4\t19\n", 33 | "12\t6\t1\t4\t9\t1\t5\t7\n", 34 | "13\t13\t13\t2\t12\t5\t10\t4\n", 35 | "14\t13\t5\t17\t1\t14\t19\t12\n", 36 | "15\t8\t18\t5\t8\t9\t15\t7\n", 37 | "16\t2\t1\t13\t12\t14\t12\t13\n", 38 | "17\t8\t6\t12\t18\t18\t1\t11" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 3, 44 | "id": "14e51ec1", 45 | "metadata": {}, 46 | "outputs": [ 47 | { 48 | "data": { 49 | "text/html": [ 50 | "
\n", 51 | "\n", 64 | "\n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | "
01267891016
A1498109217192
B161418131638141
C1418317613101513
D218199191318112
E4713162717714
F115181211148812
G161617913541913
\n", 166 | "
" 167 | ], 168 | "text/plain": [ 169 | " 0 1 2 6 7 8 9 10 16\n", 170 | "A 14 9 8 10 9 2 17 19 2\n", 171 | "B 16 14 18 13 16 3 8 14 1\n", 172 | "C 14 18 3 17 6 13 10 15 13\n", 173 | "D 2 18 19 9 19 13 18 1 12\n", 174 | "E 4 7 13 16 2 7 17 7 14\n", 175 | "F 11 5 18 12 11 14 8 8 12\n", 176 | "G 16 16 17 9 13 5 4 19 13" 177 | ] 178 | }, 179 | "execution_count": 3, 180 | "metadata": {}, 181 | "output_type": "execute_result" 182 | } 183 | ], 184 | "source": [ 185 | "df = pd.read_clipboard()\n", 186 | "\n", 187 | "import numpy as np\n", 188 | "df = df.T\n", 189 | "cols_str = list(map(str, list(df.columns))) # so that we can do df[\"0\"] as string for the example\n", 190 | "df.columns = cols_str\n", 191 | "\n", 192 | "# Using pandas concatenation\n", 193 | "# if you are ever confused about axis = 1 or axis = 0, just put axis = \"columns\" or axis = \"rows\"\n", 194 | "pd.concat([df.loc[:, \"0\":\"2\"], df.loc[:, \"6\":\"10\"], df.loc[:, \"16\":\"17\"]], axis = \"columns\") # ------------------> here we are selecting columns converted to strings\n", 195 | "\n", 196 | "# Using lists\n", 197 | "# please ntoe that df.columns is a series with index, so we are using index to filter # -------------------------> here we are selecting the index of columns\n", 198 | "df[list(df.columns[0:3]) + list(df.columns[6:11]) + list(df.columns[16:17])]\n", 199 | "\n", 200 | "# Using numpy\n", 201 | "df.iloc[:, np.r_[0:3, 6:11, 16:17]] # probably the most beautiful solution" 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": 4, 207 | "id": "263bcf86", 208 | "metadata": {}, 209 | "outputs": [ 210 | { 211 | "data": { 212 | "text/plain": [ 213 | "array([ 0, 1, 2, 6, 7, 8, 9, 10, 16])" 214 | ] 215 | }, 216 | "execution_count": 4, 217 | "metadata": {}, 218 | "output_type": "execute_result" 219 | } 220 | ], 221 | "source": [ 222 | "np.r_[0:3, 6:11, 16:17]" 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": null, 228 | "id": "4af6b356", 229 | 
"metadata": {}, 230 | "outputs": [], 231 | "source": [] 232 | } 233 | ], 234 | "metadata": { 235 | "kernelspec": { 236 | "display_name": "Python 3 (ipykernel)", 237 | "language": "python", 238 | "name": "python3" 239 | }, 240 | "language_info": { 241 | "codemirror_mode": { 242 | "name": "ipython", 243 | "version": 3 244 | }, 245 | "file_extension": ".py", 246 | "mimetype": "text/x-python", 247 | "name": "python", 248 | "nbconvert_exporter": "python", 249 | "pygments_lexer": "ipython3", 250 | "version": "3.9.13" 251 | } 252 | }, 253 | "nbformat": 4, 254 | "nbformat_minor": 5 255 | } 256 | -------------------------------------------------------------------------------- /Session3/Deal_with_zip_files.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [] 7 | }, 8 | "kernelspec": { 9 | "name": "python3", 10 | "display_name": "Python 3" 11 | }, 12 | "language_info": { 13 | "name": "python" 14 | } 15 | }, 16 | "cells": [ 17 | { 18 | "cell_type": "code", 19 | "execution_count": 20, 20 | "metadata": { 21 | "colab": { 22 | "base_uri": "https://localhost:8080/", 23 | "height": 848 24 | }, 25 | "id": "KoG7NgjdhDR3", 26 | "outputId": "b3d2fe98-01aa-47c5-a8e8-7bbf388b5eac" 27 | }, 28 | "outputs": [ 29 | { 30 | "output_type": "display_data", 31 | "data": { 32 | "text/plain": [ 33 | " v1 v2 Unnamed: 2 \\\n", 34 | "0 ham Go until jurong point, crazy.. Available only ... NaN \n", 35 | "1 ham Ok lar... Joking wif u oni... NaN \n", 36 | "2 spam Free entry in 2 a wkly comp to win FA Cup fina... NaN \n", 37 | "3 ham U dun say so early hor... U c already then say... NaN \n", 38 | "4 ham Nah I don't think he goes to usf, he lives aro... NaN \n", 39 | "... ... ... ... \n", 40 | "5567 spam This is the 2nd time we have tried 2 contact u... NaN \n", 41 | "5568 ham Will Ì_ b going to esplanade fr home? NaN \n", 42 | "5569 ham Pity, * was in mood for that. 
So...any other s... NaN \n", 43 | "5570 ham The guy did some bitching but I acted like i'd... NaN \n", 44 | "5571 ham Rofl. Its true to its name NaN \n", 45 | "\n", 46 | " Unnamed: 3 Unnamed: 4 \n", 47 | "0 NaN NaN \n", 48 | "1 NaN NaN \n", 49 | "2 NaN NaN \n", 50 | "3 NaN NaN \n", 51 | "4 NaN NaN \n", 52 | "... ... ... \n", 53 | "5567 NaN NaN \n", 54 | "5568 NaN NaN \n", 55 | "5569 NaN NaN \n", 56 | "5570 NaN NaN \n", 57 | "5571 NaN NaN \n", 58 | "\n", 59 | "[5572 rows x 5 columns]" 60 | ], 61 | "text/html": [ 62 | "\n", 63 | "
\n", 64 | "
\n", 65 | "
\n", 66 | "\n", 79 | "\n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | "
v1v2Unnamed: 2Unnamed: 3Unnamed: 4
0hamGo until jurong point, crazy.. Available only ...NaNNaNNaN
1hamOk lar... Joking wif u oni...NaNNaNNaN
2spamFree entry in 2 a wkly comp to win FA Cup fina...NaNNaNNaN
3hamU dun say so early hor... U c already then say...NaNNaNNaN
4hamNah I don't think he goes to usf, he lives aro...NaNNaNNaN
..................
5567spamThis is the 2nd time we have tried 2 contact u...NaNNaNNaN
5568hamWill Ì_ b going to esplanade fr home?NaNNaNNaN
5569hamPity, * was in mood for that. So...any other s...NaNNaNNaN
5570hamThe guy did some bitching but I acted like i'd...NaNNaNNaN
5571hamRofl. Its true to its nameNaNNaNNaN
\n", 181 | "

5572 rows × 5 columns

\n", 182 | "
\n", 183 | " \n", 193 | " \n", 194 | " \n", 231 | "\n", 232 | " \n", 256 | "
\n", 257 | "
\n", 258 | " " 259 | ] 260 | }, 261 | "metadata": {} 262 | }, 263 | { 264 | "output_type": "stream", 265 | "name": "stdout", 266 | "text": [ 267 | "Compressing ratio(perc): 43.10390383083717\n" 268 | ] 269 | }, 270 | { 271 | "output_type": "display_data", 272 | "data": { 273 | "text/plain": [ 274 | " Unnamed: 0 v1 v2 \\\n", 275 | "0 0 ham Go until jurong point, crazy.. Available only ... \n", 276 | "1 1 ham Ok lar... Joking wif u oni... \n", 277 | "2 2 spam Free entry in 2 a wkly comp to win FA Cup fina... \n", 278 | "3 3 ham U dun say so early hor... U c already then say... \n", 279 | "4 4 ham Nah I don't think he goes to usf, he lives aro... \n", 280 | "... ... ... ... \n", 281 | "5567 5567 spam This is the 2nd time we have tried 2 contact u... \n", 282 | "5568 5568 ham Will Ì_ b going to esplanade fr home? \n", 283 | "5569 5569 ham Pity, * was in mood for that. So...any other s... \n", 284 | "5570 5570 ham The guy did some bitching but I acted like i'd... \n", 285 | "5571 5571 ham Rofl. Its true to its name \n", 286 | "\n", 287 | " Unnamed: 2 Unnamed: 3 Unnamed: 4 \n", 288 | "0 NaN NaN NaN \n", 289 | "1 NaN NaN NaN \n", 290 | "2 NaN NaN NaN \n", 291 | "3 NaN NaN NaN \n", 292 | "4 NaN NaN NaN \n", 293 | "... ... ... ... \n", 294 | "5567 NaN NaN NaN \n", 295 | "5568 NaN NaN NaN \n", 296 | "5569 NaN NaN NaN \n", 297 | "5570 NaN NaN NaN \n", 298 | "5571 NaN NaN NaN \n", 299 | "\n", 300 | "[5572 rows x 6 columns]" 301 | ], 302 | "text/html": [ 303 | "\n", 304 | "
\n", 305 | "
\n", 306 | "
\n", 307 | "\n", 320 | "\n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | "
Unnamed: 0v1v2Unnamed: 2Unnamed: 3Unnamed: 4
00hamGo until jurong point, crazy.. Available only ...NaNNaNNaN
11hamOk lar... Joking wif u oni...NaNNaNNaN
22spamFree entry in 2 a wkly comp to win FA Cup fina...NaNNaNNaN
33hamU dun say so early hor... U c already then say...NaNNaNNaN
44hamNah I don't think he goes to usf, he lives aro...NaNNaNNaN
.....................
55675567spamThis is the 2nd time we have tried 2 contact u...NaNNaNNaN
55685568hamWill Ì_ b going to esplanade fr home?NaNNaNNaN
55695569hamPity, * was in mood for that. So...any other s...NaNNaNNaN
55705570hamThe guy did some bitching but I acted like i'd...NaNNaNNaN
55715571hamRofl. Its true to its nameNaNNaNNaN
\n", 434 | "

5572 rows × 6 columns

\n", 435 | "
\n", 436 | " \n", 446 | " \n", 447 | " \n", 484 | "\n", 485 | " \n", 509 | "
\n", 510 | "
\n", 511 | " " 512 | ] 513 | }, 514 | "metadata": {} 515 | } 516 | ], 517 | "source": [ 518 | "import pandas as pd\n", 519 | "\n", 520 | "\"\"\"\n", 521 | "------------ WRITE ZIP FILES -----------\n", 522 | "\"\"\"\n", 523 | "# Read data from internet\n", 524 | "url = \"https://raw.githubusercontent.com/keitazoumana/Fastapi-tutorial/master/data/spam.csv\"\n", 525 | "spam_data = pd.read_csv(url, encoding=\"ISO-8859-1\")\n", 526 | "display(spam_data)\n", 527 | "# Save it as a .csv file\n", 528 | "spam_data.to_csv(\"spam.csv\")\n", 529 | "# Save it as a zip file\n", 530 | "spam_data.to_csv(\"spam.csv.zip\", compression=\"zip\")\n", 531 | "\n", 532 | "# Check the files sizes\n", 533 | "from os import path\n", 534 | "print(\"Compressing ratio(perc): \", 100 * path.getsize('spam.csv.zip') / path.getsize('spam.csv'))\n", 535 | "\n", 536 | "\n", 537 | "\"\"\"\n", 538 | "------------ READ ZIP FILES -----------\n", 539 | "\"\"\"\n", 540 | "# Case 1: read a single zip file \n", 541 | "spam_df_unzip = pd.read_csv('spam.csv.zip', compression='zip')\n", 542 | "\n", 543 | "# Case 2: read a file from a folder\n", 544 | "from zipfile import ZipFile\n", 545 | "\n", 546 | "# Read the file from a zip folder\n", 547 | "spam_df = pd.read_csv(ZipFile(\"spam.csv.zip\").open('spam.csv'))\n", 548 | "display(spam_df)" 549 | ] 550 | } 551 | ] 552 | } -------------------------------------------------------------------------------- /Session3/count words in row.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "332c8e4c", 7 | "metadata": {}, 8 | "outputs": [ 9 | { 10 | "data": { 11 | "text/html": [ 12 | "
\n", 13 | "\n", 26 | "\n", 27 | " \n", 28 | " \n", 29 | " \n", 30 | " \n", 31 | " \n", 32 | " \n", 33 | " \n", 34 | " \n", 35 | " \n", 36 | " \n", 37 | " \n", 38 | " \n", 39 | " \n", 40 | " \n", 41 | " \n", 42 | " \n", 43 | " \n", 44 | " \n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | "
TitleWords
0allows us to input information when a image wi...11
1By leaving it blank it just waits for anykey t...13
2continuing. By placing numbers (except 0), we ...12
3how long you keep the window open (time is in ...12
\n", 57 | "
" 58 | ], 59 | "text/plain": [ 60 | " Title Words\n", 61 | "0 allows us to input information when a image wi... 11\n", 62 | "1 By leaving it blank it just waits for anykey t... 13\n", 63 | "2 continuing. By placing numbers (except 0), we ... 12\n", 64 | "3 how long you keep the window open (time is in ... 12" 65 | ] 66 | }, 67 | "execution_count": 1, 68 | "metadata": {}, 69 | "output_type": "execute_result" 70 | } 71 | ], 72 | "source": [ 73 | "import pandas as pd\n", 74 | "df = pd.DataFrame({\"Title\":[\"allows us to input information when a image window is open\"\n", 75 | ",\"By leaving it blank it just waits for anykey to be pressed before\" \n", 76 | ",\"continuing. By placing numbers (except 0), we can specify a delay for\"\n", 77 | ",\"how long you keep the window open (time is in milliseconds here)\"]})\n", 78 | "\n", 79 | "df[\"Words\"] = df[\"Title\"].str.count(\" \") + 1\n", 80 | "df" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": null, 86 | "id": "d3ecff59", 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [] 90 | } 91 | ], 92 | "metadata": { 93 | "kernelspec": { 94 | "display_name": "Python 3 (ipykernel)", 95 | "language": "python", 96 | "name": "python3" 97 | }, 98 | "language_info": { 99 | "codemirror_mode": { 100 | "name": "ipython", 101 | "version": 3 102 | }, 103 | "file_extension": ".py", 104 | "mimetype": "text/x-python", 105 | "name": "python", 106 | "nbconvert_exporter": "python", 107 | "pygments_lexer": "ipython3", 108 | "version": "3.9.13" 109 | } 110 | }, 111 | "nbformat": 4, 112 | "nbformat_minor": 5 113 | } 114 | -------------------------------------------------------------------------------- /Session3/pd.cut pd.qcut.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 59, 6 | "id": "81772eea", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "\n", 11 | "import random\n", 12 | "\n", 13 | 
"# np.random.randint(low = 3,high=8,size=10)\n", 14 | "df = pd.DataFrame({\"nums\":random.sample(range(1, 100), 15)})" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 60, 20 | "id": "873a4bad", 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "df[\"bin\"] = pd.cut(df[\"nums\"], bins = [0, 15, 50, 80, 99])\n", 25 | "\n", 26 | "df[\"bin_cat\"] = pd.cut(df[\"nums\"], bins = [0, 15, 50, 80, 99],labels = [\"awful\", \"bad\", \"average\", \"good\"])" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 61, 32 | "id": "de7988a7", 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "df[\"divide_equal_4\"] = pd.qcut(df[\"nums\"], q = 4)\n", 37 | "\n", 38 | "df[\"divide_equal_cat\"] = pd.qcut(df[\"nums\"], q = 4,labels = [\"awful\", \"bad\", \"average\", \"good\"])" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 66, 44 | "id": "65fa7db1", 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "df[\"nums_cat\"] = pd.qcut(df[\"nums\"], q = 4,labels = [0,1,2,3])" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 69, 54 | "id": "b29a92a3", 55 | "metadata": {}, 56 | "outputs": [ 57 | { 58 | "data": { 59 | "text/html": [ 60 | "
\n", 61 | "\n", 74 | "\n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " 
\n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | "
numsbinbin_catdivide_equal_4divide_equal_catnums_catnums_cat2
086(80, 99]good(78.0, 90.0]good3Diamond
176(50, 80]average(66.0, 78.0]average2Platinum
231(15, 50]bad(21.0, 66.0]bad1Silver
380(50, 80]average(78.0, 90.0]good3Platinum
490(80, 99]good(78.0, 90.0]good3Diamond
571(50, 80]average(66.0, 78.0]average2Platinum
612(0, 15]awful(1.999, 21.0]awful0Silver
720(15, 50]bad(1.999, 21.0]awful0Silver
822(15, 50]bad(21.0, 66.0]bad1Silver
989(80, 99]good(78.0, 90.0]good3Diamond
1042(15, 50]bad(21.0, 66.0]bad1Gold
1172(50, 80]average(66.0, 78.0]average2Platinum
122(0, 15]awful(1.999, 21.0]awful0Bronze
1366(50, 80]average(21.0, 66.0]bad1Gold
149(0, 15]awful(1.999, 21.0]awful0Bronze
\n", 240 | "
" 241 | ], 242 | "text/plain": [ 243 | " nums bin bin_cat divide_equal_4 divide_equal_cat nums_cat nums_cat2\n", 244 | "0 86 (80, 99] good (78.0, 90.0] good 3 Diamond\n", 245 | "1 76 (50, 80] average (66.0, 78.0] average 2 Platinum\n", 246 | "2 31 (15, 50] bad (21.0, 66.0] bad 1 Silver\n", 247 | "3 80 (50, 80] average (78.0, 90.0] good 3 Platinum\n", 248 | "4 90 (80, 99] good (78.0, 90.0] good 3 Diamond\n", 249 | "5 71 (50, 80] average (66.0, 78.0] average 2 Platinum\n", 250 | "6 12 (0, 15] awful (1.999, 21.0] awful 0 Silver\n", 251 | "7 20 (15, 50] bad (1.999, 21.0] awful 0 Silver\n", 252 | "8 22 (15, 50] bad (21.0, 66.0] bad 1 Silver\n", 253 | "9 89 (80, 99] good (78.0, 90.0] good 3 Diamond\n", 254 | "10 42 (15, 50] bad (21.0, 66.0] bad 1 Gold\n", 255 | "11 72 (50, 80] average (66.0, 78.0] average 2 Platinum\n", 256 | "12 2 (0, 15] awful (1.999, 21.0] awful 0 Bronze\n", 257 | "13 66 (50, 80] average (21.0, 66.0] bad 1 Gold\n", 258 | "14 9 (0, 15] awful (1.999, 21.0] awful 0 Bronze" 259 | ] 260 | }, 261 | "execution_count": 69, 262 | "metadata": {}, 263 | "output_type": "execute_result" 264 | } 265 | ], 266 | "source": [ 267 | "bin_labels_5 = ['Bronze', 'Silver', 'Gold', 'Platinum', 'Diamond']\n", 268 | "df['nums_cat2'] = pd.qcut(df['nums'],\n", 269 | " q=[0, .1, .4, .5, .8, 1],\n", 270 | " labels=bin_labels_5)\n", 271 | "df" 272 | ] 273 | }, 274 | { 275 | "cell_type": "markdown", 276 | "id": "fb422c74", 277 | "metadata": {}, 278 | "source": [ 279 | "https://www.skytowner.com/explore/pandas_qcut_method" 280 | ] 281 | }, 282 | { 283 | "cell_type": "code", 284 | "execution_count": null, 285 | "id": "b5223131", 286 | "metadata": {}, 287 | "outputs": [], 288 | "source": [] 289 | }, 290 | { 291 | "cell_type": "code", 292 | "execution_count": 155, 293 | "id": "a85167b2", 294 | "metadata": {}, 295 | "outputs": [], 296 | "source": [] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "execution_count": null, 301 | "id": "d38e053e", 302 | "metadata": {}, 303 | "outputs": 
[], 304 | "source": [] 305 | } 306 | ], 307 | "metadata": { 308 | "kernelspec": { 309 | "display_name": "Python 3 (ipykernel)", 310 | "language": "python", 311 | "name": "python3" 312 | }, 313 | "language_info": { 314 | "codemirror_mode": { 315 | "name": "ipython", 316 | "version": 3 317 | }, 318 | "file_extension": ".py", 319 | "mimetype": "text/x-python", 320 | "name": "python", 321 | "nbconvert_exporter": "python", 322 | "pygments_lexer": "ipython3", 323 | "version": "3.9.13" 324 | } 325 | }, 326 | "nbformat": 4, 327 | "nbformat_minor": 5 328 | } 329 | -------------------------------------------------------------------------------- /Session3/query_Dataframe.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 13, 6 | "id": "05543ee8-86a2-49c8-a5fe-db0ed06a70f5", 7 | "metadata": {}, 8 | "outputs": [ 9 | { 10 | "data": { 11 | "text/html": [ 12 | "
\n", 13 | "\n", 26 | "\n", 27 | " \n", 28 | " \n", 29 | " \n", 30 | " \n", 31 | " \n", 32 | " \n", 33 | " \n", 34 | " \n", 35 | " \n", 36 | " \n", 37 | " \n", 38 | " \n", 39 | " \n", 40 | " \n", 41 | " \n", 42 | " \n", 43 | " \n", 44 | " \n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | "
colum_without_spacecolumn_with_space
012
124
236
\n", 52 | "
" 53 | ], 54 | "text/plain": [ 55 | " colum_without_space column_with_space\n", 56 | "0 1 2\n", 57 | "1 2 4\n", 58 | "2 3 6" 59 | ] 60 | }, 61 | "execution_count": 13, 62 | "metadata": {}, 63 | "output_type": "execute_result" 64 | } 65 | ], 66 | "source": [ 67 | "import pandas as pd\n", 68 | "import numpy as np\n", 69 | "\n", 70 | "d={\"colum_without_space\":np.array([1,2,3,4,5,6]),\"column_with_space\":np.array([1,2,3,4,5,6])*2}\n", 71 | "df=pd.DataFrame(d)\n", 72 | "df\n" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 14, 78 | "id": "aeeccfd7-e5e7-4a92-8a18-4854cc32cbfa", 79 | "metadata": {}, 80 | "outputs": [ 81 | { 82 | "data": { 83 | "text/html": [ 84 | "
\n", 85 | "\n", 98 | "\n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | "
colum_without_spacecolumn_with_space
012
124
236
348
4510
5612
\n", 139 | "
" 140 | ], 141 | "text/plain": [ 142 | " colum_without_space column_with_space\n", 143 | "0 1 2\n", 144 | "1 2 4\n", 145 | "2 3 6\n", 146 | "3 4 8\n", 147 | "4 5 10\n", 148 | "5 6 12" 149 | ] 150 | }, 151 | "execution_count": 14, 152 | "metadata": {}, 153 | "output_type": "execute_result" 154 | } 155 | ], 156 | "source": [ 157 | "df.query(\"colum_without_space<4\")" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": 15, 163 | "id": "90e00738-bd00-4deb-a98f-0f41f0666f9c", 164 | "metadata": {}, 165 | "outputs": [ 166 | { 167 | "data": { 168 | "text/html": [ 169 | "
\n", 170 | "\n", 183 | "\n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | "
colum_without_spacecolumn_with_space
012
124
236
\n", 209 | "
" 210 | ], 211 | "text/plain": [ 212 | " colum_without_space column_with_space\n", 213 | "0 1 2\n", 214 | "1 2 4\n", 215 | "2 3 6" 216 | ] 217 | }, 218 | "execution_count": 15, 219 | "metadata": {}, 220 | "output_type": "execute_result" 221 | } 222 | ], 223 | "source": [ 224 | "df.query(\"column_with_space<8\")" 225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "execution_count": null, 230 | "id": "57244ce6-8110-4566-8045-e4e62be938c2", 231 | "metadata": {}, 232 | "outputs": [], 233 | "source": [] 234 | } 235 | ], 236 | "metadata": { 237 | "kernelspec": { 238 | "display_name": "Python 3 (ipykernel)", 239 | "language": "python", 240 | "name": "python3" 241 | }, 242 | "language_info": { 243 | "codemirror_mode": { 244 | "name": "ipython", 245 | "version": 3 246 | }, 247 | "file_extension": ".py", 248 | "mimetype": "text/x-python", 249 | "name": "python", 250 | "nbconvert_exporter": "python", 251 | "pygments_lexer": "ipython3", 252 | "version": "3.9.12" 253 | } 254 | }, 255 | "nbformat": 4, 256 | "nbformat_minor": 5 257 | } 258 | -------------------------------------------------------------------------------- /Session3/transform sum pandas column.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "id": "51074644", 7 | "metadata": {}, 8 | "outputs": [ 9 | { 10 | "data": { 11 | "text/html": [ 12 | "
\n", 13 | "\n", 26 | "\n", 27 | " \n", 28 | " \n", 29 | " \n", 30 | " \n", 31 | " \n", 32 | " \n", 33 | " \n", 34 | " \n", 35 | " \n", 36 | " \n", 37 | " \n", 38 | " \n", 39 | " \n", 40 | " \n", 41 | " \n", 42 | " \n", 43 | " \n", 44 | " \n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | "
orderiditem
0110.0
11120.0
21130.0
32200.0
42300.0
53550.0
6412.3
75200.0
\n", 77 | "
" 78 | ], 79 | "text/plain": [ 80 | " orderid item\n", 81 | "0 1 10.0\n", 82 | "1 1 120.0\n", 83 | "2 1 130.0\n", 84 | "3 2 200.0\n", 85 | "4 2 300.0\n", 86 | "5 3 550.0\n", 87 | "6 4 12.3\n", 88 | "7 5 200.0" 89 | ] 90 | }, 91 | "execution_count": 2, 92 | "metadata": {}, 93 | "output_type": "execute_result" 94 | } 95 | ], 96 | "source": [ 97 | "import pandas as pd\n", 98 | "d = {\"orderid\":[1, 1, 1, 2, 2, 3, 4, 5], \"item\":[10, 120, 130, 200, 300, 550, 12.3, 200]}\n", 99 | "df = pd.DataFrame(d)\n", 100 | "df\n", 101 | "\n" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": 3, 107 | "id": "b4acf6ed", 108 | "metadata": {}, 109 | "outputs": [ 110 | { 111 | "name": "stdout", 112 | "output_type": "stream", 113 | "text": [ 114 | "This is the output we want to aggregate to the original df\n" 115 | ] 116 | }, 117 | { 118 | "data": { 119 | "text/html": [ 120 | "
\n", 121 | "\n", 134 | "\n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | "
item
orderid
1260.0
2500.0
3550.0
412.3
5200.0
\n", 168 | "
" 169 | ], 170 | "text/plain": [ 171 | " item\n", 172 | "orderid \n", 173 | "1 260.0\n", 174 | "2 500.0\n", 175 | "3 550.0\n", 176 | "4 12.3\n", 177 | "5 200.0" 178 | ] 179 | }, 180 | "execution_count": 3, 181 | "metadata": {}, 182 | "output_type": "execute_result" 183 | } 184 | ], 185 | "source": [ 186 | "print(\"This is the output we want to aggregate to the original df\")\n", 187 | "df.groupby(\"orderid\")[\"item\"].sum().to_frame()\n" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": 4, 193 | "id": "f2904919", 194 | "metadata": {}, 195 | "outputs": [ 196 | { 197 | "data": { 198 | "text/html": [ 199 | "
\n", 200 | "\n", 213 | "\n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | "
orderiditemtotal_items_sold
0110.0260.0
11120.0260.0
21130.0260.0
32200.0500.0
42300.0500.0
53550.0550.0
6412.312.3
75200.0200.0
\n", 273 | "
" 274 | ], 275 | "text/plain": [ 276 | " orderid item total_items_sold\n", 277 | "0 1 10.0 260.0\n", 278 | "1 1 120.0 260.0\n", 279 | "2 1 130.0 260.0\n", 280 | "3 2 200.0 500.0\n", 281 | "4 2 300.0 500.0\n", 282 | "5 3 550.0 550.0\n", 283 | "6 4 12.3 12.3\n", 284 | "7 5 200.0 200.0" 285 | ] 286 | }, 287 | "execution_count": 4, 288 | "metadata": {}, 289 | "output_type": "execute_result" 290 | } 291 | ], 292 | "source": [ 293 | "df[\"total_items_sold\"] = df.groupby(\"orderid\")[\"item\"].transform(sum)\n", 294 | "df" 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": null, 300 | "id": "51f5e41f", 301 | "metadata": {}, 302 | "outputs": [], 303 | "source": [] 304 | } 305 | ], 306 | "metadata": { 307 | "kernelspec": { 308 | "display_name": "Python 3 (ipykernel)", 309 | "language": "python", 310 | "name": "python3" 311 | }, 312 | "language_info": { 313 | "codemirror_mode": { 314 | "name": "ipython", 315 | "version": 3 316 | }, 317 | "file_extension": ".py", 318 | "mimetype": "text/x-python", 319 | "name": "python", 320 | "nbconvert_exporter": "python", 321 | "pygments_lexer": "ipython3", 322 | "version": "3.9.13" 323 | } 324 | }, 325 | "nbformat": 4, 326 | "nbformat_minor": 5 327 | } 328 | -------------------------------------------------------------------------------- /Session4/load autotime for every cell runtime and select_dtypes.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "ba2de180", 7 | "metadata": {}, 8 | "outputs": [ 9 | { 10 | "data": { 11 | "text/html": [ 12 | "
\n", 13 | "\n", 26 | "\n", 27 | " \n", 28 | " \n", 29 | " \n", 30 | " \n", 31 | " \n", 32 | " \n", 33 | " \n", 34 | " \n", 35 | " \n", 36 | " \n", 37 | " \n", 38 | " \n", 39 | " \n", 40 | " \n", 41 | " \n", 42 | " \n", 43 | " \n", 44 | " \n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 
| " \n", 185 | " \n", 186 | " \n", 187 | "
departmentpromotedreviewprojectssalarytenuresatisfactionbonusavg_hrs_monthleft
0operations00.5775693low5.00.6267590180.866070no
1operations00.7519003medium6.00.4436790182.708149no
2support00.7225483medium6.00.4468230184.416084no
3logistics00.6751584high8.00.4401390188.707545no
4sales00.6762033high5.00.5776071179.821083no
.................................
9535operations00.6109884medium8.00.5436410188.155738yes
9536logistics00.7468873medium8.00.5490480188.176164yes
9537operations00.5579803low7.00.7054250186.531008yes
9538IT00.5844464medium8.00.6072871187.641370yes
9539finance00.6263733low7.00.7064551185.920934yes
\n", 188 | "

9540 rows × 10 columns

\n", 189 | "
" 190 | ], 191 | "text/plain": [ 192 | " department promoted review projects salary tenure satisfaction \\\n", 193 | "0 operations 0 0.577569 3 low 5.0 0.626759 \n", 194 | "1 operations 0 0.751900 3 medium 6.0 0.443679 \n", 195 | "2 support 0 0.722548 3 medium 6.0 0.446823 \n", 196 | "3 logistics 0 0.675158 4 high 8.0 0.440139 \n", 197 | "4 sales 0 0.676203 3 high 5.0 0.577607 \n", 198 | "... ... ... ... ... ... ... ... \n", 199 | "9535 operations 0 0.610988 4 medium 8.0 0.543641 \n", 200 | "9536 logistics 0 0.746887 3 medium 8.0 0.549048 \n", 201 | "9537 operations 0 0.557980 3 low 7.0 0.705425 \n", 202 | "9538 IT 0 0.584446 4 medium 8.0 0.607287 \n", 203 | "9539 finance 0 0.626373 3 low 7.0 0.706455 \n", 204 | "\n", 205 | " bonus avg_hrs_month left \n", 206 | "0 0 180.866070 no \n", 207 | "1 0 182.708149 no \n", 208 | "2 0 184.416084 no \n", 209 | "3 0 188.707545 no \n", 210 | "4 1 179.821083 no \n", 211 | "... ... ... ... \n", 212 | "9535 0 188.155738 yes \n", 213 | "9536 0 188.176164 yes \n", 214 | "9537 0 186.531008 yes \n", 215 | "9538 1 187.641370 yes \n", 216 | "9539 1 185.920934 yes \n", 217 | "\n", 218 | "[9540 rows x 10 columns]" 219 | ] 220 | }, 221 | "execution_count": 1, 222 | "metadata": {}, 223 | "output_type": "execute_result" 224 | }, 225 | { 226 | "name": "stdout", 227 | "output_type": "stream", 228 | "text": [ 229 | "time: 29.1 ms (started: 2023-03-26 23:09:11 +03:00)\n" 230 | ] 231 | } 232 | ], 233 | "source": [ 234 | "import pandas as pd\n", 235 | "import numpy as np\n", 236 | "\n", 237 | "import autotime\n", 238 | "\n", 239 | "%load_ext autotime\n", 240 | "\n", 241 | "df=pd.read_csv(\"employee_churn_data.csv\")\n", 242 | "df" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": 5, 248 | "id": "e119ba3b", 249 | "metadata": {}, 250 | "outputs": [ 251 | { 252 | "name": "stdout", 253 | "output_type": "stream", 254 | "text": [ 255 | "\n", 256 | "RangeIndex: 9540 entries, 0 to 9539\n", 257 | "Data columns (total 10 
columns):\n", 258 | " # Column Non-Null Count Dtype \n", 259 | "--- ------ -------------- ----- \n", 260 | " 0 department 9540 non-null object \n", 261 | " 1 promoted 9540 non-null int64 \n", 262 | " 2 review 9540 non-null float64\n", 263 | " 3 projects 9540 non-null int64 \n", 264 | " 4 salary 9540 non-null object \n", 265 | " 5 tenure 9540 non-null float64\n", 266 | " 6 satisfaction 9540 non-null float64\n", 267 | " 7 bonus 9540 non-null int64 \n", 268 | " 8 avg_hrs_month 9540 non-null float64\n", 269 | " 9 left 9540 non-null object \n", 270 | "dtypes: float64(4), int64(3), object(3)\n", 271 | "memory usage: 745.4+ KB\n", 272 | "time: 16.9 ms (started: 2023-03-26 23:09:43 +03:00)\n" 273 | ] 274 | } 275 | ], 276 | "source": [ 277 | "df.info()" 278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "execution_count": 2, 283 | "id": "ca9b468f", 284 | "metadata": {}, 285 | "outputs": [ 286 | { 287 | "data": { 288 | "text/html": [ 289 | "
\n", 290 | "\n", 303 | "\n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | "
promotedreviewprojectstenuresatisfactionbonusavg_hrs_month
000.57756935.00.6267590180.866070
100.75190036.00.4436790182.708149
200.72254836.00.4468230184.416084
300.67515848.00.4401390188.707545
400.67620335.00.5776071179.821083
........................
953500.61098848.00.5436410188.155738
953600.74688738.00.5490480188.176164
953700.55798037.00.7054250186.531008
953800.58444648.00.6072871187.641370
953900.62637337.00.7064551185.920934
\n", 429 | "

9540 rows × 7 columns

\n", 430 | "
" 431 | ], 432 | "text/plain": [ 433 | " promoted review projects tenure satisfaction bonus avg_hrs_month\n", 434 | "0 0 0.577569 3 5.0 0.626759 0 180.866070\n", 435 | "1 0 0.751900 3 6.0 0.443679 0 182.708149\n", 436 | "2 0 0.722548 3 6.0 0.446823 0 184.416084\n", 437 | "3 0 0.675158 4 8.0 0.440139 0 188.707545\n", 438 | "4 0 0.676203 3 5.0 0.577607 1 179.821083\n", 439 | "... ... ... ... ... ... ... ...\n", 440 | "9535 0 0.610988 4 8.0 0.543641 0 188.155738\n", 441 | "9536 0 0.746887 3 8.0 0.549048 0 188.176164\n", 442 | "9537 0 0.557980 3 7.0 0.705425 0 186.531008\n", 443 | "9538 0 0.584446 4 8.0 0.607287 1 187.641370\n", 444 | "9539 0 0.626373 3 7.0 0.706455 1 185.920934\n", 445 | "\n", 446 | "[9540 rows x 7 columns]" 447 | ] 448 | }, 449 | "execution_count": 2, 450 | "metadata": {}, 451 | "output_type": "execute_result" 452 | }, 453 | { 454 | "name": "stdout", 455 | "output_type": "stream", 456 | "text": [ 457 | "time: 14.4 ms (started: 2023-03-26 23:09:24 +03:00)\n" 458 | ] 459 | } 460 | ], 461 | "source": [ 462 | "df.select_dtypes(include=['int64',\"float64\"])" 463 | ] 464 | }, 465 | { 466 | "cell_type": "code", 467 | "execution_count": 4, 468 | "id": "752a65a7", 469 | "metadata": {}, 470 | "outputs": [ 471 | { 472 | "data": { 473 | "text/html": [ 474 | "
\n", 475 | "\n", 488 | "\n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | "
promotedreviewprojectstenuresatisfactionbonusavg_hrs_month
000.57756935.00.6267590180.866070
100.75190036.00.4436790182.708149
200.72254836.00.4468230184.416084
300.67515848.00.4401390188.707545
400.67620335.00.5776071179.821083
........................
953500.61098848.00.5436410188.155738
953600.74688738.00.5490480188.176164
953700.55798037.00.7054250186.531008
953800.58444648.00.6072871187.641370
953900.62637337.00.7064551185.920934
\n", 614 | "

9540 rows × 7 columns

\n", 615 | "
" 616 | ], 617 | "text/plain": [ 618 | " promoted review projects tenure satisfaction bonus avg_hrs_month\n", 619 | "0 0 0.577569 3 5.0 0.626759 0 180.866070\n", 620 | "1 0 0.751900 3 6.0 0.443679 0 182.708149\n", 621 | "2 0 0.722548 3 6.0 0.446823 0 184.416084\n", 622 | "3 0 0.675158 4 8.0 0.440139 0 188.707545\n", 623 | "4 0 0.676203 3 5.0 0.577607 1 179.821083\n", 624 | "... ... ... ... ... ... ... ...\n", 625 | "9535 0 0.610988 4 8.0 0.543641 0 188.155738\n", 626 | "9536 0 0.746887 3 8.0 0.549048 0 188.176164\n", 627 | "9537 0 0.557980 3 7.0 0.705425 0 186.531008\n", 628 | "9538 0 0.584446 4 8.0 0.607287 1 187.641370\n", 629 | "9539 0 0.626373 3 7.0 0.706455 1 185.920934\n", 630 | "\n", 631 | "[9540 rows x 7 columns]" 632 | ] 633 | }, 634 | "execution_count": 4, 635 | "metadata": {}, 636 | "output_type": "execute_result" 637 | }, 638 | { 639 | "name": "stdout", 640 | "output_type": "stream", 641 | "text": [ 642 | "time: 15.1 ms (started: 2023-03-26 23:09:33 +03:00)\n" 643 | ] 644 | } 645 | ], 646 | "source": [ 647 | "df.select_dtypes(exclude ='object')" 648 | ] 649 | }, 650 | { 651 | "cell_type": "code", 652 | "execution_count": null, 653 | "id": "8ca95338", 654 | "metadata": {}, 655 | "outputs": [], 656 | "source": [] 657 | } 658 | ], 659 | "metadata": { 660 | "kernelspec": { 661 | "display_name": "Python 3 (ipykernel)", 662 | "language": "python", 663 | "name": "python3" 664 | }, 665 | "language_info": { 666 | "codemirror_mode": { 667 | "name": "ipython", 668 | "version": 3 669 | }, 670 | "file_extension": ".py", 671 | "mimetype": "text/x-python", 672 | "name": "python", 673 | "nbconvert_exporter": "python", 674 | "pygments_lexer": "ipython3", 675 | "version": "3.9.13" 676 | } 677 | }, 678 | "nbformat": 4, 679 | "nbformat_minor": 5 680 | } 681 | -------------------------------------------------------------------------------- /Session4/swifter apply fastest run.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | 
"cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "73339d7b", 6 | "metadata": {}, 7 | "source": [ 8 | "According to the documentation, Swifter can apply a function a hundred times faster than the equivalent Pandas function. This, however, only applies if we are using a vectorized form of the function." 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "ae1c4d99", 14 | "metadata": {}, 15 | "source": [ 16 | "https://github.com/jmcarpenter2/swifter/blob/master/docs/documentation.md\n", 17 | "\n", 18 | "https://github.com/jmcarpenter2/swifter" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 213, 24 | "id": "d59d4435", 25 | "metadata": {}, 26 | "outputs": [ 27 | { 28 | "name": "stdout", 29 | "output_type": "stream", 30 | "text": [ 31 | "The autotime extension is already loaded. To reload it, use:\n", 32 | " %reload_ext autotime\n" 33 | ] 34 | }, 35 | { 36 | "data": { 37 | "text/html": [ 38 | "
\n", 39 | "\n", 52 | "\n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 
| " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | "
departmentpromotedreviewprojectssalarytenuresatisfactionbonusavg_hrs_monthleft
0operations00.5775693low5.00.6267590180.866070no
1operations00.7519003medium6.00.4436790182.708149no
2support00.7225483medium6.00.4468230184.416084no
3logistics00.6751584high8.00.4401390188.707545no
4sales00.6762033high5.00.5776071179.821083no
.................................
9535operations00.6109884medium8.00.5436410188.155738yes
9536logistics00.7468873medium8.00.5490480188.176164yes
9537operations00.5579803low7.00.7054250186.531008yes
9538IT00.5844464medium8.00.6072871187.641370yes
9539finance00.6263733low7.00.7064551185.920934yes
\n", 214 | "

9540 rows × 10 columns

\n", 215 | "
" 216 | ], 217 | "text/plain": [ 218 | " department promoted review projects salary tenure satisfaction \\\n", 219 | "0 operations 0 0.577569 3 low 5.0 0.626759 \n", 220 | "1 operations 0 0.751900 3 medium 6.0 0.443679 \n", 221 | "2 support 0 0.722548 3 medium 6.0 0.446823 \n", 222 | "3 logistics 0 0.675158 4 high 8.0 0.440139 \n", 223 | "4 sales 0 0.676203 3 high 5.0 0.577607 \n", 224 | "... ... ... ... ... ... ... ... \n", 225 | "9535 operations 0 0.610988 4 medium 8.0 0.543641 \n", 226 | "9536 logistics 0 0.746887 3 medium 8.0 0.549048 \n", 227 | "9537 operations 0 0.557980 3 low 7.0 0.705425 \n", 228 | "9538 IT 0 0.584446 4 medium 8.0 0.607287 \n", 229 | "9539 finance 0 0.626373 3 low 7.0 0.706455 \n", 230 | "\n", 231 | " bonus avg_hrs_month left \n", 232 | "0 0 180.866070 no \n", 233 | "1 0 182.708149 no \n", 234 | "2 0 184.416084 no \n", 235 | "3 0 188.707545 no \n", 236 | "4 1 179.821083 no \n", 237 | "... ... ... ... \n", 238 | "9535 0 188.155738 yes \n", 239 | "9536 0 188.176164 yes \n", 240 | "9537 0 186.531008 yes \n", 241 | "9538 1 187.641370 yes \n", 242 | "9539 1 185.920934 yes \n", 243 | "\n", 244 | "[9540 rows x 10 columns]" 245 | ] 246 | }, 247 | "execution_count": 213, 248 | "metadata": {}, 249 | "output_type": "execute_result" 250 | }, 251 | { 252 | "name": "stdout", 253 | "output_type": "stream", 254 | "text": [ 255 | "time: 31 ms (started: 2023-03-27 04:14:03 +03:00)\n" 256 | ] 257 | } 258 | ], 259 | "source": [ 260 | "import warnings\n", 261 | "warnings.filterwarnings(\"ignore\")\n", 262 | "import pandas as pd\n", 263 | "import numpy as np\n", 264 | "import swifter\n", 265 | "import autotime\n", 266 | "%load_ext autotime\n", 267 | "df=pd.read_csv(\"employee_churn_data.csv\")\n", 268 | "df" 269 | ] 270 | }, 271 | { 272 | "cell_type": "code", 273 | "execution_count": 204, 274 | "id": "848016dc", 275 | "metadata": {}, 276 | "outputs": [ 277 | { 278 | "name": "stdout", 279 | "output_type": "stream", 280 | "text": [ 281 | "time: 1.17 ms (started: 
2023-03-27 03:32:40 +03:00)\n" 282 | ] 283 | } 284 | ], 285 | "source": [ 286 | "df=df.select_dtypes(include=['int64',\"float64\"])" 287 | ] 288 | }, 289 | { 290 | "cell_type": "code", 291 | "execution_count": 205, 292 | "id": "b580ffb6", 293 | "metadata": {}, 294 | "outputs": [ 295 | { 296 | "name": "stdout", 297 | "output_type": "stream", 298 | "text": [ 299 | "time: 64.9 ms (started: 2023-03-27 03:32:40 +03:00)\n" 300 | ] 301 | } 302 | ], 303 | "source": [ 304 | "for i in range(6):\n", 305 | " df=df.append(df)" 306 | ] 307 | }, 308 | { 309 | "cell_type": "code", 310 | "execution_count": 212, 311 | "id": "bbb5bde1", 312 | "metadata": {}, 313 | "outputs": [ 314 | { 315 | "data": { 316 | "text/plain": [ 317 | "610560" 318 | ] 319 | }, 320 | "execution_count": 212, 321 | "metadata": {}, 322 | "output_type": "execute_result" 323 | }, 324 | { 325 | "name": "stdout", 326 | "output_type": "stream", 327 | "text": [ 328 | "time: 3.71 ms (started: 2023-03-27 03:33:12 +03:00)\n" 329 | ] 330 | } 331 | ], 332 | "source": [ 333 | "df.shape[0]" 334 | ] 335 | }, 336 | { 337 | "cell_type": "code", 338 | "execution_count": 206, 339 | "id": "6a0fd71e", 340 | "metadata": {}, 341 | "outputs": [ 342 | { 343 | "name": "stdout", 344 | "output_type": "stream", 345 | "text": [ 346 | "time: 115 ms (started: 2023-03-27 03:32:40 +03:00)\n" 347 | ] 348 | } 349 | ], 350 | "source": [ 351 | "a = df.apply(lambda x: x/2 -1)" 352 | ] 353 | }, 354 | { 355 | "cell_type": "code", 356 | "execution_count": 207, 357 | "id": "8868d15e", 358 | "metadata": {}, 359 | "outputs": [ 360 | { 361 | "name": "stdout", 362 | "output_type": "stream", 363 | "text": [ 364 | "time: 165 ms (started: 2023-03-27 03:32:40 +03:00)\n" 365 | ] 366 | } 367 | ], 368 | "source": [ 369 | "b = df.swifter.apply(lambda x: x/2 -1)" 370 | ] 371 | }, 372 | { 373 | "cell_type": "code", 374 | "execution_count": 208, 375 | "id": "72e2953a", 376 | "metadata": {}, 377 | "outputs": [ 378 | { 379 | "name": "stdout", 380 | "output_type": 
"stream", 381 | "text": [ 382 | "time: 6.84 s (started: 2023-03-27 03:32:40 +03:00)\n" 383 | ] 384 | } 385 | ], 386 | "source": [ 387 | "def scoring_comment(x):\n", 388 | " if x['bonus'] == 0:\n", 389 | " return x['review'] *2\n", 390 | " else:\n", 391 | " return x['review']\n", 392 | "#Trying applying the function using Pandas apply\n", 393 | "\n", 394 | "a = df.apply(scoring_comment, axis =1)" 395 | ] 396 | }, 397 | { 398 | "cell_type": "code", 399 | "execution_count": 209, 400 | "id": "e9e03a26", 401 | "metadata": {}, 402 | "outputs": [ 403 | { 404 | "data": { 405 | "application/vnd.jupyter.widget-view+json": { 406 | "model_id": "e9f976687ee641bd8a43e354108a0f3d", 407 | "version_major": 2, 408 | "version_minor": 0 409 | }, 410 | "text/plain": [ 411 | "Pandas Apply: 0%| | 0/610560 [00:00\n", 13 | "\n", 26 | "\n", 27 | " \n", 28 | " \n", 29 | " \n", 30 | " \n", 31 | " \n", 32 | " \n", 33 | " \n", 34 | " \n", 35 | " \n", 36 | " \n", 37 | " \n", 38 | " \n", 39 | " \n", 40 | " \n", 41 | " \n", 42 | " \n", 43 | " \n", 44 | " \n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | "
set_of_numbers
0NaN
16.4
270.2
3NaN
4NaN
58
69
710
8NaN
9-
10*
\n", 80 | "" 81 | ], 82 | "text/plain": [ 83 | " set_of_numbers\n", 84 | "0 NaN\n", 85 | "1 6.4\n", 86 | "2 70.2\n", 87 | "3 NaN\n", 88 | "4 NaN\n", 89 | "5 8\n", 90 | "6 9\n", 91 | "7 10\n", 92 | "8 NaN\n", 93 | "9 -\n", 94 | "10 *" 95 | ] 96 | }, 97 | "execution_count": 13, 98 | "metadata": {}, 99 | "output_type": "execute_result" 100 | } 101 | ], 102 | "source": [ 103 | "import pandas as pd \n", 104 | "import numpy as np\n", 105 | "import pandas as pd\n", 106 | "import numpy as np\n", 107 | "\n", 108 | "data = {'set_of_numbers': [np.nan,6.4,70.2,np.nan,np.nan,8,9,10,np.nan,\"-\",\"*\"]}\n", 109 | "df = pd.DataFrame(data)\n", 110 | "\n", 111 | "df" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": 14, 117 | "id": "195b4bb6", 118 | "metadata": {}, 119 | "outputs": [ 120 | { 121 | "data": { 122 | "text/plain": [ 123 | "0 NaN\n", 124 | "1 6.4\n", 125 | "2 70.2\n", 126 | "3 NaN\n", 127 | "4 NaN\n", 128 | "5 8\n", 129 | "6 9\n", 130 | "7 10\n", 131 | "8 NaN\n", 132 | "9 -\n", 133 | "10 *\n", 134 | "Name: set_of_numbers, dtype: object" 135 | ] 136 | }, 137 | "execution_count": 14, 138 | "metadata": {}, 139 | "output_type": "execute_result" 140 | } 141 | ], 142 | "source": [ 143 | "pd.to_numeric(df[\"set_of_numbers\"],errors ='ignore')" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": 15, 149 | "id": "700569a2", 150 | "metadata": {}, 151 | "outputs": [ 152 | { 153 | "data": { 154 | "text/plain": [ 155 | "0 NaN\n", 156 | "1 6.4\n", 157 | "2 70.2\n", 158 | "3 NaN\n", 159 | "4 NaN\n", 160 | "5 8.0\n", 161 | "6 9.0\n", 162 | "7 10.0\n", 163 | "8 NaN\n", 164 | "9 NaN\n", 165 | "10 NaN\n", 166 | "Name: set_of_numbers, dtype: float64" 167 | ] 168 | }, 169 | "execution_count": 15, 170 | "metadata": {}, 171 | "output_type": "execute_result" 172 | } 173 | ], 174 | "source": [ 175 | "pd.to_numeric(df[\"set_of_numbers\"], errors ='coerce')" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": 16, 181 | 
"id": "62e91dca", 182 | "metadata": {}, 183 | "outputs": [ 184 | { 185 | "data": { 186 | "text/plain": [ 187 | "0 10\n", 188 | "1 12\n", 189 | "2 15\n", 190 | "3 20\n", 191 | "4 A\n", 192 | "5 31\n", 193 | "6 C\n", 194 | "7 D\n", 195 | "dtype: object" 196 | ] 197 | }, 198 | "execution_count": 16, 199 | "metadata": {}, 200 | "output_type": "execute_result" 201 | } 202 | ], 203 | "source": [ 204 | "df = pd.Series(['10', '12', '15', '20', 'A', '31', 'C', 'D'])\n", 205 | "df" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": 18, 211 | "id": "807da86c", 212 | "metadata": {}, 213 | "outputs": [ 214 | { 215 | "data": { 216 | "text/plain": [ 217 | "0 10.0\n", 218 | "1 12.0\n", 219 | "2 15.0\n", 220 | "3 20.0\n", 221 | "4 NaN\n", 222 | "5 31.0\n", 223 | "6 NaN\n", 224 | "7 NaN\n", 225 | "dtype: float64" 226 | ] 227 | }, 228 | "execution_count": 18, 229 | "metadata": {}, 230 | "output_type": "execute_result" 231 | } 232 | ], 233 | "source": [ 234 | "pd.to_numeric(df, errors='coerce')" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": null, 240 | "id": "4bfdd542", 241 | "metadata": {}, 242 | "outputs": [], 243 | "source": [] 244 | } 245 | ], 246 | "metadata": { 247 | "kernelspec": { 248 | "display_name": "Python 3 (ipykernel)", 249 | "language": "python", 250 | "name": "python3" 251 | }, 252 | "language_info": { 253 | "codemirror_mode": { 254 | "name": "ipython", 255 | "version": 3 256 | }, 257 | "file_extension": ".py", 258 | "mimetype": "text/x-python", 259 | "name": "python", 260 | "nbconvert_exporter": "python", 261 | "pygments_lexer": "ipython3", 262 | "version": "3.9.13" 263 | } 264 | }, 265 | "nbformat": 4, 266 | "nbformat_minor": 5 267 | } 268 | --------------------------------------------------------------------------------