├── .gitignore ├── 1_log ├── income.csv ├── logarithm_in_data_analysis.ipynb └── revenue.csv ├── 3_normal_distribution ├── Exercise │ ├── bhp.csv │ ├── exercise.md │ └── exercise_solution.ipynb ├── heights.csv ├── heights_few_samples.xlsx ├── normal_distribution.ipynb └── zscore.png ├── 4_mean_percentile ├── Exercise │ ├── exercise.md │ └── percentile_exercise_solution.ipynb ├── income.csv └── median_percentile.ipynb ├── 5_log_normal_distribution ├── income.csv ├── lognormal_dist.ipynb └── usa_household_income.xls ├── 6_cosine_similarity └── cosine_similarity.ipynb ├── 7_modified_z_score ├── modified_z_score.xlsx ├── modified_z_score_tutorial.ipynb └── movie_revenues.csv └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /1_log/income.csv: -------------------------------------------------------------------------------- 1 | company,revenue 2 | Tesla ,31 3 | UBER,11 4 | Amazon,386 5 | Jindal Steel,4.7 6 | Axis Bank,5.6 7 | Vedanta,11.3 8 | -------------------------------------------------------------------------------- /1_log/logarithm_in_data_analysis.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 9, 15 | "metadata": {}, 16 | "outputs": [ 17 | { 18 | "data": { 19 | "text/html": [ 20 | "
\n", 21 | "\n", 34 | "\n", 35 | " \n", 36 | " \n", 37 | " \n", 38 | " \n", 39 | " \n", 40 | " \n", 41 | " \n", 42 | " \n", 43 | " \n", 44 | " \n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | "
companyrevenue
0Tesla31.0
1UBER11.0
2Amazon386.0
3Jindal Steel4.7
4Axis Bank5.6
5Vedanta11.3
\n", 75 | "
" 76 | ], 77 | "text/plain": [ 78 | " company revenue\n", 79 | "0 Tesla 31.0\n", 80 | "1 UBER 11.0\n", 81 | "2 Amazon 386.0\n", 82 | "3 Jindal Steel 4.7\n", 83 | "4 Axis Bank 5.6\n", 84 | "5 Vedanta 11.3" 85 | ] 86 | }, 87 | "execution_count": 9, 88 | "metadata": {}, 89 | "output_type": "execute_result" 90 | } 91 | ], 92 | "source": [ 93 | "df = pd.read_csv(\"revenue.csv\")\n", 94 | "df.head(10)" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": 6, 100 | "metadata": {}, 101 | "outputs": [ 102 | { 103 | "data": { 104 | "text/plain": [ 105 | "" 106 | ] 107 | }, 108 | "execution_count": 6, 109 | "metadata": {}, 110 | "output_type": "execute_result" 111 | }, 112 | { 113 | "data": { 114 | "image/png": "\n", 115 | "text/plain": [ 116 | "
" 117 | ] 118 | }, 119 | "metadata": { 120 | "needs_background": "light" 121 | }, 122 | "output_type": "display_data" 123 | } 124 | ], 125 | "source": [ 126 | "df.plot(x='company', y='revenue', kind='bar')" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": 8, 132 | "metadata": {}, 133 | "outputs": [ 134 | { 135 | "data": { 136 | "text/plain": [ 137 | "" 138 | ] 139 | }, 140 | "execution_count": 8, 141 | "metadata": {}, 142 | "output_type": "execute_result" 143 | }, 144 | { 145 | "data": { 146 | "image/png": "\n", 147 | "text/plain": [ 148 | "
" 149 | ] 150 | }, 151 | "metadata": { 152 | "needs_background": "light" 153 | }, 154 | "output_type": "display_data" 155 | } 156 | ], 157 | "source": [ 158 | "df.plot(x='company', y='revenue',kind='bar', logy=True)" 159 | ] 160 | } 161 | ], 162 | "metadata": { 163 | "celltoolbar": "Raw Cell Format", 164 | "kernelspec": { 165 | "display_name": "Python 3", 166 | "language": "python", 167 | "name": "python3" 168 | }, 169 | "language_info": { 170 | "codemirror_mode": { 171 | "name": "ipython", 172 | "version": 3 173 | }, 174 | "file_extension": ".py", 175 | "mimetype": "text/x-python", 176 | "name": "python", 177 | "nbconvert_exporter": "python", 178 | "pygments_lexer": "ipython3", 179 | "version": "3.8.5" 180 | } 181 | }, 182 | "nbformat": 4, 183 | "nbformat_minor": 4 184 | } 185 | -------------------------------------------------------------------------------- /1_log/revenue.csv: -------------------------------------------------------------------------------- 1 | company,revenue 2 | Tesla ,31 3 | UBER,11 4 | Amazon,386 5 | Jindal Steel,4.7 6 | Axis Bank,5.6 7 | Vedanta,11.3 8 | -------------------------------------------------------------------------------- /3_normal_distribution/Exercise/exercise.md: -------------------------------------------------------------------------------- 1 | ## Exercise (Normal Distribution and Z Score for Outlier Removal) 2 | 3 | You are given bhp.csv which contains property prices in the city of Bangalore, India. You need to examine the price_per_sqft column and do the following: 4 | 5 | 1. Remove outliers using the percentile technique first. Use [0.001, 0.999] for the lower and upper bound percentiles 6 | 1. After removing outliers in step 1, you get a new dataframe. 7 | 1. On the step (2) dataframe, use 4 standard deviations to remove outliers 8 | 1. Plot a histogram for the new dataframe that is generated after step (3). Also plot the bell curve on the same histogram 9 | 1. On the step (2) dataframe, use a zscore of 4 to remove outliers.
This is quite similar to step (3) and you will get the exact same result 10 | 11 | [Solution](https://github.com/codebasics/math-for-machine-learning/blob/main/3_normal_distribution/Exercise/exercise_solution.ipynb) -------------------------------------------------------------------------------- /3_normal_distribution/heights_few_samples.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codebasics/math-for-machine-learning/d6b97013d7787b23c41976d66c4ed35959a35e81/3_normal_distribution/heights_few_samples.xlsx -------------------------------------------------------------------------------- /3_normal_distribution/normal_distribution.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "

Normal Distribution and Z Score: Math and statistics for data science

" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 79, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import pandas as pd\n", 17 | "import seaborn as sn" 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "metadata": {}, 23 | "source": [ 24 | "We are going to use heights dataset from kaggle.com. Dataset has heights and weights \n", 25 | "both but I have removed weights to make it simple\n", 26 | "\n", 27 | "\n", 28 | "https://www.kaggle.com/mustafaali96/weight-height" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 80, 34 | "metadata": { 35 | "scrolled": true 36 | }, 37 | "outputs": [ 38 | { 39 | "data": { 40 | "text/html": [ 41 | "
\n", 42 | "\n", 55 | "\n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | "
genderheight
0Male73.847017
1Male68.781904
2Male74.110105
3Male71.730978
4Male69.881796
\n", 91 | "
" 92 | ], 93 | "text/plain": [ 94 | " gender height\n", 95 | "0 Male 73.847017\n", 96 | "1 Male 68.781904\n", 97 | "2 Male 74.110105\n", 98 | "3 Male 71.730978\n", 99 | "4 Male 69.881796" 100 | ] 101 | }, 102 | "execution_count": 80, 103 | "metadata": {}, 104 | "output_type": "execute_result" 105 | } 106 | ], 107 | "source": [ 108 | "df = pd.read_csv(\"heights.csv\")\n", 109 | "df.head()" 110 | ] 111 | }, 112 | { 113 | "cell_type": "markdown", 114 | "metadata": {}, 115 | "source": [ 116 | "**(1) Outlier detection and removal using Standard Deviation**" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": 81, 122 | "metadata": { 123 | "scrolled": false 124 | }, 125 | "outputs": [ 126 | { 127 | "data": { 128 | "text/plain": [ 129 | "count 10000.000000\n", 130 | "mean 66.367560\n", 131 | "std 3.847528\n", 132 | "min 54.263133\n", 133 | "25% 63.505620\n", 134 | "50% 66.318070\n", 135 | "75% 69.174262\n", 136 | "max 78.998742\n", 137 | "Name: height, dtype: float64" 138 | ] 139 | }, 140 | "execution_count": 81, 141 | "metadata": {}, 142 | "output_type": "execute_result" 143 | } 144 | ], 145 | "source": [ 146 | "df.height.describe()" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": 82, 152 | "metadata": { 153 | "scrolled": true 154 | }, 155 | "outputs": [ 156 | { 157 | "data": { 158 | "text/plain": [ 159 | "" 160 | ] 161 | }, 162 | "execution_count": 82, 163 | "metadata": {}, 164 | "output_type": "execute_result" 165 | }, 166 | { 167 | "data": { 168 | "image/png": "\n", 169 | "text/plain": [ 170 | "
" 171 | ] 172 | }, 173 | "metadata": { 174 | "needs_background": "light" 175 | }, 176 | "output_type": "display_data" 177 | } 178 | ], 179 | "source": [ 180 | "sn.histplot(df.height, kde=True)" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": 83, 186 | "metadata": {}, 187 | "outputs": [ 188 | { 189 | "data": { 190 | "text/plain": [ 191 | "66.367559754866" 192 | ] 193 | }, 194 | "execution_count": 83, 195 | "metadata": {}, 196 | "output_type": "execute_result" 197 | } 198 | ], 199 | "source": [ 200 | "mean = df.height.mean()\n", 201 | "mean" 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": 84, 207 | "metadata": {}, 208 | "outputs": [ 209 | { 210 | "data": { 211 | "text/plain": [ 212 | "3.847528120795573" 213 | ] 214 | }, 215 | "execution_count": 84, 216 | "metadata": {}, 217 | "output_type": "execute_result" 218 | } 219 | ], 220 | "source": [ 221 | "std_deviation = df.height.std()\n", 222 | "std_deviation" 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": 85, 228 | "metadata": {}, 229 | "outputs": [ 230 | { 231 | "data": { 232 | "text/plain": [ 233 | "54.824975392479274" 234 | ] 235 | }, 236 | "execution_count": 85, 237 | "metadata": {}, 238 | "output_type": "execute_result" 239 | } 240 | ], 241 | "source": [ 242 | "mean-3*std_deviation" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": 86, 248 | "metadata": {}, 249 | "outputs": [ 250 | { 251 | "data": { 252 | "text/plain": [ 253 | "77.91014411725271" 254 | ] 255 | }, 256 | "execution_count": 86, 257 | "metadata": {}, 258 | "output_type": "execute_result" 259 | } 260 | ], 261 | "source": [ 262 | "mean+3*std_deviation" 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": 88, 268 | "metadata": {}, 269 | "outputs": [ 270 | { 271 | "data": { 272 | "text/html": [ 273 | "
\n", 274 | "\n", 287 | "\n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | "
genderheight
994Male78.095867
1317Male78.462053
2014Male78.998742
3285Male78.528210
3757Male78.621374
6624Female54.616858
9285Female54.263133
\n", 333 | "
" 334 | ], 335 | "text/plain": [ 336 | " gender height\n", 337 | "994 Male 78.095867\n", 338 | "1317 Male 78.462053\n", 339 | "2014 Male 78.998742\n", 340 | "3285 Male 78.528210\n", 341 | "3757 Male 78.621374\n", 342 | "6624 Female 54.616858\n", 343 | "9285 Female 54.263133" 344 | ] 345 | }, 346 | "execution_count": 88, 347 | "metadata": {}, 348 | "output_type": "execute_result" 349 | } 350 | ], 351 | "source": [ 352 | "df[(df.height < 54.82) | (df.height > 77.91)]" 353 | ] 354 | }, 355 | { 356 | "cell_type": "code", 357 | "execution_count": 90, 358 | "metadata": {}, 359 | "outputs": [ 360 | { 361 | "data": { 362 | "text/plain": [ 363 | "(9993, 2)" 364 | ] 365 | }, 366 | "execution_count": 90, 367 | "metadata": {}, 368 | "output_type": "execute_result" 369 | } 370 | ], 371 | "source": [ 372 | "df_no_outlier = df[(df.height<77.91) & (df.height>54.82)]\n", 373 | "df_no_outlier.shape" 374 | ] 375 | }, 376 | { 377 | "cell_type": "markdown", 378 | "metadata": {}, 379 | "source": [ 380 | "**(2) Outlier detection and removal using Z Score**\n", 381 | "\n", 382 | "Z score is a way to achieve the same thing that we did above in part (1)\n", 383 | "\n", 384 | "Z score indicates how many standard deviations away from the mean a data point is.\n", 385 | "\n", 386 | "For example, in our case the mean is 66.37 and the standard deviation is 3.85.\n", 387 | "\n", 388 | "If the value of a data point is 77.91 then its Z score is 3 because it is 3 standard deviations away from the mean (77.91 ≈ 66.37 + 3 * 3.85)\n", 389 | "\n", 390 | "Calculate the Z Score" 391 | ] 392 | }, 393 | { 394 | "cell_type": "markdown", 395 | "metadata": {}, 396 | "source": [ 397 | "" 398 | ] 399 | }, 400 | { 401 | "cell_type": "markdown", 402 | "metadata": {}, 403 | "source": [ 404 | "Let's add a new column in our dataframe for this Z score" 405 | ] 406 | }, 407 | { 408 | "cell_type": "code", 409 | "execution_count": 91, 410 | "metadata": {}, 411 | "outputs": [ 412 | { 413 | "data": { 414 | "text/html": [ 415 | "
\n", 416 | "\n", 429 | "\n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | "
genderheightzscore
0Male73.8470171.943964
1Male68.7819040.627505
2Male74.1101052.012343
3Male71.7309781.393991
4Male69.8817960.913375
\n", 471 | "
" 472 | ], 473 | "text/plain": [ 474 | " gender height zscore\n", 475 | "0 Male 73.847017 1.943964\n", 476 | "1 Male 68.781904 0.627505\n", 477 | "2 Male 74.110105 2.012343\n", 478 | "3 Male 71.730978 1.393991\n", 479 | "4 Male 69.881796 0.913375" 480 | ] 481 | }, 482 | "execution_count": 91, 483 | "metadata": {}, 484 | "output_type": "execute_result" 485 | } 486 | ], 487 | "source": [ 488 | "df['zscore'] = ( df.height - df.height.mean() ) / df.height.std()\n", 489 | "df.head(5)" 490 | ] 491 | }, 492 | { 493 | "cell_type": "markdown", 494 | "metadata": {}, 495 | "source": [ 496 | "Above for first record with height 73.84, z score is 1.94. This means 73.84 is 1.94 standard deviation away from mean" 497 | ] 498 | }, 499 | { 500 | "cell_type": "code", 501 | "execution_count": 93, 502 | "metadata": {}, 503 | "outputs": [ 504 | { 505 | "data": { 506 | "text/plain": [ 507 | "66.367559754866" 508 | ] 509 | }, 510 | "execution_count": 93, 511 | "metadata": {}, 512 | "output_type": "execute_result" 513 | } 514 | ], 515 | "source": [ 516 | "df.height.mean()" 517 | ] 518 | }, 519 | { 520 | "cell_type": "code", 521 | "execution_count": 94, 522 | "metadata": {}, 523 | "outputs": [ 524 | { 525 | "data": { 526 | "text/plain": [ 527 | "3.847528120795573" 528 | ] 529 | }, 530 | "execution_count": 94, 531 | "metadata": {}, 532 | "output_type": "execute_result" 533 | } 534 | ], 535 | "source": [ 536 | "df.height.std()" 537 | ] 538 | }, 539 | { 540 | "cell_type": "code", 541 | "execution_count": 92, 542 | "metadata": {}, 543 | "outputs": [ 544 | { 545 | "data": { 546 | "text/plain": [ 547 | "1.9453124999999998" 548 | ] 549 | }, 550 | "execution_count": 92, 551 | "metadata": {}, 552 | "output_type": "execute_result" 553 | } 554 | ], 555 | "source": [ 556 | "(73.84-66.37)/3.84" 557 | ] 558 | }, 559 | { 560 | "cell_type": "code", 561 | "execution_count": 95, 562 | "metadata": {}, 563 | "outputs": [ 564 | { 565 | "data": { 566 | "text/html": [ 567 | "
\n", 568 | "\n", 581 | "\n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | "
genderheightzscore
994Male78.0958673.048271
1317Male78.4620533.143445
2014Male78.9987423.282934
3285Male78.5282103.160640
3757Male78.6213743.184854
\n", 623 | "
" 624 | ], 625 | "text/plain": [ 626 | " gender height zscore\n", 627 | "994 Male 78.095867 3.048271\n", 628 | "1317 Male 78.462053 3.143445\n", 629 | "2014 Male 78.998742 3.282934\n", 630 | "3285 Male 78.528210 3.160640\n", 631 | "3757 Male 78.621374 3.184854" 632 | ] 633 | }, 634 | "execution_count": 95, 635 | "metadata": {}, 636 | "output_type": "execute_result" 637 | } 638 | ], 639 | "source": [ 640 | "df[df['zscore']>3]" 641 | ] 642 | }, 643 | { 644 | "cell_type": "code", 645 | "execution_count": 96, 646 | "metadata": { 647 | "scrolled": true 648 | }, 649 | "outputs": [ 650 | { 651 | "data": { 652 | "text/html": [ 653 | "
\n", 654 | "\n", 667 | "\n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | "
genderheightzscore
6624Female54.616858-3.054091
9285Female54.263133-3.146027
\n", 691 | "
" 692 | ], 693 | "text/plain": [ 694 | " gender height zscore\n", 695 | "6624 Female 54.616858 -3.054091\n", 696 | "9285 Female 54.263133 -3.146027" 697 | ] 698 | }, 699 | "execution_count": 96, 700 | "metadata": {}, 701 | "output_type": "execute_result" 702 | } 703 | ], 704 | "source": [ 705 | "df[df['zscore']<-3]" 706 | ] 707 | }, 708 | { 709 | "cell_type": "markdown", 710 | "metadata": {}, 711 | "source": [ 712 | "

Exercise

" 713 | ] 714 | }, 715 | { 716 | "cell_type": "markdown", 717 | "metadata": {}, 718 | "source": [ 719 | "You are given bhp.csv which contains property prices in the city of Bangalore, India. You need to examine the price_per_sqft column and do the following:\n", 720 | "\n", 721 | "(1) Remove outliers using the percentile technique first. Use [0.001, 0.999] for the lower and upper bound percentiles\n", 722 | "\n", 723 | "(2) After removing outliers in step 1, you get a new dataframe.\n", 724 | "\n", 725 | "(3) On the step (2) dataframe, use 4 standard deviations to remove outliers\n", 726 | "\n", 727 | "(4) Plot a histogram for the new dataframe that is generated after step (3). Also plot the bell curve on the same histogram\n", 728 | "\n", 729 | "(5) On the step (2) dataframe, use a zscore of 4 to remove outliers. This is quite similar to step (3) and you will get the exact same result" 730 | ] 731 | }, 732 | { 733 | "cell_type": "code", 734 | "execution_count": null, 735 | "metadata": {}, 736 | "outputs": [], 737 | "source": [] 738 | } 739 | ], 740 | "metadata": { 741 | "kernelspec": { 742 | "display_name": "Python 3", 743 | "language": "python", 744 | "name": "python3" 745 | }, 746 | "language_info": { 747 | "codemirror_mode": { 748 | "name": "ipython", 749 | "version": 3 750 | }, 751 | "file_extension": ".py", 752 | "mimetype": "text/x-python", 753 | "name": "python", 754 | "nbconvert_exporter": "python", 755 | "pygments_lexer": "ipython3", 756 | "version": "3.8.5" 757 | } 758 | }, 759 | "nbformat": 4, 760 | "nbformat_minor": 4 761 | } 762 | -------------------------------------------------------------------------------- /3_normal_distribution/zscore.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codebasics/math-for-machine-learning/d6b97013d7787b23c41976d66c4ed35959a35e81/3_normal_distribution/zscore.png -------------------------------------------------------------------------------- /4_mean_percentile/Exercise/exercise.md:
-------------------------------------------------------------------------------- 1 | ## Exercise: Median, Mean, Percentile 2 | 3 | Use this Airbnb New York City [data set](https://www.kaggle.com/dgomonov/new-york-city-airbnb-open-data/data) and remove outliers using percentiles based on the price per night for a given apartment/home. You can use suitable upper and lower percentile limits based on your intuition. Your goal is to come up with a new pandas dataframe that doesn't have the outliers present in it. 4 | 5 | [Solution](https://github.com/codebasics/math-for-machine-learning/blob/main/4_mean_percentile/Exercise/percentile_exercise_solution.ipynb) -------------------------------------------------------------------------------- /4_mean_percentile/income.csv: -------------------------------------------------------------------------------- 1 | Name,Monthly Income ($) 2 | Rob,5000 3 | Rafiq,6000 4 | Nina,4000 5 | Sofia,7500 6 | Mohan,8000 7 | Tao,7000 8 | Elon Musk,10000000 -------------------------------------------------------------------------------- /4_mean_percentile/median_percentile.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 76, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "import numpy as np" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 99, 16 | "metadata": { 17 | "scrolled": true 18 | }, 19 | "outputs": [ 20 | { 21 | "data": { 22 | "text/html": [ 23 | "
\n", 24 | "\n", 37 | "\n", 38 | " \n", 39 | " \n", 40 | " \n", 41 | " \n", 42 | " \n", 43 | " \n", 44 | " \n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | "
nameincome
0Rob5000
1Rafiq6000
2Nina4000
3Sofia7500
4Mohan8000
5Tao7000
6Elon Musk10000000
\n", 83 | "
" 84 | ], 85 | "text/plain": [ 86 | " name income\n", 87 | "0 Rob 5000\n", 88 | "1 Rafiq 6000\n", 89 | "2 Nina 4000\n", 90 | "3 Sofia 7500\n", 91 | "4 Mohan 8000\n", 92 | "5 Tao 7000\n", 93 | "6 Elon Musk 10000000" 94 | ] 95 | }, 96 | "execution_count": 99, 97 | "metadata": {}, 98 | "output_type": "execute_result" 99 | } 100 | ], 101 | "source": [ 102 | "df = pd.read_csv(\"income.csv\", names=[\"name\",\"income\"], skiprows=[0])\n", 103 | "df" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": 12, 109 | "metadata": {}, 110 | "outputs": [ 111 | { 112 | "data": { 113 | "text/plain": [ 114 | "count 7.000000e+00\n", 115 | "mean 1.433929e+06\n", 116 | "std 3.777283e+06\n", 117 | "min 4.000000e+03\n", 118 | "25% 5.500000e+03\n", 119 | "50% 7.000000e+03\n", 120 | "75% 7.750000e+03\n", 121 | "max 1.000000e+07\n", 122 | "Name: income, dtype: float64" 123 | ] 124 | }, 125 | "execution_count": 12, 126 | "metadata": {}, 127 | "output_type": "execute_result" 128 | } 129 | ], 130 | "source": [ 131 | "df.income.describe()" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": 54, 137 | "metadata": { 138 | "scrolled": true 139 | }, 140 | "outputs": [ 141 | { 142 | "data": { 143 | "text/plain": [ 144 | "4000.0" 145 | ] 146 | }, 147 | "execution_count": 54, 148 | "metadata": {}, 149 | "output_type": "execute_result" 150 | } 151 | ], 152 | "source": [ 153 | "df.income.quantile(0)" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": 63, 159 | "metadata": {}, 160 | "outputs": [ 161 | { 162 | "data": { 163 | "text/plain": [ 164 | "6000" 165 | ] 166 | }, 167 | "execution_count": 63, 168 | "metadata": {}, 169 | "output_type": "execute_result" 170 | } 171 | ], 172 | "source": [ 173 | "df.income.quantile(0.25,interpolation=\"higher\")" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": 62, 179 | "metadata": { 180 | "scrolled": true 181 | }, 182 | "outputs": [ 183 | { 184 | "data": { 185 | 
"text/plain": [ 186 | "7000" 187 | ] 188 | }, 189 | "execution_count": 62, 190 | "metadata": {}, 191 | "output_type": "execute_result" 192 | } 193 | ], 194 | "source": [ 195 | "df.income.quantile(0.5,interpolation=\"higher\")" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": 53, 201 | "metadata": {}, 202 | "outputs": [ 203 | { 204 | "data": { 205 | "text/plain": [ 206 | "7750.0" 207 | ] 208 | }, 209 | "execution_count": 53, 210 | "metadata": {}, 211 | "output_type": "execute_result" 212 | } 213 | ], 214 | "source": [ 215 | "df.income.quantile(0.75)" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": 55, 221 | "metadata": {}, 222 | "outputs": [ 223 | { 224 | "data": { 225 | "text/plain": [ 226 | "10000000.0" 227 | ] 228 | }, 229 | "execution_count": 55, 230 | "metadata": {}, 231 | "output_type": "execute_result" 232 | } 233 | ], 234 | "source": [ 235 | "df.income.quantile(1)" 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": 72, 241 | "metadata": {}, 242 | "outputs": [ 243 | { 244 | "data": { 245 | "text/plain": [ 246 | "9400479.999999994" 247 | ] 248 | }, 249 | "execution_count": 72, 250 | "metadata": {}, 251 | "output_type": "execute_result" 252 | } 253 | ], 254 | "source": [ 255 | "percentile_99 = df.income.quantile(0.99)\n", 256 | "percentile_99" 257 | ] 258 | }, 259 | { 260 | "cell_type": "code", 261 | "execution_count": 73, 262 | "metadata": { 263 | "scrolled": true 264 | }, 265 | "outputs": [ 266 | { 267 | "data": { 268 | "text/html": [ 269 | "
\n", 270 | "\n", 283 | "\n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | "
nameincome
6Elon Musk10000000
\n", 299 | "
" 300 | ], 301 | "text/plain": [ 302 | " name income\n", 303 | "6 Elon Musk 10000000" 304 | ] 305 | }, 306 | "execution_count": 73, 307 | "metadata": {}, 308 | "output_type": "execute_result" 309 | } 310 | ], 311 | "source": [ 312 | "df[df.income>percentile_99]" 313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": 74, 318 | "metadata": {}, 319 | "outputs": [ 320 | { 321 | "data": { 322 | "text/html": [ 323 | "
\n", 324 | "\n", 337 | "\n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | "
nameincome
0Rob5000
1Rafiq6000
2Nina4000
3Sofia7500
4Mohan8000
5Tao7000
6Elon Musk10000000
\n", 383 | "
" 384 | ], 385 | "text/plain": [ 386 | " name income\n", 387 | "0 Rob 5000\n", 388 | "1 Rafiq 6000\n", 389 | "2 Nina 4000\n", 390 | "3 Sofia 7500\n", 391 | "4 Mohan 8000\n", 392 | "5 Tao 7000\n", 393 | "6 Elon Musk 10000000" 394 | ] 395 | }, 396 | "execution_count": 74, 397 | "metadata": {}, 398 | "output_type": "execute_result" 399 | } 400 | ], 401 | "source": [ 402 | "df" 403 | ] 404 | }, 405 | { 406 | "cell_type": "code", 407 | "execution_count": 108, 408 | "metadata": {}, 409 | "outputs": [ 410 | { 411 | "name": "stderr", 412 | "output_type": "stream", 413 | "text": [ 414 | ":1: SettingWithCopyWarning: \n", 415 | "A value is trying to be set on a copy of a slice from a DataFrame\n", 416 | "\n", 417 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", 418 | " df['income'][3]=np.NaN\n", 419 | "C:\\Program Files\\Python38\\lib\\site-packages\\pandas\\core\\indexing.py:205: SettingWithCopyWarning: \n", 420 | "A value is trying to be set on a copy of a slice from a DataFrame\n", 421 | "\n", 422 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", 423 | " self._setitem_with_indexer(indexer, value)\n" 424 | ] 425 | } 426 | ], 427 | "source": [ 428 | "df['income'][3]=np.NaN" 429 | ] 430 | }, 431 | { 432 | "cell_type": "code", 433 | "execution_count": 109, 434 | "metadata": {}, 435 | "outputs": [ 436 | { 437 | "data": { 438 | "text/html": [ 439 | "
\n", 440 | "\n", 453 | "\n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | "
nameincome
0Rob5000.0
1Rafiq6000.0
2Nina4000.0
3SofiaNaN
4Mohan8000.0
5Tao7000.0
6Elon Musk10000000.0
\n", 499 | "
" 500 | ], 501 | "text/plain": [ 502 | " name income\n", 503 | "0 Rob 5000.0\n", 504 | "1 Rafiq 6000.0\n", 505 | "2 Nina 4000.0\n", 506 | "3 Sofia NaN\n", 507 | "4 Mohan 8000.0\n", 508 | "5 Tao 7000.0\n", 509 | "6 Elon Musk 10000000.0" 510 | ] 511 | }, 512 | "execution_count": 109, 513 | "metadata": {}, 514 | "output_type": "execute_result" 515 | } 516 | ], 517 | "source": [ 518 | "df" 519 | ] 520 | }, 521 | { 522 | "cell_type": "code", 523 | "execution_count": 112, 524 | "metadata": {}, 525 | "outputs": [ 526 | { 527 | "data": { 528 | "text/plain": [ 529 | "1671666.6666666667" 530 | ] 531 | }, 532 | "execution_count": 112, 533 | "metadata": {}, 534 | "output_type": "execute_result" 535 | } 536 | ], 537 | "source": [ 538 | "df.income.mean()" 539 | ] 540 | }, 541 | { 542 | "cell_type": "code", 543 | "execution_count": 110, 544 | "metadata": {}, 545 | "outputs": [ 546 | { 547 | "data": { 548 | "text/html": [ 549 | "
\n", 550 | "\n", 563 | "\n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | "
nameincome
0Rob5.000000e+03
1Rafiq6.000000e+03
2Nina4.000000e+03
3Sofia1.671667e+06
4Mohan8.000000e+03
5Tao7.000000e+03
6Elon Musk1.000000e+07
\n", 609 | "
" 610 | ], 611 | "text/plain": [ 612 | " name income\n", 613 | "0 Rob 5.000000e+03\n", 614 | "1 Rafiq 6.000000e+03\n", 615 | "2 Nina 4.000000e+03\n", 616 | "3 Sofia 1.671667e+06\n", 617 | "4 Mohan 8.000000e+03\n", 618 | "5 Tao 7.000000e+03\n", 619 | "6 Elon Musk 1.000000e+07" 620 | ] 621 | }, 622 | "execution_count": 110, 623 | "metadata": {}, 624 | "output_type": "execute_result" 625 | } 626 | ], 627 | "source": [ 628 | "df_new = df.fillna(df.income.mean())\n", 629 | "df_new" 630 | ] 631 | }, 632 | { 633 | "cell_type": "code", 634 | "execution_count": 111, 635 | "metadata": {}, 636 | "outputs": [ 637 | { 638 | "data": { 639 | "text/html": [ 640 | "
\n", 641 | "\n", 654 | "\n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | " \n", 694 | " \n", 695 | " \n", 696 | " \n", 697 | " \n", 698 | " \n", 699 | "
nameincome
0Rob5000.0
1Rafiq6000.0
2Nina4000.0
3Sofia6500.0
4Mohan8000.0
5Tao7000.0
6Elon Musk10000000.0
\n", 700 | "
" 701 | ], 702 | "text/plain": [ 703 | " name income\n", 704 | "0 Rob 5000.0\n", 705 | "1 Rafiq 6000.0\n", 706 | "2 Nina 4000.0\n", 707 | "3 Sofia 6500.0\n", 708 | "4 Mohan 8000.0\n", 709 | "5 Tao 7000.0\n", 710 | "6 Elon Musk 10000000.0" 711 | ] 712 | }, 713 | "execution_count": 111, 714 | "metadata": {}, 715 | "output_type": "execute_result" 716 | } 717 | ], 718 | "source": [ 719 | "df_new = df.fillna(df.income.median())\n", 720 | "df_new" 721 | ] 722 | } 723 | ], 724 | "metadata": { 725 | "kernelspec": { 726 | "display_name": "Python 3", 727 | "language": "python", 728 | "name": "python3" 729 | }, 730 | "language_info": { 731 | "codemirror_mode": { 732 | "name": "ipython", 733 | "version": 3 734 | }, 735 | "file_extension": ".py", 736 | "mimetype": "text/x-python", 737 | "name": "python", 738 | "nbconvert_exporter": "python", 739 | "pygments_lexer": "ipython3", 740 | "version": "3.8.5" 741 | } 742 | }, 743 | "nbformat": 4, 744 | "nbformat_minor": 4 745 | } 746 | -------------------------------------------------------------------------------- /5_log_normal_distribution/income.csv: -------------------------------------------------------------------------------- 1 | income($),count 2 | 5000,4371 3 | 10000,3295 4 | 15000,5825 5 | 20000,6047 6 | 25000,6097 7 | 30000,5738 8 | 35000,6100 9 | 40000,5720 10 | 45000,5098 11 | 50000,4991 12 | 55000,5152 13 | 60000,4194 14 | 65000,4411 15 | 70000,3709 16 | 75000,3811 17 | 80000,3766 18 | 85000,3565 19 | 90000,3035 20 | 95000,2753 21 | 100000,2644 22 | 105000,2692 23 | 110000,2180 24 | 115000,2278 25 | 120000,1932 26 | 125000,1992 27 | 130000,1656 28 | 135000,1547 29 | 140000,1424 30 | 145000,1342 31 | 150000,1134 32 | 155000,1489 33 | 160000,1006 34 | 165000,1085 35 | 170000,784 36 | 175000,869 37 | 180000,849 38 | 185000,730 39 | 190000,688 40 | 195000,697 41 | 200000,508 42 | -------------------------------------------------------------------------------- /5_log_normal_distribution/lognormal_dist.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "import seaborn as sns\n", 11 | "import numpy as np" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 21, 17 | "metadata": { 18 | "scrolled": false 19 | }, 20 | "outputs": [ 21 | { 22 | "data": { 23 | "text/html": [ 24 | "
\n", 25 | "\n", 38 | "\n", 39 | " \n", 40 | " \n", 41 | " \n", 42 | " \n", 43 | " \n", 44 | " \n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | "
incomecount
050004371
1100003295
2150005825
3200006047
4250006097
\n", 74 | "
" 75 | ], 76 | "text/plain": [ 77 | " income count\n", 78 | "0 5000 4371\n", 79 | "1 10000 3295\n", 80 | "2 15000 5825\n", 81 | "3 20000 6047\n", 82 | "4 25000 6097" 83 | ] 84 | }, 85 | "execution_count": 21, 86 | "metadata": {}, 87 | "output_type": "execute_result" 88 | } 89 | ], 90 | "source": [ 91 | "df = pd.read_csv(\n", 92 | " \"income.csv\", \n", 93 | " index_col=None, \n", 94 | " names=[\"income\",\"count\"], \n", 95 | " skiprows=1\n", 96 | ")\n", 97 | "df.head()" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": 24, 103 | "metadata": {}, 104 | "outputs": [ 105 | { 106 | "data": { 107 | "image/png": "\n", 108 | "text/plain": [ 109 | "
" 110 | ] 111 | }, 112 | "metadata": {}, 113 | "output_type": "display_data" 114 | } 115 | ], 116 | "source": [ 117 | "sns.set(rc={'figure.figsize':(11.7,8.27)})\n", 118 | "g = sns.barplot(x='income',y='count',data=df)\n", 119 | "g.set_xticklabels(g.get_xticklabels(), \n", 120 | " rotation=45, \n", 121 | " horizontalalignment='right');" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": 25, 127 | "metadata": {}, 128 | "outputs": [ 129 | { 130 | "data": { 131 | "image/png": "\n", 132 | "text/plain": [ 133 | "
" 134 | ] 135 | }, 136 | "metadata": {}, 137 | "output_type": "display_data" 138 | } 139 | ], 140 | "source": [ 141 | "sns.set(rc={'figure.figsize':(11.7,8.27)})\n", 142 | "g = sns.barplot(x='income',y='count',data=df)\n", 143 | "g.set_xticklabels(g.get_xticklabels(), \n", 144 | " rotation=45, \n", 145 | " horizontalalignment='right');\n", 146 | "g.set(xscale=\"log\");" 147 | ] 148 | } 149 | ], 150 | "metadata": { 151 | "kernelspec": { 152 | "display_name": "Python 3", 153 | "language": "python", 154 | "name": "python3" 155 | }, 156 | "language_info": { 157 | "codemirror_mode": { 158 | "name": "ipython", 159 | "version": 3 160 | }, 161 | "file_extension": ".py", 162 | "mimetype": "text/x-python", 163 | "name": "python", 164 | "nbconvert_exporter": "python", 165 | "pygments_lexer": "ipython3", 166 | "version": "3.8.5" 167 | } 168 | }, 169 | "nbformat": 4, 170 | "nbformat_minor": 4 171 | } 172 | -------------------------------------------------------------------------------- /5_log_normal_distribution/usa_household_income.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codebasics/math-for-machine-learning/d6b97013d7787b23c41976d66c4ed35959a35e81/5_log_normal_distribution/usa_household_income.xls -------------------------------------------------------------------------------- /6_cosine_similarity/cosine_similarity.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 3, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from sklearn.metrics.pairwise import cosine_similarity, cosine_distances" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 7, 15 | "metadata": {}, 16 | "outputs": [ 17 | { 18 | "data": { 19 | "text/plain": [ 20 | "array([[1.]])" 21 | ] 22 | }, 23 | "execution_count": 7, 24 | "metadata": {}, 25 | "output_type": "execute_result" 26 | } 27 | ], 
28 | "source": [ 29 | "cosine_similarity([[3,1]],[[6,2]])" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 8, 35 | "metadata": {}, 36 | "outputs": [ 37 | { 38 | "data": { 39 | "text/plain": [ 40 | "array([[0.]])" 41 | ] 42 | }, 43 | "execution_count": 8, 44 | "metadata": {}, 45 | "output_type": "execute_result" 46 | } 47 | ], 48 | "source": [ 49 | "cosine_similarity([[3,0]],[[0,8]])" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 9, 55 | "metadata": {}, 56 | "outputs": [ 57 | { 58 | "data": { 59 | "text/plain": [ 60 | "array([[0.96476382]])" 61 | ] 62 | }, 63 | "execution_count": 9, 64 | "metadata": {}, 65 | "output_type": "execute_result" 66 | } 67 | ], 68 | "source": [ 69 | "cosine_similarity([[3,1]],[[3,2]])" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 10, 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [ 78 | "doc1 = \"\"\"\n", 79 | "iphone sales contributed to 70% of revenue. iphone demand is increasing by 20% yoy. \n", 80 | "the main competitor phone galaxy recorded 5% less growth compared to iphone\"\n", 81 | "\"\"\"\n", 82 | "\n", 83 | "doc2 = \"\"\"\n", 84 | "The upside pressure on volumes for the iPhone 12 series, historical outperformance \n", 85 | "in the July-September time period heading into launch event, and further catalysts in relation\n", 86 | "to outperformance for iPhone 13 volumes relative to lowered investor expectations implies a \n", 87 | "very attractive set up for the shares.\n", 88 | "\"\"\"\n", 89 | "\n", 90 | "doc3 = \"\"\"\n", 91 | "samsung's flagship product galaxy is able to penetrate more into asian markets compared to\n", 92 | "iphone. galaxy is redesigned with new look that appeals young demographics. 
60% of samsung revenues\n", 93 | "are coming from galaxy phone sales\n", 94 | "\"\"\"\n", 95 | "\n", 96 | "doc4 = \"\"\"\n", 97 | "Samsung Electronics unveils its Galaxy S21 flagship, with modest spec improvements \n", 98 | "and a significantly lower price point. Galaxy S21 price is lower by ~20% (much like the iPhone 12A), \n", 99 | "which highlights Samsung's focus on boosting shipments and regaining market share.\n", 100 | "\"\"\"" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": 15, 106 | "metadata": {}, 107 | "outputs": [ 108 | { 109 | "data": { 110 | "text/html": [ 111 | "
\n", 112 | "\n", 125 | "\n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | "
iPhonegalaxy
doc131
doc220
doc313
doc412
\n", 156 | "
" 157 | ], 158 | "text/plain": [ 159 | " iPhone galaxy\n", 160 | "doc1 3 1\n", 161 | "doc2 2 0\n", 162 | "doc3 1 3\n", 163 | "doc4 1 2" 164 | ] 165 | }, 166 | "execution_count": 15, 167 | "metadata": {}, 168 | "output_type": "execute_result" 169 | } 170 | ], 171 | "source": [ 172 | "import pandas as pd\n", 173 | "\n", 174 | "df = pd.DataFrame([\n", 175 | " {'iPhone': 3,'galaxy': 1},\n", 176 | " {'iPhone': 2,'galaxy': 0},\n", 177 | " {'iPhone': 1,'galaxy': 3},\n", 178 | " {'iPhone': 1,'galaxy': 2},\n", 179 | " ],\n", 180 | " index=[\n", 181 | " \"doc1\",\n", 182 | " \"doc2\",\n", 183 | " \"doc3\",\n", 184 | " \"doc4\"\n", 185 | " ])\n", 186 | "\n", 187 | "df" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": 20, 193 | "metadata": {}, 194 | "outputs": [ 195 | { 196 | "data": { 197 | "text/html": [ 198 | "
\n", 199 | "\n", 212 | "\n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | "
iPhonegalaxy
doc131
\n", 228 | "
" 229 | ], 230 | "text/plain": [ 231 | " iPhone galaxy\n", 232 | "doc1 3 1" 233 | ] 234 | }, 235 | "execution_count": 20, 236 | "metadata": {}, 237 | "output_type": "execute_result" 238 | } 239 | ], 240 | "source": [ 241 | "df.loc[\"doc1\":\"doc1\"]" 242 | ] 243 | }, 244 | { 245 | "cell_type": "code", 246 | "execution_count": 21, 247 | "metadata": {}, 248 | "outputs": [ 249 | { 250 | "data": { 251 | "text/plain": [ 252 | "array([[0.9486833]])" 253 | ] 254 | }, 255 | "execution_count": 21, 256 | "metadata": {}, 257 | "output_type": "execute_result" 258 | } 259 | ], 260 | "source": [ 261 | "cosine_similarity(df.loc[\"doc1\":\"doc1\"],df.loc[\"doc2\":\"doc2\"])" 262 | ] 263 | }, 264 | { 265 | "cell_type": "code", 266 | "execution_count": 22, 267 | "metadata": {}, 268 | "outputs": [ 269 | { 270 | "data": { 271 | "text/plain": [ 272 | "array([[0.6]])" 273 | ] 274 | }, 275 | "execution_count": 22, 276 | "metadata": {}, 277 | "output_type": "execute_result" 278 | } 279 | ], 280 | "source": [ 281 | "cosine_similarity(df.loc[\"doc1\":\"doc1\"],df.loc[\"doc3\":\"doc3\"])" 282 | ] 283 | }, 284 | { 285 | "cell_type": "code", 286 | "execution_count": 23, 287 | "metadata": {}, 288 | "outputs": [ 289 | { 290 | "data": { 291 | "text/plain": [ 292 | "array([[0.98994949]])" 293 | ] 294 | }, 295 | "execution_count": 23, 296 | "metadata": {}, 297 | "output_type": "execute_result" 298 | } 299 | ], 300 | "source": [ 301 | "cosine_similarity(df.loc[\"doc3\":\"doc3\"],df.loc[\"doc4\":\"doc4\"])" 302 | ] 303 | }, 304 | { 305 | "cell_type": "code", 306 | "execution_count": 24, 307 | "metadata": {}, 308 | "outputs": [ 309 | { 310 | "data": { 311 | "text/plain": [ 312 | "array([[0.70710678]])" 313 | ] 314 | }, 315 | "execution_count": 24, 316 | "metadata": {}, 317 | "output_type": "execute_result" 318 | } 319 | ], 320 | "source": [ 321 | "cosine_similarity(df.loc[\"doc1\":\"doc1\"],df.loc[\"doc4\":\"doc4\"])" 322 | ] 323 | }, 324 | { 325 | "cell_type": "code", 326 | "execution_count": 25, 327 
| "metadata": {}, 328 | "outputs": [ 329 | { 330 | "data": { 331 | "text/plain": [ 332 | "array([[0.29289322]])" 333 | ] 334 | }, 335 | "execution_count": 25, 336 | "metadata": {}, 337 | "output_type": "execute_result" 338 | } 339 | ], 340 | "source": [ 341 | "cosine_distances(df.loc[\"doc1\":\"doc1\"],df.loc[\"doc4\":\"doc4\"])" 342 | ] 343 | }, 344 | { 345 | "cell_type": "code", 346 | "execution_count": 26, 347 | "metadata": {}, 348 | "outputs": [ 349 | { 350 | "data": { 351 | "text/plain": [ 352 | "0.29289321999999995" 353 | ] 354 | }, 355 | "execution_count": 26, 356 | "metadata": {}, 357 | "output_type": "execute_result" 358 | } 359 | ], 360 | "source": [ 361 | "1-0.70710678" 362 | ] 363 | } 364 | ], 365 | "metadata": { 366 | "kernelspec": { 367 | "display_name": "Python 3", 368 | "language": "python", 369 | "name": "python3" 370 | }, 371 | "language_info": { 372 | "codemirror_mode": { 373 | "name": "ipython", 374 | "version": 3 375 | }, 376 | "file_extension": ".py", 377 | "mimetype": "text/x-python", 378 | "name": "python", 379 | "nbconvert_exporter": "python", 380 | "pygments_lexer": "ipython3", 381 | "version": "3.8.5" 382 | } 383 | }, 384 | "nbformat": 4, 385 | "nbformat_minor": 4 386 | } 387 | -------------------------------------------------------------------------------- /7_modified_z_score/modified_z_score.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codebasics/math-for-machine-learning/d6b97013d7787b23c41976d66c4ed35959a35e81/7_modified_z_score/modified_z_score.xlsx -------------------------------------------------------------------------------- /7_modified_z_score/modified_z_score_tutorial.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "

Modified Z Score Tutorial

" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "I took following file and sampled few rows to demonstrate modified Z score\n", 15 | "\n", 16 | "datasource: https://www.kaggle.com/tmdb/tmdb-movie-metadata/version/2?select=tmdb_5000_movies.csv" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 2, 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "import pandas as pd\n", 26 | "import numpy as np" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 3, 32 | "metadata": {}, 33 | "outputs": [ 34 | { 35 | "data": { 36 | "text/html": [ 37 | "
\n", 38 | "\n", 51 | "\n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | "
budgetgenreshomepageidkeywordsoriginal_languageoriginal_titleoverviewpopularityproduction_companiesproduction_countriesrelease_daterevenueruntimespoken_languagesstatustaglinetitlevote_averagevote_count
0237000000[{\"id\": 28, \"name\": \"Action\"}, {\"id\": 12, \"nam...http://www.avatarmovie.com/19995[{\"id\": 1463, \"name\": \"culture clash\"}, {\"id\":...enAvatarIn the 22nd century, a paraplegic Marine is di...150.437577[{\"name\": \"Ingenious Film Partners\", \"id\": 289...[{\"iso_3166_1\": \"US\", \"name\": \"United States o...12/10/20092787965087162[{\"iso_639_1\": \"en\", \"name\": \"English\"}, {\"iso...ReleasedEnter the World of Pandora.Avatar7.211800
154000000[{\"id\": 35, \"name\": \"Comedy\"}, {\"id\": 10749, \"...http://www.youmeanddupree.com/1819[{\"id\": 1253, \"name\": \"roommate\"}, {\"id\": 2038...enYou, Me and DupreeAfter standing in as best man for his longtime...18.600367[{\"name\": \"Universal Pictures\", \"id\": 33}, {\"n...[{\"iso_3166_1\": \"US\", \"name\": \"United States o...7/14/2006130431368108[{\"iso_639_1\": \"en\", \"name\": \"English\"}]ReleasedTwo's company. Dupree's a crowd.You, Me and Dupree5.4407
221000000[{\"id\": 18, \"name\": \"Drama\"}, {\"id\": 53, \"name...NaN2575[{\"id\": 246, \"name\": \"dancing\"}, {\"id\": 470, \"...enThe Tailor of PanamaA British spy is banished to Panama after havi...7.047975[{\"name\": \"Columbia Pictures\", \"id\": 5}, {\"nam...[{\"iso_3166_1\": \"IE\", \"name\": \"Ireland\"}, {\"is...2/11/200128008462109[{\"iso_639_1\": \"en\", \"name\": \"English\"}, {\"iso...ReleasedIn a place this treacherous, what a good spy n...The Tailor of Panama6.292
31000000[{\"id\": 80, \"name\": \"Crime\"}, {\"id\": 18, \"name...NaN26791[]enBrigham CityWes Clayton is a lawman and a bishop in a Morm...0.280083[{\"name\": \"Main Street Movie Company\", \"id\": 6...[{\"iso_3166_1\": \"US\", \"name\": \"United States o...3/30/2001852206119[{\"iso_639_1\": \"en\", \"name\": \"English\"}]ReleasedNaNBrigham City7.34
4100000000[{\"id\": 18, \"name\": \"Drama\"}, {\"id\": 878, \"nam...NaN644[{\"id\": 310, \"name\": \"artificial intelligence\"...enA.I. Artificial IntelligenceA robotic boy, the first programmed to love, D...34.035114[{\"name\": \"DreamWorks SKG\", \"id\": 27}, {\"name\"...[{\"iso_3166_1\": \"US\", \"name\": \"United States o...6/29/2001235926552146[{\"iso_639_1\": \"en\", \"name\": \"English\"}]ReleasedDavid is 11 years old. He weighs 60 pounds. He...A.I. Artificial Intelligence6.81974
\n", 195 | "
" 196 | ], 197 | "text/plain": [ 198 | " budget genres \\\n", 199 | "0 237000000 [{\"id\": 28, \"name\": \"Action\"}, {\"id\": 12, \"nam... \n", 200 | "1 54000000 [{\"id\": 35, \"name\": \"Comedy\"}, {\"id\": 10749, \"... \n", 201 | "2 21000000 [{\"id\": 18, \"name\": \"Drama\"}, {\"id\": 53, \"name... \n", 202 | "3 1000000 [{\"id\": 80, \"name\": \"Crime\"}, {\"id\": 18, \"name... \n", 203 | "4 100000000 [{\"id\": 18, \"name\": \"Drama\"}, {\"id\": 878, \"nam... \n", 204 | "\n", 205 | " homepage id \\\n", 206 | "0 http://www.avatarmovie.com/ 19995 \n", 207 | "1 http://www.youmeanddupree.com/ 1819 \n", 208 | "2 NaN 2575 \n", 209 | "3 NaN 26791 \n", 210 | "4 NaN 644 \n", 211 | "\n", 212 | " keywords original_language \\\n", 213 | "0 [{\"id\": 1463, \"name\": \"culture clash\"}, {\"id\":... en \n", 214 | "1 [{\"id\": 1253, \"name\": \"roommate\"}, {\"id\": 2038... en \n", 215 | "2 [{\"id\": 246, \"name\": \"dancing\"}, {\"id\": 470, \"... en \n", 216 | "3 [] en \n", 217 | "4 [{\"id\": 310, \"name\": \"artificial intelligence\"... en \n", 218 | "\n", 219 | " original_title \\\n", 220 | "0 Avatar \n", 221 | "1 You, Me and Dupree \n", 222 | "2 The Tailor of Panama \n", 223 | "3 Brigham City \n", 224 | "4 A.I. Artificial Intelligence \n", 225 | "\n", 226 | " overview popularity \\\n", 227 | "0 In the 22nd century, a paraplegic Marine is di... 150.437577 \n", 228 | "1 After standing in as best man for his longtime... 18.600367 \n", 229 | "2 A British spy is banished to Panama after havi... 7.047975 \n", 230 | "3 Wes Clayton is a lawman and a bishop in a Morm... 0.280083 \n", 231 | "4 A robotic boy, the first programmed to love, D... 34.035114 \n", 232 | "\n", 233 | " production_companies \\\n", 234 | "0 [{\"name\": \"Ingenious Film Partners\", \"id\": 289... \n", 235 | "1 [{\"name\": \"Universal Pictures\", \"id\": 33}, {\"n... \n", 236 | "2 [{\"name\": \"Columbia Pictures\", \"id\": 5}, {\"nam... \n", 237 | "3 [{\"name\": \"Main Street Movie Company\", \"id\": 6... 
\n", 238 | "4 [{\"name\": \"DreamWorks SKG\", \"id\": 27}, {\"name\"... \n", 239 | "\n", 240 | " production_countries release_date revenue \\\n", 241 | "0 [{\"iso_3166_1\": \"US\", \"name\": \"United States o... 12/10/2009 2787965087 \n", 242 | "1 [{\"iso_3166_1\": \"US\", \"name\": \"United States o... 7/14/2006 130431368 \n", 243 | "2 [{\"iso_3166_1\": \"IE\", \"name\": \"Ireland\"}, {\"is... 2/11/2001 28008462 \n", 244 | "3 [{\"iso_3166_1\": \"US\", \"name\": \"United States o... 3/30/2001 852206 \n", 245 | "4 [{\"iso_3166_1\": \"US\", \"name\": \"United States o... 6/29/2001 235926552 \n", 246 | "\n", 247 | " runtime spoken_languages status \\\n", 248 | "0 162 [{\"iso_639_1\": \"en\", \"name\": \"English\"}, {\"iso... Released \n", 249 | "1 108 [{\"iso_639_1\": \"en\", \"name\": \"English\"}] Released \n", 250 | "2 109 [{\"iso_639_1\": \"en\", \"name\": \"English\"}, {\"iso... Released \n", 251 | "3 119 [{\"iso_639_1\": \"en\", \"name\": \"English\"}] Released \n", 252 | "4 146 [{\"iso_639_1\": \"en\", \"name\": \"English\"}] Released \n", 253 | "\n", 254 | " tagline \\\n", 255 | "0 Enter the World of Pandora. \n", 256 | "1 Two's company. Dupree's a crowd. \n", 257 | "2 In a place this treacherous, what a good spy n... \n", 258 | "3 NaN \n", 259 | "4 David is 11 years old. He weighs 60 pounds. He... \n", 260 | "\n", 261 | " title vote_average vote_count \n", 262 | "0 Avatar 7.2 11800 \n", 263 | "1 You, Me and Dupree 5.4 407 \n", 264 | "2 The Tailor of Panama 6.2 92 \n", 265 | "3 Brigham City 7.3 4 \n", 266 | "4 A.I. 
Artificial Intelligence 6.8 1974 " 267 | ] 268 | }, 269 | "execution_count": 3, 270 | "metadata": {}, 271 | "output_type": "execute_result" 272 | } 273 | ], 274 | "source": [ 275 | "df = pd.read_csv(\"movie_revenues.csv\")\n", 276 | "df.head()" 277 | ] 278 | }, 279 | { 280 | "cell_type": "code", 281 | "execution_count": 184, 282 | "metadata": { 283 | "scrolled": true 284 | }, 285 | "outputs": [ 286 | { 287 | "data": { 288 | "text/plain": [ 289 | "count 4.600000e+01\n", 290 | "mean 1.879289e+08\n", 291 | "std 4.551144e+08\n", 292 | "min 8.522060e+05\n", 293 | "25% 2.866957e+07\n", 294 | "50% 8.381714e+07\n", 295 | "75% 1.382135e+08\n", 296 | "max 2.787965e+09\n", 297 | "Name: revenue, dtype: float64" 298 | ] 299 | }, 300 | "execution_count": 184, 301 | "metadata": {}, 302 | "output_type": "execute_result" 303 | } 304 | ], 305 | "source": [ 306 | "df.revenue.describe()" 307 | ] 308 | }, 309 | { 310 | "cell_type": "markdown", 311 | "metadata": {}, 312 | "source": [ 313 | "Here revenue is an absolute dollar value. 
To avoid a large scale, we will add a new column for revenue in millions" 314 | ] 315 | }, 316 | { 317 | "cell_type": "code", 318 | "execution_count": 185, 319 | "metadata": {}, 320 | "outputs": [ 321 | { 322 | "data": { 323 | "text/plain": [ 324 | "count 46.000000\n", 325 | "mean 187.928898\n", 326 | "std 455.114423\n", 327 | "min 0.852206\n", 328 | "25% 28.669569\n", 329 | "50% 83.817142\n", 330 | "75% 138.213502\n", 331 | "max 2787.965087\n", 332 | "Name: revenue_mln, dtype: float64" 333 | ] 334 | }, 335 | "execution_count": 185, 336 | "metadata": {}, 337 | "output_type": "execute_result" 338 | } 339 | ], 340 | "source": [ 341 | "df['revenue_mln'] = df['revenue'].apply(lambda x: x/1000000)\n", 342 | "df.revenue_mln.describe()" 343 | ] 344 | }, 345 | { 346 | "cell_type": "code", 347 | "execution_count": 186, 348 | "metadata": {}, 349 | "outputs": [], 350 | "source": [ 351 | "_, mean, std, *_ = df.revenue_mln.describe()" 352 | ] 353 | }, 354 | { 355 | "cell_type": "code", 356 | "execution_count": 187, 357 | "metadata": {}, 358 | "outputs": [ 359 | { 360 | "data": { 361 | "text/plain": [ 362 | "187.92889841304347" 363 | ] 364 | }, 365 | "execution_count": 187, 366 | "metadata": {}, 367 | "output_type": "execute_result" 368 | } 369 | ], 370 | "source": [ 371 | "mean" 372 | ] 373 | }, 374 | { 375 | "cell_type": "code", 376 | "execution_count": 188, 377 | "metadata": {}, 378 | "outputs": [ 379 | { 380 | "data": { 381 | "text/plain": [ 382 | "455.1144234195408" 383 | ] 384 | }, 385 | "execution_count": 188, 386 | "metadata": {}, 387 | "output_type": "execute_result" 388 | } 389 | ], 390 | "source": [ 391 | "std" 392 | ] 393 | }, 394 | { 395 | "cell_type": "markdown", 396 | "metadata": {}, 397 | "source": [ 398 | "

Outlier detection using Z score

" 399 | ] 400 | }, 401 | { 402 | "cell_type": "code", 403 | "execution_count": 199, 404 | "metadata": {}, 405 | "outputs": [], 406 | "source": [ 407 | "def get_z_score(value, mean, std):\n", 408 | " return (value - mean)/std" 409 | ] 410 | }, 411 | { 412 | "cell_type": "code", 413 | "execution_count": 200, 414 | "metadata": {}, 415 | "outputs": [ 416 | { 417 | "data": { 418 | "text/html": [ 419 | "
\n", 420 | "\n", 433 | "\n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | "
budgetgenreshomepageidkeywordsoriginal_languageoriginal_titleoverviewpopularityproduction_companies...runtimespoken_languagesstatustaglinetitlevote_averagevote_countrevenue_mlnmod_z_scorez_score
0237000000[{\"id\": 28, \"name\": \"Action\"}, {\"id\": 12, \"nam...http://www.avatarmovie.com/19995[{\"id\": 1463, \"name\": \"culture clash\"}, {\"id\":...enAvatarIn the 22nd century, a paraplegic Marine is di...150.437577[{\"name\": \"Ingenious Film Partners\", \"id\": 289......162[{\"iso_639_1\": \"en\", \"name\": \"English\"}, {\"iso...ReleasedEnter the World of Pandora.Avatar7.2118002787.96508732.3397625.712929
154000000[{\"id\": 35, \"name\": \"Comedy\"}, {\"id\": 10749, \"...http://www.youmeanddupree.com/1819[{\"id\": 1253, \"name\": \"roommate\"}, {\"id\": 2038...enYou, Me and DupreeAfter standing in as best man for his longtime...18.600367[{\"name\": \"Universal Pictures\", \"id\": 33}, {\"n......108[{\"iso_639_1\": \"en\", \"name\": \"English\"}]ReleasedTwo's company. Dupree's a crowd.You, Me and Dupree5.4407130.4313680.557474-0.126336
221000000[{\"id\": 18, \"name\": \"Drama\"}, {\"id\": 53, \"name...NaN2575[{\"id\": 246, \"name\": \"dancing\"}, {\"id\": 470, \"...enThe Tailor of PanamaA British spy is banished to Panama after havi...7.047975[{\"name\": \"Columbia Pictures\", \"id\": 5}, {\"nam......109[{\"iso_639_1\": \"en\", \"name\": \"English\"}, {\"iso...ReleasedIn a place this treacherous, what a good spy n...The Tailor of Panama6.29228.008462-0.667434-0.351385
\n", 535 | "

3 rows × 23 columns

\n", 536 | "
" 537 | ], 538 | "text/plain": [ 539 | " budget genres \\\n", 540 | "0 237000000 [{\"id\": 28, \"name\": \"Action\"}, {\"id\": 12, \"nam... \n", 541 | "1 54000000 [{\"id\": 35, \"name\": \"Comedy\"}, {\"id\": 10749, \"... \n", 542 | "2 21000000 [{\"id\": 18, \"name\": \"Drama\"}, {\"id\": 53, \"name... \n", 543 | "\n", 544 | " homepage id \\\n", 545 | "0 http://www.avatarmovie.com/ 19995 \n", 546 | "1 http://www.youmeanddupree.com/ 1819 \n", 547 | "2 NaN 2575 \n", 548 | "\n", 549 | " keywords original_language \\\n", 550 | "0 [{\"id\": 1463, \"name\": \"culture clash\"}, {\"id\":... en \n", 551 | "1 [{\"id\": 1253, \"name\": \"roommate\"}, {\"id\": 2038... en \n", 552 | "2 [{\"id\": 246, \"name\": \"dancing\"}, {\"id\": 470, \"... en \n", 553 | "\n", 554 | " original_title overview \\\n", 555 | "0 Avatar In the 22nd century, a paraplegic Marine is di... \n", 556 | "1 You, Me and Dupree After standing in as best man for his longtime... \n", 557 | "2 The Tailor of Panama A British spy is banished to Panama after havi... \n", 558 | "\n", 559 | " popularity production_companies ... runtime \\\n", 560 | "0 150.437577 [{\"name\": \"Ingenious Film Partners\", \"id\": 289... ... 162 \n", 561 | "1 18.600367 [{\"name\": \"Universal Pictures\", \"id\": 33}, {\"n... ... 108 \n", 562 | "2 7.047975 [{\"name\": \"Columbia Pictures\", \"id\": 5}, {\"nam... ... 109 \n", 563 | "\n", 564 | " spoken_languages status \\\n", 565 | "0 [{\"iso_639_1\": \"en\", \"name\": \"English\"}, {\"iso... Released \n", 566 | "1 [{\"iso_639_1\": \"en\", \"name\": \"English\"}] Released \n", 567 | "2 [{\"iso_639_1\": \"en\", \"name\": \"English\"}, {\"iso... Released \n", 568 | "\n", 569 | " tagline title \\\n", 570 | "0 Enter the World of Pandora. Avatar \n", 571 | "1 Two's company. Dupree's a crowd. You, Me and Dupree \n", 572 | "2 In a place this treacherous, what a good spy n... 
The Tailor of Panama \n", 573 | "\n", 574 | " vote_average vote_count revenue_mln mod_z_score z_score \n", 575 | "0 7.2 11800 2787.965087 32.339762 5.712929 \n", 576 | "1 5.4 407 130.431368 0.557474 -0.126336 \n", 577 | "2 6.2 92 28.008462 -0.667434 -0.351385 \n", 578 | "\n", 579 | "[3 rows x 23 columns]" 580 | ] 581 | }, 582 | "execution_count": 200, 583 | "metadata": {}, 584 | "output_type": "execute_result" 585 | } 586 | ], 587 | "source": [ 588 | "df['z_score'] = df.revenue_mln.apply(lambda x: get_z_score(x, mean, std))\n", 589 | "df.head(3)" 590 | ] 591 | }, 592 | { 593 | "cell_type": "markdown", 594 | "metadata": {}, 595 | "source": [ 596 | "**General guideline is to treat anything that has a z score of 3 or more as an outlier**" 597 | ] 598 | }, 599 | { 600 | "cell_type": "code", 601 | "execution_count": 201, 602 | "metadata": {}, 603 | "outputs": [ 604 | { 605 | "data": { 606 | "text/html": [ 607 | "
\n", 608 | "\n", 621 | "\n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | "
budgetgenreshomepageidkeywordsoriginal_languageoriginal_titleoverviewpopularityproduction_companies...runtimespoken_languagesstatustaglinetitlevote_averagevote_countrevenue_mlnmod_z_scorez_score
0237000000[{\"id\": 28, \"name\": \"Action\"}, {\"id\": 12, \"nam...http://www.avatarmovie.com/19995[{\"id\": 1463, \"name\": \"culture clash\"}, {\"id\":...enAvatarIn the 22nd century, a paraplegic Marine is di...150.437577[{\"name\": \"Ingenious Film Partners\", \"id\": 289......162[{\"iso_639_1\": \"en\", \"name\": \"English\"}, {\"iso...ReleasedEnter the World of Pandora.Avatar7.2118002787.96508732.3397625.712929
\n", 675 | "

1 rows × 23 columns

\n", 676 | "
" 677 | ], 678 | "text/plain": [ 679 | " budget genres \\\n", 680 | "0 237000000 [{\"id\": 28, \"name\": \"Action\"}, {\"id\": 12, \"nam... \n", 681 | "\n", 682 | " homepage id \\\n", 683 | "0 http://www.avatarmovie.com/ 19995 \n", 684 | "\n", 685 | " keywords original_language \\\n", 686 | "0 [{\"id\": 1463, \"name\": \"culture clash\"}, {\"id\":... en \n", 687 | "\n", 688 | " original_title overview \\\n", 689 | "0 Avatar In the 22nd century, a paraplegic Marine is di... \n", 690 | "\n", 691 | " popularity production_companies ... runtime \\\n", 692 | "0 150.437577 [{\"name\": \"Ingenious Film Partners\", \"id\": 289... ... 162 \n", 693 | "\n", 694 | " spoken_languages status \\\n", 695 | "0 [{\"iso_639_1\": \"en\", \"name\": \"English\"}, {\"iso... Released \n", 696 | "\n", 697 | " tagline title vote_average vote_count revenue_mln \\\n", 698 | "0 Enter the World of Pandora. Avatar 7.2 11800 2787.965087 \n", 699 | "\n", 700 | " mod_z_score z_score \n", 701 | "0 32.339762 5.712929 \n", 702 | "\n", 703 | "[1 rows x 23 columns]" 704 | ] 705 | }, 706 | "execution_count": 201, 707 | "metadata": {}, 708 | "output_type": "execute_result" 709 | } 710 | ], 711 | "source": [ 712 | "df[df.z_score>3]" 713 | ] 714 | }, 715 | { 716 | "cell_type": "markdown", 717 | "metadata": {}, 718 | "source": [ 719 | "

Outlier detection using modified Z score

" 720 | ] 721 | }, 722 | { 723 | "cell_type": "markdown", 724 | "metadata": {}, 725 | "source": [ 726 | "So we got only 1 outlier using simple Z score. Based on the kind of analysis we are doing sometimes it might be better to use modified Z score as it will return more outliers." 727 | ] 728 | }, 729 | { 730 | "cell_type": "code", 731 | "execution_count": 191, 732 | "metadata": {}, 733 | "outputs": [], 734 | "source": [ 735 | "def get_mad(s):\n", 736 | " median = np.median(s)\n", 737 | " diff = abs(s-median)\n", 738 | " MAD = np.median(diff)\n", 739 | " return MAD" 740 | ] 741 | }, 742 | { 743 | "cell_type": "code", 744 | "execution_count": 192, 745 | "metadata": { 746 | "scrolled": true 747 | }, 748 | "outputs": [ 749 | { 750 | "data": { 751 | "text/plain": [ 752 | "(56.399542499999995, 83.8171415)" 753 | ] 754 | }, 755 | "execution_count": 192, 756 | "metadata": {}, 757 | "output_type": "execute_result" 758 | } 759 | ], 760 | "source": [ 761 | "MAD = get_mad(df.revenue_mln)\n", 762 | "median = np.median(df.revenue_mln)\n", 763 | "MAD, median" 764 | ] 765 | }, 766 | { 767 | "cell_type": "markdown", 768 | "metadata": {}, 769 | "source": [ 770 | "**General guideline for modified Z score is to use 3.5 as a threshold, i.e. 
anything that has a mod z score of 3.5 or more is an outlier**" 771 | ] 772 | }, 773 | { 774 | "cell_type": "code", 775 | "execution_count": 195, 776 | "metadata": {}, 777 | "outputs": [], 778 | "source": [ 779 | "def get_modified_z_score(x, median, MAD):\n", 780 | " return 0.6745*(x-median)/MAD" 781 | ] 782 | }, 783 | { 784 | "cell_type": "markdown", 785 | "metadata": {}, 786 | "source": [ 787 | "**Test modified z score function for a sample data point**" 788 | ] 789 | }, 790 | { 791 | "cell_type": "code", 792 | "execution_count": 196, 793 | "metadata": {}, 794 | "outputs": [ 795 | { 796 | "data": { 797 | "text/plain": [ 798 | "31.41931227648256" 799 | ] 800 | }, 801 | "execution_count": 196, 802 | "metadata": {}, 803 | "output_type": "execute_result" 804 | } 805 | ], 806 | "source": [ 807 | "get_modified_z_score(2711, median, MAD)" 808 | ] 809 | }, 810 | { 811 | "cell_type": "code", 812 | "execution_count": 197, 813 | "metadata": { 814 | "scrolled": true 815 | }, 816 | "outputs": [ 817 | { 818 | "data": { 819 | "text/html": [ 820 | "
\n", 821 | "\n", 834 | "\n", 835 | " \n", 836 | " \n", 837 | " \n", 838 | " \n", 839 | " \n", 840 | " \n", 841 | " \n", 842 | " \n", 843 | " \n", 844 | " \n", 845 | " \n", 846 | " \n", 847 | " \n", 848 | " \n", 849 | " \n", 850 | " \n", 851 | " \n", 852 | " \n", 853 | " \n", 854 | " \n", 855 | " \n", 856 | " \n", 857 | " \n", 858 | " \n", 859 | " \n", 860 | " \n", 861 | " \n", 862 | " \n", 863 | " \n", 864 | " \n", 865 | " \n", 866 | " \n", 867 | " \n", 868 | " \n", 869 | " \n", 870 | " \n", 871 | " \n", 872 | " \n", 873 | " \n", 874 | " \n", 875 | " \n", 876 | " \n", 877 | " \n", 878 | " \n", 879 | " \n", 880 | " \n", 881 | " \n", 882 | " \n", 883 | " \n", 884 | " \n", 885 | " \n", 886 | " \n", 887 | " \n", 888 | " \n", 889 | " \n", 890 | " \n", 891 | " \n", 892 | " \n", 893 | " \n", 894 | " \n", 895 | " \n", 896 | " \n", 897 | " \n", 898 | " \n", 899 | " \n", 900 | " \n", 901 | " \n", 902 | " \n", 903 | " \n", 904 | " \n", 905 | " \n", 906 | " \n", 907 | " \n", 908 | " \n", 909 | " \n", 910 | " \n", 911 | " \n", 912 | " \n", 913 | " \n", 914 | " \n", 915 | " \n", 916 | " \n", 917 | " \n", 918 | " \n", 919 | " \n", 920 | " \n", 921 | " \n", 922 | " \n", 923 | " \n", 924 | " \n", 925 | " \n", 926 | " \n", 927 | " \n", 928 | " \n", 929 | " \n", 930 | " \n", 931 | " \n", 932 | " \n", 933 | " \n", 934 | " \n", 935 | "
budgetgenreshomepageidkeywordsoriginal_languageoriginal_titleoverviewpopularityproduction_companies...revenueruntimespoken_languagesstatustaglinetitlevote_averagevote_countrevenue_mlnmod_z_score
0237000000[{\"id\": 28, \"name\": \"Action\"}, {\"id\": 12, \"nam...http://www.avatarmovie.com/19995[{\"id\": 1463, \"name\": \"culture clash\"}, {\"id\":...enAvatarIn the 22nd century, a paraplegic Marine is di...150.437577[{\"name\": \"Ingenious Film Partners\", \"id\": 289......2787965087162[{\"iso_639_1\": \"en\", \"name\": \"English\"}, {\"iso...ReleasedEnter the World of Pandora.Avatar7.2118002787.96508732.339762
154000000[{\"id\": 35, \"name\": \"Comedy\"}, {\"id\": 10749, \"...http://www.youmeanddupree.com/1819[{\"id\": 1253, \"name\": \"roommate\"}, {\"id\": 2038...enYou, Me and DupreeAfter standing in as best man for his longtime...18.600367[{\"name\": \"Universal Pictures\", \"id\": 33}, {\"n......130431368108[{\"iso_639_1\": \"en\", \"name\": \"English\"}]ReleasedTwo's company. Dupree's a crowd.You, Me and Dupree5.4407130.4313680.557474
221000000[{\"id\": 18, \"name\": \"Drama\"}, {\"id\": 53, \"name...NaN2575[{\"id\": 246, \"name\": \"dancing\"}, {\"id\": 470, \"...enThe Tailor of PanamaA British spy is banished to Panama after havi...7.047975[{\"name\": \"Columbia Pictures\", \"id\": 5}, {\"nam......28008462109[{\"iso_639_1\": \"en\", \"name\": \"English\"}, {\"iso...ReleasedIn a place this treacherous, what a good spy n...The Tailor of Panama6.29228.008462-0.667434
\n", 936 | "

3 rows × 22 columns

\n", 937 | "
" 938 | ], 939 | "text/plain": [ 940 | " budget genres \\\n", 941 | "0 237000000 [{\"id\": 28, \"name\": \"Action\"}, {\"id\": 12, \"nam... \n", 942 | "1 54000000 [{\"id\": 35, \"name\": \"Comedy\"}, {\"id\": 10749, \"... \n", 943 | "2 21000000 [{\"id\": 18, \"name\": \"Drama\"}, {\"id\": 53, \"name... \n", 944 | "\n", 945 | " homepage id \\\n", 946 | "0 http://www.avatarmovie.com/ 19995 \n", 947 | "1 http://www.youmeanddupree.com/ 1819 \n", 948 | "2 NaN 2575 \n", 949 | "\n", 950 | " keywords original_language \\\n", 951 | "0 [{\"id\": 1463, \"name\": \"culture clash\"}, {\"id\":... en \n", 952 | "1 [{\"id\": 1253, \"name\": \"roommate\"}, {\"id\": 2038... en \n", 953 | "2 [{\"id\": 246, \"name\": \"dancing\"}, {\"id\": 470, \"... en \n", 954 | "\n", 955 | " original_title overview \\\n", 956 | "0 Avatar In the 22nd century, a paraplegic Marine is di... \n", 957 | "1 You, Me and Dupree After standing in as best man for his longtime... \n", 958 | "2 The Tailor of Panama A British spy is banished to Panama after havi... \n", 959 | "\n", 960 | " popularity production_companies ... \\\n", 961 | "0 150.437577 [{\"name\": \"Ingenious Film Partners\", \"id\": 289... ... \n", 962 | "1 18.600367 [{\"name\": \"Universal Pictures\", \"id\": 33}, {\"n... ... \n", 963 | "2 7.047975 [{\"name\": \"Columbia Pictures\", \"id\": 5}, {\"nam... ... \n", 964 | "\n", 965 | " revenue runtime spoken_languages \\\n", 966 | "0 2787965087 162 [{\"iso_639_1\": \"en\", \"name\": \"English\"}, {\"iso... \n", 967 | "1 130431368 108 [{\"iso_639_1\": \"en\", \"name\": \"English\"}] \n", 968 | "2 28008462 109 [{\"iso_639_1\": \"en\", \"name\": \"English\"}, {\"iso... \n", 969 | "\n", 970 | " status tagline \\\n", 971 | "0 Released Enter the World of Pandora. \n", 972 | "1 Released Two's company. Dupree's a crowd. \n", 973 | "2 Released In a place this treacherous, what a good spy n... 
\n", 974 | "\n", 975 | " title vote_average vote_count revenue_mln mod_z_score \n", 976 | "0 Avatar 7.2 11800 2787.965087 32.339762 \n", 977 | "1 You, Me and Dupree 5.4 407 130.431368 0.557474 \n", 978 | "2 The Tailor of Panama 6.2 92 28.008462 -0.667434 \n", 979 | "\n", 980 | "[3 rows x 22 columns]" 981 | ] 982 | }, 983 | "execution_count": 197, 984 | "metadata": {}, 985 | "output_type": "execute_result" 986 | } 987 | ], 988 | "source": [ 989 | "df['mod_z_score'] = df.revenue_mln.apply(lambda x: get_modified_z_score(x, median, MAD))\n", 990 | "df.head(3)" 991 | ] 992 | }, 993 | { 994 | "cell_type": "markdown", 995 | "metadata": {}, 996 | "source": [ 997 | "**General guideline is to treat anything that has a modified z score of 3.5 or more as an outlier**" 998 | ] 999 | }, 1000 | { 1001 | "cell_type": "code", 1002 | "execution_count": 198, 1003 | "metadata": {}, 1004 | "outputs": [ 1005 | { 1006 | "data": { 1007 | "text/html": [ 1008 | "
\n", 1009 | "\n", 1022 | "\n", 1023 | " \n", 1024 | " \n", 1025 | " \n", 1026 | " \n", 1027 | " \n", 1028 | " \n", 1029 | " \n", 1030 | " \n", 1031 | " \n", 1032 | " \n", 1033 | " \n", 1034 | " \n", 1035 | " \n", 1036 | " \n", 1037 | " \n", 1038 | " \n", 1039 | " \n", 1040 | " \n", 1041 | " \n", 1042 | " \n", 1043 | " \n", 1044 | " \n", 1045 | " \n", 1046 | " \n", 1047 | " \n", 1048 | " \n", 1049 | " \n", 1050 | " \n", 1051 | " \n", 1052 | " \n", 1053 | " \n", 1054 | " \n", 1055 | " \n", 1056 | " \n", 1057 | " \n", 1058 | " \n", 1059 | " \n", 1060 | " \n", 1061 | " \n", 1062 | " \n", 1063 | " \n", 1064 | " \n", 1065 | " \n", 1066 | " \n", 1067 | " \n", 1068 | " \n", 1069 | " \n", 1070 | " \n", 1071 | " \n", 1072 | " \n", 1073 | " \n", 1074 | " \n", 1075 | " \n", 1076 | " \n", 1077 | " \n", 1078 | " \n", 1079 | " \n", 1080 | " \n", 1081 | " \n", 1082 | " \n", 1083 | " \n", 1084 | " \n", 1085 | " \n", 1086 | " \n", 1087 | " \n", 1088 | " \n", 1089 | " \n", 1090 | " \n", 1091 | " \n", 1092 | " \n", 1093 | " \n", 1094 | " \n", 1095 | " \n", 1096 | " \n", 1097 | " \n", 1098 | " \n", 1099 | " \n", 1100 | " \n", 1101 | " \n", 1102 | " \n", 1103 | " \n", 1104 | " \n", 1105 | " \n", 1106 | " \n", 1107 | " \n", 1108 | " \n", 1109 | " \n", 1110 | " \n", 1111 | " \n", 1112 | " \n", 1113 | " \n", 1114 | " \n", 1115 | " \n", 1116 | " \n", 1117 | " \n", 1118 | " \n", 1119 | " \n", 1120 | " \n", 1121 | " \n", 1122 | " \n", 1123 | "
budgetgenreshomepageidkeywordsoriginal_languageoriginal_titleoverviewpopularityproduction_companies...revenueruntimespoken_languagesstatustaglinetitlevote_averagevote_countrevenue_mlnmod_z_score
0237000000[{\"id\": 28, \"name\": \"Action\"}, {\"id\": 12, \"nam...http://www.avatarmovie.com/19995[{\"id\": 1463, \"name\": \"culture clash\"}, {\"id\":...enAvatarIn the 22nd century, a paraplegic Marine is di...150.437577[{\"name\": \"Ingenious Film Partners\", \"id\": 289......2787965087162[{\"iso_639_1\": \"en\", \"name\": \"English\"}, {\"iso...ReleasedEnter the World of Pandora.Avatar7.2118002787.96508732.339762
7150000000[{\"id\": 28, \"name\": \"Action\"}, {\"id\": 12, \"nam...http://www.jurassicworld.com/135397[{\"id\": 1299, \"name\": \"monster\"}, {\"id\": 1718,...enJurassic WorldTwenty-two years after the events of Jurassic ...418.708552[{\"name\": \"Universal Studios\", \"id\": 13}, {\"na......1513528810124[{\"iso_639_1\": \"en\", \"name\": \"English\"}]ReleasedThe park is open.Jurassic World6.586621513.52881017.098375
1225000000[{\"id\": 53, \"name\": \"Thriller\"}, {\"id\": 28, \"n...NaN619[{\"id\": 1156, \"name\": \"sister sister relations...enThe BodyguardA former Secret Service agent grudgingly takes...26.576385[{\"name\": \"Tig Productions\", \"id\": 335}, {\"nam......411006740129[{\"iso_639_1\": \"en\", \"name\": \"English\"}]ReleasedNever let her out of your sight. Never let you...The Bodyguard6.1661411.0067403.912964
\n", 1124 | "

3 rows × 22 columns

\n", 1125 | "
" 1126 | ], 1127 | "text/plain": [ 1128 | " budget genres \\\n", 1129 | "0 237000000 [{\"id\": 28, \"name\": \"Action\"}, {\"id\": 12, \"nam... \n", 1130 | "7 150000000 [{\"id\": 28, \"name\": \"Action\"}, {\"id\": 12, \"nam... \n", 1131 | "12 25000000 [{\"id\": 53, \"name\": \"Thriller\"}, {\"id\": 28, \"n... \n", 1132 | "\n", 1133 | " homepage id \\\n", 1134 | "0 http://www.avatarmovie.com/ 19995 \n", 1135 | "7 http://www.jurassicworld.com/ 135397 \n", 1136 | "12 NaN 619 \n", 1137 | "\n", 1138 | " keywords original_language \\\n", 1139 | "0 [{\"id\": 1463, \"name\": \"culture clash\"}, {\"id\":... en \n", 1140 | "7 [{\"id\": 1299, \"name\": \"monster\"}, {\"id\": 1718,... en \n", 1141 | "12 [{\"id\": 1156, \"name\": \"sister sister relations... en \n", 1142 | "\n", 1143 | " original_title overview \\\n", 1144 | "0 Avatar In the 22nd century, a paraplegic Marine is di... \n", 1145 | "7 Jurassic World Twenty-two years after the events of Jurassic ... \n", 1146 | "12 The Bodyguard A former Secret Service agent grudgingly takes... \n", 1147 | "\n", 1148 | " popularity production_companies ... \\\n", 1149 | "0 150.437577 [{\"name\": \"Ingenious Film Partners\", \"id\": 289... ... \n", 1150 | "7 418.708552 [{\"name\": \"Universal Studios\", \"id\": 13}, {\"na... ... \n", 1151 | "12 26.576385 [{\"name\": \"Tig Productions\", \"id\": 335}, {\"nam... ... \n", 1152 | "\n", 1153 | " revenue runtime spoken_languages \\\n", 1154 | "0 2787965087 162 [{\"iso_639_1\": \"en\", \"name\": \"English\"}, {\"iso... \n", 1155 | "7 1513528810 124 [{\"iso_639_1\": \"en\", \"name\": \"English\"}] \n", 1156 | "12 411006740 129 [{\"iso_639_1\": \"en\", \"name\": \"English\"}] \n", 1157 | "\n", 1158 | " status tagline \\\n", 1159 | "0 Released Enter the World of Pandora. \n", 1160 | "7 Released The park is open. \n", 1161 | "12 Released Never let her out of your sight. Never let you... 
\n", 1162 | "\n", 1163 | " title vote_average vote_count revenue_mln mod_z_score \n", 1164 | "0 Avatar 7.2 11800 2787.965087 32.339762 \n", 1165 | "7 Jurassic World 6.5 8662 1513.528810 17.098375 \n", 1166 | "12 The Bodyguard 6.1 661 411.006740 3.912964 \n", 1167 | "\n", 1168 | "[3 rows x 22 columns]" 1169 | ] 1170 | }, 1171 | "execution_count": 198, 1172 | "metadata": {}, 1173 | "output_type": "execute_result" 1174 | } 1175 | ], 1176 | "source": [ 1177 | "df[df.mod_z_score>3.5]" 1178 | ] 1179 | } 1180 | ], 1181 | "metadata": { 1182 | "kernelspec": { 1183 | "display_name": "Python 3", 1184 | "language": "python", 1185 | "name": "python3" 1186 | }, 1187 | "language_info": { 1188 | "codemirror_mode": { 1189 | "name": "ipython", 1190 | "version": 3 1191 | }, 1192 | "file_extension": ".py", 1193 | "mimetype": "text/x-python", 1194 | "name": "python", 1195 | "nbconvert_exporter": "python", 1196 | "pygments_lexer": "ipython3", 1197 | "version": "3.8.5" 1198 | } 1199 | }, 1200 | "nbformat": 4, 1201 | "nbformat_minor": 4 1202 | } 1203 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # math-for-machine-learning 2 | Statistics and math for machine learning and data science 3 | --------------------------------------------------------------------------------