├── .gitignore ├── .travis.yml ├── README.md ├── data ├── clean │ └── clean_housing_data.txt └── raw │ └── housing_data.txt ├── notebooks └── train_and_test_model.ipynb ├── setup.py ├── src ├── data │ ├── __init__.py │ └── preprocessing_helpers.py ├── features │ ├── __init__.py │ └── as_numpy.py ├── models │ ├── __init__.py │ └── train.py └── visualization │ ├── __init__.py │ └── plots.py └── tests ├── __init__.py ├── data ├── __init__.py └── test_preprocessing_helpers.py ├── features ├── __init__.py └── test_as_numpy.py ├── models ├── __init__.py └── test_train.py └── visualization ├── baseline ├── test_plot_for_almost_linear_data.png └── test_plot_for_linear_data.png └── test_plots.py /.gitignore: -------------------------------------------------------------------------------- 1 | ### Python template 2 | # Byte-compiled / optimized / DLL files 3 | __pycache__/ 4 | *.py[cod] 5 | *$py.class 6 | 7 | # C extensions 8 | *.so 9 | 10 | # Distribution / packaging 11 | .Python 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .coverage 43 | .coverage.* 44 | .cache 45 | nosetests.xml 46 | coverage.xml 47 | *.cover 48 | .hypothesis/ 49 | .pytest_cache/ 50 | 51 | # Translations 52 | *.mo 53 | *.pot 54 | 55 | # Django stuff: 56 | *.log 57 | local_settings.py 58 | db.sqlite3 59 | 60 | # Flask stuff: 61 | instance/ 62 | .webassets-cache 63 | 64 | # Scrapy stuff: 65 | .scrapy 66 | 67 | # Sphinx documentation 68 | docs/_build/ 69 | 70 | # PyBuilder 71 | target/ 72 | 73 | # Jupyter Notebook 74 | .ipynb_checkpoints 75 | 76 | # pyenv 77 | .python-version 78 | 79 | # celery beat schedule file 80 | celerybeat-schedule 81 | 82 | # SageMath parsed files 83 | *.sage.py 84 | 85 | # Environments 86 | .env 87 | .venv 88 | env/ 89 | venv/ 90 | ENV/ 91 | env.bak/ 92 | venv.bak/ 93 | 94 | # Spyder project settings 95 | .spyderproject 96 | .spyproject 97 | 98 | # Rope project settings 99 | .ropeproject 100 | 101 | # mkdocs documentation 102 | /site 103 | 104 | # mypy 105 | .mypy_cache/ 106 | 107 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm 108 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 109 | 110 | # User-specific stuff 111 | .idea/**/workspace.xml 112 | .idea/**/tasks.xml 113 | .idea/**/dictionaries 114 | .idea/**/shelf 115 | 116 | # Sensitive or high-churn files 117 | .idea/**/dataSources/ 118 | .idea/**/dataSources.ids 119 | .idea/**/dataSources.local.xml 120 | .idea/**/sqlDataSources.xml 121 | .idea/**/dynamic.xml 122 | .idea/**/uiDesigner.xml 123 | .idea/**/dbnavigator.xml 124 | 125 | # Gradle 126 | .idea/**/gradle.xml 127 | .idea/**/libraries 128 | 129 | # CMake 130 | cmake-build-debug/ 131 | cmake-build-release/ 132 | 133 | # Mongo Explorer plugin 134 | .idea/**/mongoSettings.xml 135 | 136 | # File-based project format 137 | 
*.iws 138 | 139 | # IntelliJ 140 | out/ 141 | 142 | # mpeltonen/sbt-idea plugin 143 | .idea_modules/ 144 | 145 | # JIRA plugin 146 | atlassian-ide-plugin.xml 147 | 148 | # Cursive Clojure plugin 149 | .idea/replstate.xml 150 | 151 | # Crashlytics plugin (for Android Studio and IntelliJ) 152 | com_crashlytics_export_strings.xml 153 | crashlytics.properties 154 | crashlytics-build.properties 155 | fabric.properties 156 | 157 | # Editor-based Rest Client 158 | .idea/httpRequests 159 | 160 | # Custom 161 | cleaned_housing_data.txt 162 | 163 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "3.6" 4 | install: 5 | - pip install -e . 6 | - pip install codecov pytest-cov 7 | script: 8 | - pytest --cov=src tests 9 | after_success: 10 | - codecov -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Build Status](https://travis-ci.com/gutfeeling/univariate-linear-regression.svg?branch=master)](https://travis-ci.com/gutfeeling/univariate-linear-regression) 2 | [![codecov](https://codecov.io/gh/gutfeeling/univariate-linear-regression/branch/master/graph/badge.svg)](https://codecov.io/gh/gutfeeling/univariate-linear-regression) 3 | 4 | 5 | This repository holds the code for the DataCamp course [Unit Testing for Data Science in Python by Dibya Chakravorty](https://www.datacamp.com/courses/unit-testing-for-data-science-in-python). 6 | 7 | Please make sure that you have installed the package `univariate_linear_regression` in this repo using `pip` before running `pytest`. Otherwise, you may get `ImportError`s. 8 | 9 | To install it, first clone the repo. 
10 | 11 | ``` 12 | git clone https://github.com/gutfeeling/univariate-linear-regression.git 13 | ``` 14 | 15 | Then install the package locally using `pip`, making sure that you are using Python version `>=3.6`. 16 | 17 | ``` 18 | pip install -e univariate-linear-regression 19 | ``` 20 | 21 | Once the installation finishes, you can run all the tests by doing 22 | 23 | ``` 24 | cd univariate-linear-regression 25 | pytest --mpl 26 | ``` 27 | 28 | -------------------------------------------------------------------------------- /data/clean/clean_housing_data.txt: -------------------------------------------------------------------------------- 1 | 1211 202119 2 | 1010 154639 3 | 1077 180166 4 | 1098 189296 5 | 1844 311538 6 | 1163 175475 7 | 1959 300118 8 | 998 160939 9 | 908 158857 10 | 1597 248964 11 | 687 123223 12 | 2639 415005 13 | 1172 192633 14 | 2008 332592 15 | 1392 219946 16 | 1565 254284 17 | 768 122012 18 | 2420 383479 19 | 925 150043 20 | 2378 382178 21 | 1380 211505 22 | 2350 361104 23 | 1461 242079 24 | 821 143722 25 | 1683 271172 26 | 1350 229442 27 | 1996 303105 28 | 1371 219537 29 | 2081 314942 30 | 1059 186606 31 | 1148 206186 32 | 1506 248419 33 | 1210 214114 34 | 1697 277794 35 | 1268 194345 36 | 2318 372162 37 | 1468 239007 38 | 2437 375962 39 | 1654 286488 40 | 2442 386660 41 | 1466 254767 42 | 830 163537 43 | 1433 233594 44 | 1201 201260 45 | 1850 307939 46 | 1409 219147 47 | 724 130206 48 | 1441 230084 49 | 1453 237715 50 | 948 165879 51 | 1146 204260 52 | 2412 395210 53 | 1971 308721 54 | 2130 320003 55 | 1132 191479 56 | 783 141789 57 | 1444 245256 58 | 2264 376339 59 | 685 141157 60 | 1915 297843 61 | 1585 262437 62 | 977 169370 63 | 1784 267766 64 | 1233 201786 65 | 917 167772 66 | 1307 197956 67 | 1413 248095 68 | 868 141446 69 | 1699 257005 70 | 1692 258013 71 | 2230 354710 72 | 1651 248954 73 | 1281 200455 74 | 1887 303967 75 | 1259 219320 76 | 1449 233292 77 | 2253 363595 78 | 1219 217003 79 | 1807 308884 80 | 1935 316434 81 | 1370 
232093 82 | 1808 284583 83 | 2040 341540 84 | 1708 292887 85 | 1589 278024 86 | 1798 306752 87 | 1808 307916 88 | 1959 307255 89 | 1746 282820 90 | 2023 308199 91 | 1667 258969 92 | 1002 156268 93 | 1031 187153 94 | 20 16903 95 | 875 131590 96 | 2055 348060 97 | 1164 200199 98 | 2210 333440 99 | 1303 219225 100 | 884 164046 101 | 1985 310744 102 | 1548 257416 103 | 1227 186854 104 | 1374 212949 105 | 1830 285744 106 | 1721 282267 107 | 1059 197995 108 | 849 144140 109 | 1447 233595 110 | 1971 315134 111 | 947 156929 112 | 1031 168408 113 | 1299 225113 114 | 1479 254790 115 | 1825 276244 116 | 2102 346046 117 | 1485 241969 118 | 1239 216360 119 | 2122 331214 120 | 1841 307679 121 | 2016 303120 122 | 1198 204100 123 | 669 117012 124 | 1716 273825 125 | 152 24831 126 | 2239 352096 127 | 1851 279282 128 | 1126 180191 129 | 1280 219627 130 | 847 133317 131 | 1373 207851 132 | 1596 278343 133 | 868 135942 134 | 825 136563 135 | 2560 396116 136 | 1440 243951 137 | 1540 260755 138 | 2941 467563 139 | 2144 344120 140 | 1594 243564 141 | 1670 287550 142 | 1426 240550 143 | 1339 201630 144 | 1301 218028 145 | 1872 282941 146 | 1839 281852 147 | 2247 370003 148 | 1527 262264 149 | 1725 271016 150 | 2281 369689 151 | 1488 227508 152 | 1533 259417 153 | 1225 223524 154 | 3042 480450 155 | 2164 343142 156 | 1919 295848 157 | 988 185590 158 | 1645 278445 159 | 1626 272805 160 | 1377 227954 161 | 2073 346592 162 | 1273 214852 163 | 2099 324153 164 | 2104 344352 165 | 1781 284295 166 | 2180 345619 167 | 1230 222889 168 | 583 87489 169 | 1597 246516 170 | 1698 259747 171 | 1228 208894 172 | 1602 257219 173 | 1082 164357 174 | 1553 247324 175 | 2311 378884 176 | 1082 191410 177 | 776 131353 178 | 546 101024 179 | 2033 322887 180 | 569 87881 181 | 1584 275613 182 | 1409 242308 183 | 1839 307818 184 | 1833 296561 185 | 912 154831 186 | 1731 293862 187 | 1916 327075 188 | 1718 264603 189 | 1878 294959 190 | 1599 246013 191 | 2341 387563 192 | 1002 165994 193 | 1103 182397 194 | 1807 
293730 195 | 1532 236177 196 | 571 124486 197 | 1540 267232 198 | 1048 161563 199 | 1812 298208 200 | 2112 327908 201 | 1224 210824 202 | 986 170353 203 | 1147 194582 204 | 1289 222672 205 | 336 83298 206 | 819 160706 207 | 1046 183460 208 | 1941 311122 209 | 646 98408 210 | 1922 322345 211 | 2107 355801 212 | 1424 243262 213 | 1105 184756 214 | 288 47636 215 | 1536 260027 216 | 856 158643 217 | 2382 397094 218 | 759 124728 219 | 1553 261591 220 | 732 115935 221 | 1325 220184 222 | 915 172258 223 | 1207 186577 224 | 1198 219105 225 | 467 109962 226 | 1369 245028 227 | 1596 253649 228 | 1494 247517 229 | 1504 262006 230 | 1488 257992 231 | 888 143057 232 | 1503 232274 233 | 1408 235137 234 | 1523 239432 235 | 2409 391230 236 | 1813 296655 237 | 1444 254380 238 | 2799 452401 239 | 851 150655 240 | 1436 248048 241 | 1429 243768 242 | 1459 243788 243 | 1979 326578 244 | 950 152943 245 | 1668 273160 246 | 823 128542 247 | 1236 208989 248 | 1948 310637 249 | 668 127455 250 | 848 127859 251 | 2243 355398 252 | 1223 212779 253 | 2235 368781 254 | 2076 313793 255 | 1850 282279 256 | 1364 208166 257 | 1086 171190 258 | 2082 315012 259 | 1611 261375 260 | 1839 284486 261 | 1761 292588 262 | 1704 284335 263 | 663 127963 264 | 1296 199360 265 | 1638 251161 266 | 796 135047 267 | 1796 282751 268 | 1036 159899 269 | 976 167701 270 | 1395 239207 271 | 2011 330769 272 | 703 109109 273 | 330 62388 274 | 1732 291819 275 | 876 138100 276 | 1753 280718 277 | 1351 204435 278 | 1757 264091 279 | 1329 217956 280 | 184 38653 281 | 2002 301362 282 | 2405 372378 283 | 1692 264326 284 | 1067 189637 285 | 1810 308675 286 | 1099 195024 287 | 1396 237618 288 | 881 138630 289 | 1809 280299 290 | 1773 294241 291 | 2161 362704 292 | 774 153588 293 | 1647 262758 294 | 2308 347329 295 | 966 157146 296 | 908 139552 297 | 1753 263512 298 | 2329 368410 299 | 1669 274065 300 | 1449 255464 301 | 1909 296662 302 | 1307 196349 303 | 1311 213926 304 | 1369 211529 305 | 1621 245180 306 | 960 163299 307 | 1605 
251516 308 | 833 160078 309 | 1562 244628 310 | 2201 332939 311 | 1647 252393 312 | 818 152404 313 | 1719 275389 314 | 665 100213 315 | 2243 370193 316 | 1000 156809 317 | 820 126165 318 | 1684 288275 319 | 1932 290077 320 | 2074 345126 321 | 1071 199764 322 | 2344 355914 323 | 1710 287114 324 | 889 161793 325 | 1538 249140 326 | 1947 311959 327 | 1168 189404 328 | 1735 285031 329 | 2011 305602 330 | 1136 189879 331 | 1748 272786 332 | 1290 210550 333 | 1019 163589 334 | 961 168183 335 | 1828 286707 336 | 1505 245588 337 | 1378 233152 338 | 1655 263515 339 | 1311 211683 340 | 1750 282814 341 | 1749 268143 342 | 841 142094 343 | 1218 205493 344 | 2335 361074 345 | 1346 222027 346 | 1157 188133 347 | 1037 193348 348 | 1641 264315 349 | 2183 353744 350 | 1120 171682 351 | 1863 307677 352 | 1540 255857 353 | 1495 238438 354 | 1673 276999 355 | 1384 235895 356 | 1579 257986 357 | 604 122586 358 | 1487 243866 359 | 2076 327189 360 | 1435 244910 361 | 1687 253410 362 | 992 179179 363 | 783 136448 364 | 1276 227642 365 | 1614 248766 366 | 915 173856 367 | 2495 400024 368 | 1309 205592 369 | 1689 260760 370 | 1874 301411 371 | 1130 182870 372 | 1117 168142 373 | 1672 258970 374 | 2248 371786 375 | 1674 285875 376 | 1489 233024 377 | 1818 293239 378 | 1614 263263 379 | 1183 210600 380 | 1807 291148 381 | 2003 316575 382 | 1370 205806 383 | 804 127884 384 | 1175 206425 385 | 1639 249805 386 | 2127 339819 387 | 795 131333 388 | 1738 267509 389 | 1135 186028 390 | 1781 299965 391 | 1708 287434 392 | 995 152068 393 | 1927 328031 394 | 935 169870 395 | 701 107089 396 | 1233 198954 397 | 1367 216170 398 | 1028 172258 399 | 1423 248794 400 | 1840 306216 401 | 1047 172223 402 | 2101 350083 403 | 1150 177974 404 | 606 107069 405 | 814 131339 406 | 1551 246499 407 | 1542 241847 408 | 1860 299339 409 | 1424 220148 410 | 1375 229082 411 | 2315 363854 412 | 1446 227304 413 | 638 99927 414 | 1609 252552 415 | 1291 203202 416 | 392 85245 417 | 1624 280863 418 | 2444 379016 419 | 1629 
245425 420 | 1378 239094 421 | 916 162555 422 | 900 171057 423 | 1534 256403 424 | 332 83030 425 | 2114 350274 426 | 967 150572 427 | 1471 238715 428 | 1936 309990 429 | 941 161309 430 | 1325 224028 431 | 1619 250394 432 | 1328 228373 433 | 1597 246839 434 | 1884 286463 435 | 1586 262247 436 | 1503 240018 437 | 1952 297753 438 | 1041 195786 439 | 2586 403911 440 | 117 55096 441 | 1912 312882 442 | 1579 274165 443 | 2493 375033 444 | 2456 373472 445 | 1530 242104 446 | 1702 273034 447 | 433 84482 448 | 1283 219341 449 | 988 150462 450 | 2206 352609 451 | 1750 267909 452 | 2003 316292 453 | 2117 330707 454 | 2037 343333 455 | 2436 384751 456 | 1505 258667 457 | 2072 347930 458 | 1988 322929 459 | 1444 246875 460 | 1022 192628 461 | 1708 292820 462 | -------------------------------------------------------------------------------- /data/raw/housing_data.txt: -------------------------------------------------------------------------------- 1 | 1,211 202,119 2 | 1,010 154,639 3 | 1,077 180,166 4 | 1,098 189,296 5 | 1,844 311,538 6 | 1,163 175,475 7 | 1,959 300,118 8 | 998 160,939 9 | 908 158,857 10 | 1,597 248,964 11 | 687 123,223 12 | 2,639 415,005 13 | 1,172 192,633 14 | 2,008 332,592 15 | 1,392 219,946 16 | 1,565 254,284 17 | 768 122,012 18 | 2,420 383,479 19 | 925 150,043 20 | 2,378 382,178 21 | 1,380 211,505 22 | 2,350 361,104 23 | 1,461 242,079 24 | 821 143,722 25 | 1,683 271,172 26 | 1,350 229,442 27 | 1,996 303,105 28 | 1,371 219,537 29 | 2,081 314,942 30 | 1,059 186,606 31 | 293,410 32 | 1,148 206,186 33 | 1,506 248,419 34 | 1,210 214,114 35 | 1,697 277,794 36 | 1,268 194,345 37 | 2,318 372,162 38 | 1,463238,765 39 | 1,468 239,007 40 | 2,437 375,962 41 | 1,654 286,488 42 | 2,442 386,660 43 | 1,466 254,767 44 | 830 163,537 45 | 1,433 233,594 46 | 1,201 201,260 47 | 1,850 307,939 48 | 1,409 219,147 49 | 724 130,206 50 | 1,841297,488 51 | 1,441 230,084 52 | 1,453 237,715 53 | 948 165,879 54 | 1,146 204,260 55 | 2,412 395,210 56 | 1,971 308,721 57 | 2,130 320,003 58 
| 1,132 191,479 59 | 783 141,789 60 | 1,444 245,256 61 | 2,264 376,339 62 | 685 141,157 63 | 1,915 297,843 64 | 1,585 262,437 65 | 977 169,370 66 | 1,784 267,766 67 | 1,233 201,786 68 | 917 167,772 69 | 1,307 197,956 70 | 1,413 248,095 71 | 1,517 264,787 72 | 868 141,446 73 | 1,699 257,005 74 | 1,692 258,013 75 | 2,230 354,710 76 | 1,651 248,954 77 | 1,281 200,455 78 | 1,887 303,967 79 | 1,259 219,320 80 | 1,449 233,292 81 | 2,253 363,595 82 | 1,219 217,003 83 | 1,807 308,884 84 | 1,935 316,434 85 | 1,370 232,093 86 | 1,808 284,583 87 | 2,040 341,540 88 | 1,708 292,887 89 | 1,589 278,024 90 | 1,798 306,752 91 | 1,808 307,916 92 | 1,959 307,255 93 | 1,595 242,641 94 | 1,746 282,820 95 | 2,023 308,199 96 | 1,667 258,969 97 | 1,002 156,268 98 | 1,031 187,153 99 | 20 16,903 100 | 875 131,590 101 | 2,055 348,060 102 | 339,457 103 | 1,164 200,199 104 | 2,210 333,440 105 | 1,303 219,225 106 | 884 164,046 107 | 1,985 310,744 108 | 2,607428,391 109 | 1,548 257,416 110 | 1,227 186,854 111 | 1,374 212,949 112 | 1,830 285,744 113 | 1,721 282,267 114 | 1,059 197,995 115 | 849 144,140 116 | 1,447 233,595 117 | 1,971 315,134 118 | 1,770294,633 119 | 947 156,929 120 | 1,031 168,408 121 | 1,299 225,113 122 | 1,479 254,790 123 | 1,825 276,244 124 | 2,102 346,046 125 | 1,485 241,969 126 | 1,239 216,360 127 | 2,122 331,214 128 | 1,841 307,679 129 | 2,016 303,120 130 | 1,198 204,100 131 | 669 117,012 132 | 1,716 273,825 133 | 152 24,831 134 | 2,239 352,096 135 | 1,851 279,282 136 | 1,573 249,289 137 | 1,126 180,191 138 | 1,280 219,627 139 | 847 133,317 140 | 1,373 207,851 141 | 1,596 278,343 142 | 868 135,942 143 | 825 136,563 144 | 2,560 396,116 145 | 1,440 243,951 146 | 1,540 260,755 147 | 2,941 467,563 148 | 1,621271,984 149 | 2,144 344,120 150 | 1,594 243,564 151 | 1,670 287,550 152 | 1,426 240,550 153 | 1,339 201,630 154 | 1,301 218,028 155 | 1,820274,947 156 | 1,872 282,941 157 | 127,679 158 | 1,839 281,852 159 | 2,247 370,003 160 | 1,527 262,264 161 | 1,725 271,016 162 | 2,281 
369,689 163 | 1,488 227,508 164 | 1,533 259,417 165 | 1,225 223,524 166 | 3,042 480,450 167 | 2,164 343,142 168 | 1,919 295,848 169 | 988 185,590 170 | 1,645 278,445 171 | 1,626 272,805 172 | 1,377 227,954 173 | 2,073 346,592 174 | 1,273 214,852 175 | 2,099 324,153 176 | 2,104 344,352 177 | 1,781 284,295 178 | 2,180 345,619 179 | 1,230 222,889 180 | 583 87,489 181 | 1,597 246,516 182 | 1,698 259,747 183 | 1,228 208,894 184 | 1,602 257,219 185 | 1,082 164,357 186 | 1,553 247,324 187 | 2,311 378,884 188 | 1,082 191,410 189 | 776 131,353 190 | 546 101,024 191 | 2,033 322,887 192 | 569 87,881 193 | 1,584 275,613 194 | 1,409 242,308 195 | 1,839 307,818 196 | 1,833 296,561 197 | 912 154,831 198 | 1,731 293,862 199 | 1,916 327,075 200 | 1,718 264,603 201 | 1,878 294,959 202 | 1,599 246,013 203 | 2,341 387,563 204 | 1,002 165,994 205 | 1,103 182,397 206 | 1,807 293,730 207 | 1,532 236,177 208 | 571 124,486 209 | 1,540 267,232 210 | 1,048 161,563 211 | 1,812 298,208 212 | 2,112 327,908 213 | 1,224 210,824 214 | 986 170,353 215 | 1,147 194,582 216 | 1,289 222,672 217 | 336 83,298 218 | 819 160,706 219 | 1,046 183,460 220 | 1,941 311,122 221 | 646 98,408 222 | 1,922 322,345 223 | 2,107 355,801 224 | 1,424 243,262 225 | 1,895316,624 226 | 1,105 184,756 227 | 288 47,636 228 | 1,536 260,027 229 | 856 158,643 230 | 2,382 397,094 231 | 759 124,728 232 | 1,553 261,591 233 | 732 115,935 234 | 1,325 220,184 235 | 915 172,258 236 | 1,207 186,577 237 | 1,698262,462 238 | 1,198 219,105 239 | 467 109,962 240 | 1,369 245,028 241 | 1,596 253,649 242 | 1,494 247,517 243 | 1,504 262,006 244 | 1,488 257,992 245 | 1,568268,366 246 | 888 143,057 247 | 1,503 232,274 248 | 1,468 225,446 249 | 1,408 235,137 250 | 1,523 239,432 251 | 2,409 391,230 252 | 1,813 296,655 253 | 355,946 254 | 1,444 254,380 255 | 2,799 452,401 256 | 851 150,655 257 | 1,436 248,048 258 | 1,429 243,768 259 | 1,459 243,788 260 | 1,979 326,578 261 | 2,179350,907 262 | 950 152,943 263 | 1,668 273,160 264 | 823 128,542 265 | 
1,236 208,989 266 | 1,948 310,637 267 | 668 127,455 268 | 848 127,859 269 | 2,243 355,398 270 | 1,223 212,779 271 | 2,235 368,781 272 | 2,076 313,793 273 | 1,850 282,279 274 | 1,364 208,166 275 | 1,086 171,190 276 | 2,082 315,012 277 | 1,611 261,375 278 | 26,953 279 | 1,839 284,486 280 | 1,761 292,588 281 | 1,704 284,335 282 | 663 127,963 283 | 1,296 199,360 284 | 1,638 251,161 285 | 796 135,047 286 | 1,796 282,751 287 | 1,036 159,899 288 | 976 167,701 289 | 1,395 239,207 290 | 2,011 330,769 291 | 703 109,109 292 | 330 62,388 293 | 1,732 291,819 294 | 876 138,100 295 | 1,753 280,718 296 | 1,351 204,435 297 | 1,757 264,091 298 | 1,329 217,956 299 | 184 38,653 300 | 2,002 301,362 301 | 2,405 372,378 302 | 1,692 264,326 303 | 1,067 189,637 304 | 1,810 308,675 305 | 1,099 195,024 306 | 1,396 237,618 307 | 881 138,630 308 | 1,809 280,299 309 | 1,773 294,241 310 | 2,161 362,704 311 | 774 153,588 312 | 1,647 262,758 313 | 2,308 347,329 314 | 966 157,146 315 | 908 139,552 316 | 1,753 263,512 317 | 2,329 368,410 318 | 1,669 274,065 319 | 1,449 255,464 320 | 1,271203,723 321 | 1,909 296,662 322 | 1,307 196,349 323 | 1,311 213,926 324 | 1,369 211,529 325 | 1,621 245,180 326 | 960 163,299 327 | 1,605 251,516 328 | 833 160,078 329 | 1,562 244,628 330 | 2,201 332,939 331 | 1,647 252,393 332 | 818 152,404 333 | 1,719 275,389 334 | 665 100,213 335 | 2,243 370,193 336 | 1,000 156,809 337 | 820 126,165 338 | 416,283 339 | 1,684 288,275 340 | 1,932 290,077 341 | 2,074 345,126 342 | 1,071 199,764 343 | 2,344 355,914 344 | 385,393 345 | 1,710 287,114 346 | 1,439222,555 347 | 889 161,793 348 | 1,538 249,140 349 | 1,947 311,959 350 | 1,168 189,404 351 | 1,735 285,031 352 | 2,011 305,602 353 | 1,136 189,879 354 | 1,748 272,786 355 | 1,290 210,550 356 | 1,019 163,589 357 | 410,024 358 | 961 168,183 359 | 1,828 286,707 360 | 1,505 245,588 361 | 272,957 362 | 335,746 363 | 1,378 233,152 364 | 1,655 263,515 365 | 1,311 211,683 366 | 1,750 282,814 367 | 154,454 368 | 1,749 268,143 369 | 841 
142,094 370 | 1,218 205,493 371 | 2,335 361,074 372 | 1,346 222,027 373 | 1,157 188,133 374 | 1,037 193,348 375 | 1,641 264,315 376 | 2,183 353,744 377 | 1,120 171,682 378 | 1,863 307,677 379 | 1,540 255,857 380 | 1,495 238,438 381 | 1,673 276,999 382 | -266 -38,609 383 | 1,384 235,895 384 | 1,579 257,986 385 | 604 122,586 386 | 102,065 387 | 1,487 243,866 388 | 2,076 327,189 389 | 1,435 244,910 390 | 1,687 253,410 391 | 992 179,179 392 | 783 136,448 393 | 1,276 227,642 394 | 1,614 248,766 395 | 915 173,856 396 | 2,495 400,024 397 | 1,309 205,592 398 | 1,689 260,760 399 | 1,874 301,411 400 | 1,618261,075 401 | 1,130 182,870 402 | 1,117 168,142 403 | 1,672 258,970 404 | 2,248 371,786 405 | 1,674 285,875 406 | 1,489 233,024 407 | 1,818 293,239 408 | 1,614 263,263 409 | 1,183 210,600 410 | 364,392 411 | 1,807 291,148 412 | 2,003 316,575 413 | 1,370 205,806 414 | 804 127,884 415 | 1,175 206,425 416 | 1,639 249,805 417 | 2,127 339,819 418 | 197,343 419 | 795 131,333 420 | 1,738 267,509 421 | 1,135 186,028 422 | 1,781 299,965 423 | 1,708 287,434 424 | 995 152,068 425 | 1,927 328,031 426 | 935 169,870 427 | 701 107,089 428 | 1,233 198,954 429 | 1,367 216,170 430 | 1,028 172,258 431 | 1,386 218,497 432 | 2,344377,075 433 | 1,423 248,794 434 | 1,840 306,216 435 | 1,047 172,223 436 | 2,101 350,083 437 | 1,150 177,974 438 | 606 107,069 439 | 814 131,339 440 | 1,551 246,499 441 | 1,542 241,847 442 | 1,860 299,339 443 | 1,424 220,148 444 | 1,375 229,082 445 | 2,315 363,854 446 | 1,446 227,304 447 | 1,488257,742 448 | 638 99,927 449 | 1,609 252,552 450 | 1,291 203,202 451 | 392 85,245 452 | 1,624 280,863 453 | 2,444 379,016 454 | 1,629 245,425 455 | 1,378 239,094 456 | 916 162,555 457 | 900 171,057 458 | 1,534 256,403 459 | 332 83,030 460 | 2,114 350,274 461 | 967 150,572 462 | 1,471 238,715 463 | 1,936 309,990 464 | 941 161,309 465 | 1,325 224,028 466 | 978151,487 467 | 1,619 250,394 468 | 1,328 228,373 469 | 1,597 246,839 470 | 1,884 286,463 471 | 1,586 262,247 472 | 1,503 
240,018 473 | 1,952 297,753 474 | 1,041 195,786 475 | 2,586 403,911 476 | 117 55,096 477 | 1,912 312,882 478 | 1,579 274,165 479 | 2,493 375,033 480 | 2,456 373,472 481 | 1,530 242,104 482 | 1,702 273,034 483 | 433 84,482 484 | 1,283 219,341 485 | 988 150,462 486 | 2,206 352,609 487 | 1,750 267,909 488 | 2,003 316,292 489 | 2,117 330,707 490 | 2,037 343,333 491 | 2,436 384,751 492 | 1,505 258,667 493 | 2,072 347,930 494 | 1,988 322,929 495 | 1,444 246,875 496 | 359,642 497 | 1,124 195,300 498 | 1,022 192,628 499 | 1,708 292,820 500 | 316,771 501 | -------------------------------------------------------------------------------- /notebooks/train_and_test_model.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 4, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%matplotlib notebook" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 5, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "from pathlib import Path\n", 19 | "\n", 20 | "from data.preprocessing_helpers import preprocess\n", 21 | "from features.as_numpy import get_data_as_numpy_array\n", 22 | "from models.train import split_into_training_and_testing_sets, train_model, model_test\n", 23 | "from visualization.plots import get_plot_for_best_fit_line" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 6, 29 | "metadata": {}, 30 | "outputs": [ 31 | { 32 | "name": "stdout", 33 | "output_type": "stream", 34 | "text": [ 35 | "Slope: 150.16584581528764, Intercept: 19199.572584126232\n", 36 | "R Square of fit on the testing set is 0.980457404006585\n" 37 | ] 38 | }, 39 | { 40 | "data": { 41 | "application/javascript": [ 42 | "/* Put everything inside the global mpl namespace */\n", 43 | "window.mpl = {};\n", 44 | "\n", 45 | "\n", 46 | "mpl.get_websocket_type = function() {\n", 47 | " if (typeof(WebSocket) !== 'undefined') {\n", 48 | " return 
WebSocket;\n", 49 | " } else if (typeof(MozWebSocket) !== 'undefined') {\n", 50 | " return MozWebSocket;\n", 51 | " } else {\n", 52 | " alert('Your browser does not have WebSocket support.' +\n", 53 | " 'Please try Chrome, Safari or Firefox ≥ 6. ' +\n", 54 | " 'Firefox 4 and 5 are also supported but you ' +\n", 55 | " 'have to enable WebSockets in about:config.');\n", 56 | " };\n", 57 | "}\n", 58 | "\n", 59 | "mpl.figure = function(figure_id, websocket, ondownload, parent_element) {\n", 60 | " this.id = figure_id;\n", 61 | "\n", 62 | " this.ws = websocket;\n", 63 | "\n", 64 | " this.supports_binary = (this.ws.binaryType != undefined);\n", 65 | "\n", 66 | " if (!this.supports_binary) {\n", 67 | " var warnings = document.getElementById(\"mpl-warnings\");\n", 68 | " if (warnings) {\n", 69 | " warnings.style.display = 'block';\n", 70 | " warnings.textContent = (\n", 71 | " \"This browser does not support binary websocket messages. \" +\n", 72 | " \"Performance may be slow.\");\n", 73 | " }\n", 74 | " }\n", 75 | "\n", 76 | " this.imageObj = new Image();\n", 77 | "\n", 78 | " this.context = undefined;\n", 79 | " this.message = undefined;\n", 80 | " this.canvas = undefined;\n", 81 | " this.rubberband_canvas = undefined;\n", 82 | " this.rubberband_context = undefined;\n", 83 | " this.format_dropdown = undefined;\n", 84 | "\n", 85 | " this.image_mode = 'full';\n", 86 | "\n", 87 | " this.root = $('
');\n", 88 | " this._root_extra_style(this.root)\n", 89 | " this.root.attr('style', 'display: inline-block');\n", 90 | "\n", 91 | " $(parent_element).append(this.root);\n", 92 | "\n", 93 | " this._init_header(this);\n", 94 | " this._init_canvas(this);\n", 95 | " this._init_toolbar(this);\n", 96 | "\n", 97 | " var fig = this;\n", 98 | "\n", 99 | " this.waiting = false;\n", 100 | "\n", 101 | " this.ws.onopen = function () {\n", 102 | " fig.send_message(\"supports_binary\", {value: fig.supports_binary});\n", 103 | " fig.send_message(\"send_image_mode\", {});\n", 104 | " if (mpl.ratio != 1) {\n", 105 | " fig.send_message(\"set_dpi_ratio\", {'dpi_ratio': mpl.ratio});\n", 106 | " }\n", 107 | " fig.send_message(\"refresh\", {});\n", 108 | " }\n", 109 | "\n", 110 | " this.imageObj.onload = function() {\n", 111 | " if (fig.image_mode == 'full') {\n", 112 | " // Full images could contain transparency (where diff images\n", 113 | " // almost always do), so we need to clear the canvas so that\n", 114 | " // there is no ghosting.\n", 115 | " fig.context.clearRect(0, 0, fig.canvas.width, fig.canvas.height);\n", 116 | " }\n", 117 | " fig.context.drawImage(fig.imageObj, 0, 0);\n", 118 | " };\n", 119 | "\n", 120 | " this.imageObj.onunload = function() {\n", 121 | " fig.ws.close();\n", 122 | " }\n", 123 | "\n", 124 | " this.ws.onmessage = this._make_on_message_function(this);\n", 125 | "\n", 126 | " this.ondownload = ondownload;\n", 127 | "}\n", 128 | "\n", 129 | "mpl.figure.prototype._init_header = function() {\n", 130 | " var titlebar = $(\n", 131 | " '
');\n", 133 | " var titletext = $(\n", 134 | " '
');\n", 136 | " titlebar.append(titletext)\n", 137 | " this.root.append(titlebar);\n", 138 | " this.header = titletext[0];\n", 139 | "}\n", 140 | "\n", 141 | "\n", 142 | "\n", 143 | "mpl.figure.prototype._canvas_extra_style = function(canvas_div) {\n", 144 | "\n", 145 | "}\n", 146 | "\n", 147 | "\n", 148 | "mpl.figure.prototype._root_extra_style = function(canvas_div) {\n", 149 | "\n", 150 | "}\n", 151 | "\n", 152 | "mpl.figure.prototype._init_canvas = function() {\n", 153 | " var fig = this;\n", 154 | "\n", 155 | " var canvas_div = $('
');\n", 156 | "\n", 157 | " canvas_div.attr('style', 'position: relative; clear: both; outline: 0');\n", 158 | "\n", 159 | " function canvas_keyboard_event(event) {\n", 160 | " return fig.key_event(event, event['data']);\n", 161 | " }\n", 162 | "\n", 163 | " canvas_div.keydown('key_press', canvas_keyboard_event);\n", 164 | " canvas_div.keyup('key_release', canvas_keyboard_event);\n", 165 | " this.canvas_div = canvas_div\n", 166 | " this._canvas_extra_style(canvas_div)\n", 167 | " this.root.append(canvas_div);\n", 168 | "\n", 169 | " var canvas = $('');\n", 170 | " canvas.addClass('mpl-canvas');\n", 171 | " canvas.attr('style', \"left: 0; top: 0; z-index: 0; outline: 0\")\n", 172 | "\n", 173 | " this.canvas = canvas[0];\n", 174 | " this.context = canvas[0].getContext(\"2d\");\n", 175 | "\n", 176 | " var backingStore = this.context.backingStorePixelRatio ||\n", 177 | "\tthis.context.webkitBackingStorePixelRatio ||\n", 178 | "\tthis.context.mozBackingStorePixelRatio ||\n", 179 | "\tthis.context.msBackingStorePixelRatio ||\n", 180 | "\tthis.context.oBackingStorePixelRatio ||\n", 181 | "\tthis.context.backingStorePixelRatio || 1;\n", 182 | "\n", 183 | " mpl.ratio = (window.devicePixelRatio || 1) / backingStore;\n", 184 | "\n", 185 | " var rubberband = $('');\n", 186 | " rubberband.attr('style', \"position: absolute; left: 0; top: 0; z-index: 1;\")\n", 187 | "\n", 188 | " var pass_mouse_events = true;\n", 189 | "\n", 190 | " canvas_div.resizable({\n", 191 | " start: function(event, ui) {\n", 192 | " pass_mouse_events = false;\n", 193 | " },\n", 194 | " resize: function(event, ui) {\n", 195 | " fig.request_resize(ui.size.width, ui.size.height);\n", 196 | " },\n", 197 | " stop: function(event, ui) {\n", 198 | " pass_mouse_events = true;\n", 199 | " fig.request_resize(ui.size.width, ui.size.height);\n", 200 | " },\n", 201 | " });\n", 202 | "\n", 203 | " function mouse_event_fn(event) {\n", 204 | " if (pass_mouse_events)\n", 205 | " return fig.mouse_event(event, 
event['data']);\n", 206 | " }\n", 207 | "\n", 208 | " rubberband.mousedown('button_press', mouse_event_fn);\n", 209 | " rubberband.mouseup('button_release', mouse_event_fn);\n", 210 | " // Throttle sequential mouse events to 1 every 20ms.\n", 211 | " rubberband.mousemove('motion_notify', mouse_event_fn);\n", 212 | "\n", 213 | " rubberband.mouseenter('figure_enter', mouse_event_fn);\n", 214 | " rubberband.mouseleave('figure_leave', mouse_event_fn);\n", 215 | "\n", 216 | " canvas_div.on(\"wheel\", function (event) {\n", 217 | " event = event.originalEvent;\n", 218 | " event['data'] = 'scroll'\n", 219 | " if (event.deltaY < 0) {\n", 220 | " event.step = 1;\n", 221 | " } else {\n", 222 | " event.step = -1;\n", 223 | " }\n", 224 | " mouse_event_fn(event);\n", 225 | " });\n", 226 | "\n", 227 | " canvas_div.append(canvas);\n", 228 | " canvas_div.append(rubberband);\n", 229 | "\n", 230 | " this.rubberband = rubberband;\n", 231 | " this.rubberband_canvas = rubberband[0];\n", 232 | " this.rubberband_context = rubberband[0].getContext(\"2d\");\n", 233 | " this.rubberband_context.strokeStyle = \"#000000\";\n", 234 | "\n", 235 | " this._resize_canvas = function(width, height) {\n", 236 | " // Keep the size of the canvas, canvas container, and rubber band\n", 237 | " // canvas in synch.\n", 238 | " canvas_div.css('width', width)\n", 239 | " canvas_div.css('height', height)\n", 240 | "\n", 241 | " canvas.attr('width', width * mpl.ratio);\n", 242 | " canvas.attr('height', height * mpl.ratio);\n", 243 | " canvas.attr('style', 'width: ' + width + 'px; height: ' + height + 'px;');\n", 244 | "\n", 245 | " rubberband.attr('width', width);\n", 246 | " rubberband.attr('height', height);\n", 247 | " }\n", 248 | "\n", 249 | " // Set the figure to an initial 600x600px, this will subsequently be updated\n", 250 | " // upon first draw.\n", 251 | " this._resize_canvas(600, 600);\n", 252 | "\n", 253 | " // Disable right mouse context menu.\n", 254 | " 
$(this.rubberband_canvas).bind(\"contextmenu\",function(e){\n", 255 | " return false;\n", 256 | " });\n", 257 | "\n", 258 | " function set_focus () {\n", 259 | " canvas.focus();\n", 260 | " canvas_div.focus();\n", 261 | " }\n", 262 | "\n", 263 | " window.setTimeout(set_focus, 100);\n", 264 | "}\n", 265 | "\n", 266 | "mpl.figure.prototype._init_toolbar = function() {\n", 267 | " var fig = this;\n", 268 | "\n", 269 | " var nav_element = $('
')\n", 270 | " nav_element.attr('style', 'width: 100%');\n", 271 | " this.root.append(nav_element);\n", 272 | "\n", 273 | " // Define a callback function for later on.\n", 274 | " function toolbar_event(event) {\n", 275 | " return fig.toolbar_button_onclick(event['data']);\n", 276 | " }\n", 277 | " function toolbar_mouse_event(event) {\n", 278 | " return fig.toolbar_button_onmouseover(event['data']);\n", 279 | " }\n", 280 | "\n", 281 | " for(var toolbar_ind in mpl.toolbar_items) {\n", 282 | " var name = mpl.toolbar_items[toolbar_ind][0];\n", 283 | " var tooltip = mpl.toolbar_items[toolbar_ind][1];\n", 284 | " var image = mpl.toolbar_items[toolbar_ind][2];\n", 285 | " var method_name = mpl.toolbar_items[toolbar_ind][3];\n", 286 | "\n", 287 | " if (!name) {\n", 288 | " // put a spacer in here.\n", 289 | " continue;\n", 290 | " }\n", 291 | " var button = $('');\n", 1508 | " button.click(method_name, toolbar_event);\n", 1509 | " button.mouseover(tooltip, toolbar_mouse_event);\n", 1510 | " nav_element.append(button);\n", 1511 | " }\n", 1512 | "\n", 1513 | " // Add the status bar.\n", 1514 | " var status_bar = $('');\n", 1515 | " nav_element.append(status_bar);\n", 1516 | " this.message = status_bar[0];\n", 1517 | "\n", 1518 | " // Add the close button to the window.\n", 1519 | " var buttongrp = $('
');\n", 1520 | " var button = $('');\n", 1521 | " button.click(function (evt) { fig.handle_close(fig, {}); } );\n", 1522 | " button.mouseover('Stop Interaction', toolbar_mouse_event);\n", 1523 | " buttongrp.append(button);\n", 1524 | " var titlebar = this.root.find($('.ui-dialog-titlebar'));\n", 1525 | " titlebar.prepend(buttongrp);\n", 1526 | "}\n", 1527 | "\n", 1528 | "mpl.figure.prototype._root_extra_style = function(el){\n", 1529 | " var fig = this\n", 1530 | " el.on(\"remove\", function(){\n", 1531 | "\tfig.close_ws(fig, {});\n", 1532 | " });\n", 1533 | "}\n", 1534 | "\n", 1535 | "mpl.figure.prototype._canvas_extra_style = function(el){\n", 1536 | " // this is important to make the div 'focusable\n", 1537 | " el.attr('tabindex', 0)\n", 1538 | " // reach out to IPython and tell the keyboard manager to turn it's self\n", 1539 | " // off when our div gets focus\n", 1540 | "\n", 1541 | " // location in version 3\n", 1542 | " if (IPython.notebook.keyboard_manager) {\n", 1543 | " IPython.notebook.keyboard_manager.register_events(el);\n", 1544 | " }\n", 1545 | " else {\n", 1546 | " // location in version 2\n", 1547 | " IPython.keyboard_manager.register_events(el);\n", 1548 | " }\n", 1549 | "\n", 1550 | "}\n", 1551 | "\n", 1552 | "mpl.figure.prototype._key_event_extra = function(event, name) {\n", 1553 | " var manager = IPython.notebook.keyboard_manager;\n", 1554 | " if (!manager)\n", 1555 | " manager = IPython.keyboard_manager;\n", 1556 | "\n", 1557 | " // Check for shift+enter\n", 1558 | " if (event.shiftKey && event.which == 13) {\n", 1559 | " this.canvas_div.blur();\n", 1560 | " event.shiftKey = false;\n", 1561 | " // Send a \"J\" for go to next cell\n", 1562 | " event.which = 74;\n", 1563 | " event.keyCode = 74;\n", 1564 | " manager.command_mode();\n", 1565 | " manager.handle_keydown(event);\n", 1566 | " }\n", 1567 | "}\n", 1568 | "\n", 1569 | "mpl.figure.prototype.handle_save = function(fig, msg) {\n", 1570 | " fig.ondownload(fig, null);\n", 1571 | "}\n", 1572 | 
"\n", 1573 | "\n", 1574 | "mpl.find_output_cell = function(html_output) {\n", 1575 | " // Return the cell and output element which can be found *uniquely* in the notebook.\n", 1576 | " // Note - this is a bit hacky, but it is done because the \"notebook_saving.Notebook\"\n", 1577 | " // IPython event is triggered only after the cells have been serialised, which for\n", 1578 | " // our purposes (turning an active figure into a static one), is too late.\n", 1579 | " var cells = IPython.notebook.get_cells();\n", 1580 | " var ncells = cells.length;\n", 1581 | " for (var i=0; i= 3 moved mimebundle to data attribute of output\n", 1588 | " data = data.data;\n", 1589 | " }\n", 1590 | " if (data['text/html'] == html_output) {\n", 1591 | " return [cell, data, j];\n", 1592 | " }\n", 1593 | " }\n", 1594 | " }\n", 1595 | " }\n", 1596 | "}\n", 1597 | "\n", 1598 | "// Register the function which deals with the matplotlib target/channel.\n", 1599 | "// The kernel may be null if the page has been refreshed.\n", 1600 | "if (IPython.notebook.kernel != null) {\n", 1601 | " IPython.notebook.kernel.comm_manager.register_target('matplotlib', mpl.mpl_figure_comm);\n", 1602 | "}\n" 1603 | ], 1604 | "text/plain": [ 1605 | "" 1606 | ] 1607 | }, 1608 | "metadata": {}, 1609 | "output_type": "display_data" 1610 | }, 1611 | { 1612 | "data": { 1613 | "text/html": [ 1614 | "" 1615 | ], 1616 | "text/plain": [ 1617 | "" 1618 | ] 1619 | }, 1620 | "metadata": {}, 1621 | "output_type": "display_data" 1622 | } 1623 | ], 1624 | "source": [ 1625 | "raw_data_file_path = str(Path().resolve().parents[0] / \"data\" / \"raw\" / \"housing_data.txt\")\n", 1626 | "clean_data_file_path = str(Path().resolve().parents[0] / \"data\" / \"clean\" / \"clean_housing_data.txt\")\n", 1627 | "preprocess(raw_data_file_path, clean_data_file_path)\n", 1628 | "data_array = get_data_as_numpy_array(clean_data_file_path, 2)\n", 1629 | "training_set, testing_set = split_into_training_and_testing_sets(data_array)\n", 1630 | "slope, 
intercept = train_model(training_set)\n", 1631 | "print(\"Slope: {0}, Intercept: {1}\".format(slope, intercept))\n", 1632 | "print(\"R Square of fit on the testing set is {0}\".format(model_test(testing_set, slope, intercept)))\n", 1633 | "training_figure = get_plot_for_best_fit_line(slope, intercept, training_set[:, 0], training_set[:, 1], \"Training\")\n", 1634 | "training_figure.show()\n", 1635 | "testing_figure = get_plot_for_best_fit_line(slope, intercept, testing_set[:, 0], testing_set[:, 1], \"Testing\")\n", 1636 | "testing_figure.show()" 1637 | ] 1638 | }, 1639 | { 1640 | "cell_type": "code", 1641 | "execution_count": null, 1642 | "metadata": {}, 1643 | "outputs": [], 1644 | "source": [] 1645 | }, 1646 | { 1647 | "cell_type": "code", 1648 | "execution_count": null, 1649 | "metadata": {}, 1650 | "outputs": [], 1651 | "source": [] 1652 | } 1653 | ], 1654 | "metadata": { 1655 | "kernelspec": { 1656 | "display_name": "Python 3", 1657 | "language": "python", 1658 | "name": "python3" 1659 | }, 1660 | "language_info": { 1661 | "codemirror_mode": { 1662 | "name": "ipython", 1663 | "version": 3 1664 | }, 1665 | "file_extension": ".py", 1666 | "mimetype": "text/x-python", 1667 | "name": "python", 1668 | "nbconvert_exporter": "python", 1669 | "pygments_lexer": "ipython3", 1670 | "version": "3.6.8" 1671 | } 1672 | }, 1673 | "nbformat": 4, 1674 | "nbformat_minor": 2 1675 | } 1676 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from setuptools import setup, find_packages 4 | 5 | setup(name="univariate_linear_regression", 6 | version="0.1.0", 7 | description="Univariate linear regression of housing price against housing area", 8 | author="Dibya Chakravorty", 9 | packages=find_packages("src"), 10 | package_dir={"": "src"}, 11 | author_email="dibyachakravorty@gmail.com", 12 | install_requires=["jupyter==1.0.0", 13 | 
def convert_to_int(integer_string_with_commas):
    """Convert a comma-grouped integer string such as ``"1,034,891"`` to an int.

    The string is split on commas. The leading group must contain one to
    three characters and every following group exactly three; anything else
    is treated as malformed. Note that a sign character counts toward the
    leading group's length, so ``"-12"`` converts but ``"-123"`` does not.

    Args:
        integer_string_with_commas: The text to convert, e.g. ``"2,081"``.

    Returns:
        The parsed integer, or ``None`` when the grouping is wrong or the
        joined text is not an integer literal.
    """
    groups = integer_string_with_commas.split(",")
    leading, trailing = groups[0], groups[1:]
    # An empty leading group means the text is empty or starts with a stray
    # comma (",123"); neither is a valid number.
    if not 1 <= len(leading) <= 3:
        return None
    if any(len(group) != 3 for group in trailing):
        return None
    digits = "".join(groups)
    # int() accepts "_" as a digit separator ("1_0" -> 10), which is not a
    # valid thousands-grouped number, so reject it explicitly.
    if "_" in digits:
        return None
    try:
        return int(digits)
    except ValueError:
        return None


def row_to_list(row):
    """Split one raw data row into its two tab-separated entries.

    Args:
        row: A line of text, optionally ending in ``"\\n"``.

    Returns:
        A two-element list of strings, or ``None`` when the row does not
        contain exactly two non-empty tab-separated entries.
    """
    entries = row.rstrip("\n").split("\t")
    if len(entries) != 2 or "" in entries:
        return None
    return entries


def preprocess(raw_data_file_path, clean_data_file_path):
    """Write the valid rows of the raw data file to the clean data file.

    Each raw row is split with ``row_to_list`` and both entries are converted
    with ``convert_to_int``. Rows that fail either step are dropped; the rest
    are written as ``"<area>\\t<price>\\n"`` with the commas removed.

    Args:
        raw_data_file_path: Path of the tab-separated raw input file.
        clean_data_file_path: Path of the output file (overwritten).
    """
    # Read everything before opening the output so that the two paths may
    # refer to the same file.
    with open(raw_data_file_path, "r") as input_file:
        rows = input_file.readlines()
    with open(clean_data_file_path, "w") as output_file:
        for row in rows:
            row_as_list = row_to_list(row)
            if row_as_list is None:
                continue
            area_text, price_text = row_as_list
            # Convert both columns before filtering: the price is converted
            # even when the area is invalid, exactly as before.
            area = convert_to_int(area_text)
            price = convert_to_int(price_text)
            if area is None or price is None:
                continue
            output_file.write("{0}\t{1}\n".format(area, price))
def get_data_as_numpy_array(clean_data_file_path, num_columns):
    """Load a tab-separated numeric text file into a 2-D float array.

    Args:
        clean_data_file_path: Path of the file to read; one row per line,
            columns separated by tabs.
        num_columns: Number of columns every line must contain.

    Returns:
        A float array of shape ``(number_of_lines, num_columns)``. An empty
        file yields an array of shape ``(0, num_columns)``.

    Raises:
        ValueError: If a line contains an entry that cannot be converted to
            float, or does not have exactly ``num_columns`` entries. The
            message carries the 1-based line number and the file path.
    """
    parsed_rows = []
    with open(clean_data_file_path, "r") as f:
        for line_number, line in enumerate(f, start=1):
            fields = line.rstrip("\n").split("\t")
            try:
                values = np.array(fields, dtype=float)
            except ValueError as error:
                raise ValueError(
                    "Line {0} of {1} is badly formatted".format(line_number, clean_data_file_path)
                ) from error
            if values.shape != (num_columns,):
                raise ValueError(
                    "Line {0} of {1} does not have {2} columns".format(line_number,
                                                                       clean_data_file_path,
                                                                       num_columns
                                                                       )
                )
            parsed_rows.append(values)
    if not parsed_rows:
        return np.empty((0, num_columns))
    # Stack once at the end; appending to the array inside the loop would
    # copy every previously read row on each iteration.
    return np.vstack(parsed_rows)
def model_test(testing_set, slope, intercept):
    """Return the R-squared of the line ``slope * x + intercept`` on a testing set.

    The first column of ``testing_set`` is used as the input and the second
    column as the actual value. The result is
    ``1 - mean((predicted - actual) ** 2) / var(actual)``.

    Args:
        testing_set: Two-dimensional array with exactly two columns and at
            least one row.
        slope: Slope of the fitted line.
        intercept: Intercept of the fitted line.

    Returns:
        The R-squared value. When all actual values are identical their
        variance is zero, so the division yields ``nan`` or ``-inf``.

    Raises:
        ValueError: If ``testing_set`` is not two dimensional, does not have
            two columns, or has no rows.
    """
    dim = testing_set.ndim
    if dim != 2:
        raise ValueError("Argument testing_set must be two dimensional. Got {0} dimensional array instead!".format(
            dim
            )
        )
    num_rows, num_cols = testing_set.shape
    if num_cols != 2:
        raise ValueError("Argument testing_set must have 2 columns for univariate linear regression. "
                         "It actually has {0} columns".format(num_cols)
                         )
    # Without rows both the mean residual and the variance are 0/0; fail
    # loudly instead of returning nan with runtime warnings.
    if num_rows < 1:
        raise ValueError("Argument testing_set must have at least 1 row, it actually has just {0}".format(num_rows))
    actual_price = testing_set[:, 1]
    predicted_price = slope*testing_set[:, 0] + intercept
    mean_squared_residual = np.sum(np.square(predicted_price - actual_price)) / num_rows
    return 1 - mean_squared_residual / np.var(actual_price)
def get_plot_for_best_fit_line(slope, intercept, x_array, y_array, title):
    """Build a figure showing the data points and the line ``slope * x + intercept``.

    The points are drawn as dots and the line is drawn from ``x = 0`` to the
    largest value in ``x_array``.

    Args:
        slope: Slope of the line to draw.
        intercept: Intercept of the line to draw.
        x_array: One-dimensional array of x values.
        y_array: One-dimensional array of y values, same length as ``x_array``.
        title: Title placed on the axes.

    Returns:
        The figure created by ``plt.subplots()``.

    Raises:
        ValueError: If ``x_array`` or ``y_array`` is not one dimensional.
        RuntimeError: If the two arrays differ in length.
    """
    if x_array.ndim != 1:
        raise ValueError("Argument x_array should be 1 dimensional. "
                         "It actually is {0} dimensional".format(x_array.ndim)
                         )
    if y_array.ndim != 1:
        raise ValueError("Argument y_array should be 1 dimensional. "
                         "It actually is {0} dimensional".format(y_array.ndim)
                         )
    num_x, num_y = x_array.shape[0], y_array.shape[0]
    if num_x != num_y:
        raise RuntimeError("Arguments x_array and y_array should have same length. "
                           "But x_array has length {0} and y_array has length {1}".format(num_x, num_y)
                           )
    figure, axes = plt.subplots()
    axes.plot(x_array, y_array, ".")
    # Right-hand end of the fitted line; the left-hand end is fixed at x = 0.
    x_max = np.max(x_array)
    axes.plot([0, x_max], [intercept, slope * x_max + intercept], "-")
    axes.set(xlabel="area (square feet)", ylabel="price (dollars)", title=title)
    return figure
29 | return return_values[row] 30 | 31 | 32 | def convert_to_int_bug_free(comma_separated_integer_string): 33 | return_values = {"1,801": 1801, 34 | "201,411": 201411, 35 | "2,002": 2002, 36 | "333,209": 333209, 37 | "1990": None, 38 | "782,911": 782911, 39 | "1,285": 1285, 40 | "389129": None, 41 | } 42 | return return_values[comma_separated_integer_string] 43 | 44 | 45 | class TestConvertToInt(object): 46 | def test_with_no_comma(self): 47 | test_argument = "756" 48 | expected = 756 49 | actual = convert_to_int(test_argument) 50 | assert actual == expected, "Expected: 756, Actual: {0}".format(actual) 51 | 52 | def test_with_one_comma(self): 53 | test_argument = "2,081" 54 | expected = 2081 55 | actual = convert_to_int(test_argument) 56 | assert actual == expected, "Expected: 2081, Actual: {0}".format(actual) 57 | 58 | def test_with_two_commas(self): 59 | test_argument = "1,034,891" 60 | expected = 1034891 61 | actual = convert_to_int(test_argument) 62 | assert actual == expected, "Expected: 1034891, Actual: {0}".format(actual) 63 | 64 | def test_on_string_with_incorrectly_placed_comma(self): 65 | test_argument = "12,72,891" 66 | expected = None 67 | actual = convert_to_int(test_argument) 68 | assert actual == expected, "Expected: None, Actual: {0}".format(actual) 69 | 70 | def test_on_string_with_missing_comma(self): 71 | test_argument = "178100,301" 72 | expected = None 73 | actual = convert_to_int(test_argument) 74 | assert actual == expected, "Expected: None, Actual: {0}".format(actual) 75 | 76 | def test_on_float_valued_string(self): 77 | test_argument = "6.9" 78 | expected = None 79 | actual = convert_to_int(test_argument) 80 | assert actual == expected, "Expected: None, Actual: {0}".format(actual) 81 | 82 | 83 | class TestRowToList(object): 84 | def test_on_no_tab_no_missing_value(self): # (0, 0) boundary value 85 | actual = row_to_list("123\n") 86 | assert actual is None, "Expected: None, Actual: {0}".format(actual) 87 | 88 | def 
test_on_two_tabs_no_missing_value(self): # (2, 0) boundary value 89 | actual = row_to_list("123\t4,567\t89\n") 90 | assert actual is None, "Expected: None, Actual: {0}".format(actual) 91 | 92 | def test_on_one_tab_with_missing_value(self): # (1, 1) boundary value 93 | actual = row_to_list("\t4,567\n") 94 | assert actual is None, "Expected: None, Actual: {0}".format(actual) 95 | 96 | def test_on_no_tab_with_missing_value(self): # (0, 1) case 97 | actual = row_to_list("\n") 98 | assert actual is None, "Expected: None, Actual: {0}".format(actual) 99 | 100 | def test_on_two_tabs_with_missing_value(self): # (0, 1) case 101 | actual = row_to_list("123\t\t89\n") 102 | assert actual is None, "Expected: None, Actual: {0}".format(actual) 103 | 104 | def test_on_normal_argument_1(self): 105 | actual = row_to_list("123\t4,567\n") 106 | expected = ["123", "4,567"] 107 | assert actual == expected, "Expected: {0}, Actual: {1}".format(expected, actual) 108 | 109 | def test_on_normal_argument_2(self): 110 | actual = row_to_list("1,059\t186,606\n") 111 | expected = ["1,059", "186,606"] 112 | assert actual == expected, "Expected: {0}, Actual: {1}".format(expected, actual) 113 | 114 | 115 | class TestPreprocess(object): 116 | def test_on_raw_data(self, raw_and_clean_data_file, mocker): 117 | raw_path, clean_path = raw_and_clean_data_file 118 | row_to_list_mock = mocker.patch("data.preprocessing_helpers.row_to_list", side_effect=row_to_list_bug_free) 119 | convert_to_int_mock = mocker.patch("data.preprocessing_helpers.convert_to_int", 120 | side_effect=convert_to_int_bug_free 121 | ) 122 | preprocess(raw_path, clean_path) 123 | assert row_to_list_mock.call_args_list == [call("1,801\t201,411\n"), 124 | call("1,767565,112\n"), 125 | call("2,002\t333,209\n"), 126 | call("1990\t782,911\n"), 127 | call("1,285\t389129\n") 128 | ] 129 | assert convert_to_int_mock.call_args_list == [call("1,801"), call("201,411"), call("2,002"), call("333,209"), 130 | call("1990"), call("782,911"), 
call("1,285"), call("389129") 131 | ] 132 | with open(clean_path, "r") as f: 133 | lines = f.readlines() 134 | first_line = lines[0] 135 | assert first_line == "1801\t201411\n" 136 | second_line = lines[1] 137 | assert second_line == "2002\t333209\n" 138 | 139 | -------------------------------------------------------------------------------- /tests/features/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gutfeeling/univariate-linear-regression/cf8ba5f4ce949d2d823e3a51cb429c46c61fc9ce/tests/features/__init__.py -------------------------------------------------------------------------------- /tests/features/test_as_numpy.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import pytest 4 | 5 | from features.as_numpy import get_data_as_numpy_array 6 | 7 | 8 | @pytest.fixture 9 | def clean_data_file(): 10 | file_path = "clean.txt" 11 | with open(file_path, "w") as f: 12 | f.write("201\t305671\n7892\t298140\n501\t738293\n") 13 | yield file_path 14 | os.remove(file_path) 15 | 16 | 17 | @pytest.fixture 18 | def empty_file(): 19 | file_path = "empty.txt" 20 | open(file_path, "w").close() 21 | yield file_path 22 | os.remove(file_path) 23 | 24 | 25 | class TestGetDataAsNumpyArray(object): 26 | def test_on_clean_file(self, clean_data_file): 27 | expected = np.array([[201.0, 305671.0], [7892.0, 298140.0], [501.0, 738293.0]]) 28 | actual = get_data_as_numpy_array(clean_data_file, 2) 29 | assert actual == pytest.approx(expected), "Expected: {0}, Actual: {1}".format(expected, actual) 30 | 31 | def test_on_empty_file(self, empty_file): 32 | expected = np.empty((0, 2)) 33 | actual = get_data_as_numpy_array(empty_file, 2) 34 | assert actual == pytest.approx(expected), "Expected: {0}, Actual: {1}".format(expected, actual) -------------------------------------------------------------------------------- /tests/models/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/gutfeeling/univariate-linear-regression/cf8ba5f4ce949d2d823e3a51cb429c46c61fc9ce/tests/models/__init__.py -------------------------------------------------------------------------------- /tests/models/test_train.py: -------------------------------------------------------------------------------- 1 | from math import cos, pi, sin 2 | import numpy as np 3 | import pytest 4 | 5 | from models.train import split_into_training_and_testing_sets, train_model, model_test 6 | 7 | 8 | class TestSplitIntoTrainingAndTestingSets(object): 9 | def test_on_six_rows(self): 10 | test_argument = np.array([[2081.0, 314942.0], 11 | [1059.0, 186606.0], 12 | [1148.0, 206186.0], 13 | [1506.0, 248419.0], 14 | [1210.0, 214114.0], 15 | [1697.0, 277794.0], 16 | ] 17 | ) 18 | expected_length_training_set = 4 19 | expected_length_testing_set = 2 20 | actual = split_into_training_and_testing_sets(test_argument) 21 | assert actual[0].shape[0] == expected_length_training_set, \ 22 | "The actual number of rows in the training array is not 4" 23 | assert actual[1].shape[0] == expected_length_testing_set, \ 24 | "The actual number of rows in the testing array is not 2" 25 | 26 | def test_on_one_row(self): 27 | test_argument = np.array([[1382.0, 390167.0]]) 28 | with pytest.raises(ValueError) as exc_info: 29 | split_into_training_and_testing_sets(test_argument) 30 | expected_error_msg = "Argument data_array must have at least 2 rows, it actually has just 1" 31 | assert exc_info.match(expected_error_msg) 32 | 33 | def test_on_one_dimensional_array(self): 34 | test_argument = np.array([1382.0, 390167.0]) 35 | with pytest.raises(ValueError) as exc_info: 36 | split_into_training_and_testing_sets(test_argument) 37 | expected_error_msg = "Argument data_array must be two dimensional. Got 1 dimensional array instead!" 
38 | assert exc_info.match(expected_error_msg) 39 | 40 | 41 | class TestTrainModel(object): 42 | def test_on_linear_data(self): 43 | test_argument = np.array([[1.0, 3.0], [2.0, 5.0], [3.0, 7.0]]) 44 | expected_slope = 2.0 45 | expected_intercept = 1.0 46 | actual_slope, actual_intercept = train_model(test_argument) 47 | slope_message = ("train_model({0}) should return slope {1}, " 48 | "but it actually returned slope {2}".format(test_argument, expected_slope, actual_slope) 49 | ) 50 | intercept_message = ("train_model({0}) should return intercept {1}, " 51 | "but it actually returned intercept {2}".format(test_argument, 52 | expected_intercept, 53 | actual_intercept 54 | ) 55 | ) 56 | assert actual_slope == pytest.approx(expected_slope), slope_message 57 | assert actual_intercept == pytest.approx(expected_intercept), intercept_message 58 | 59 | def test_on_positively_correlated_data(self): 60 | test_argument = np.array([[1.0, 4.0], [2.0, 4.0], 61 | [3.0, 9.0], [4.0, 10.0], 62 | [5.0, 7.0], [6.0, 13.0], 63 | ] 64 | ) 65 | actual_slope, actual_intercept = train_model(test_argument) 66 | assert actual_slope > 0, "Expected slope: > 0, Actual slope: {0}".format(actual_slope) 67 | 68 | 69 | class TestModelTest(object): 70 | def test_on_perfect_fit(self): 71 | test_argument = np.array([[1.0, 3.0], [2.0, 5.0], [3.0, 7.0]]) 72 | expected = 1.0 73 | actual = model_test(test_argument, 2.0, 1.0) 74 | message = "model_test({0}) should return {1}, but it actually returned {2}".format(test_argument, 75 | expected, 76 | actual 77 | ) 78 | assert actual == pytest.approx(expected), message 79 | 80 | def test_on_circular_data(self): 81 | theta = pi / 4.0 82 | test_argument = np.array([[0.0, 1.0], 83 | [cos(theta), sin(theta)], 84 | [1.0, 0.0], 85 | [cos(3 * theta), sin(3 * theta)], 86 | [0.0, -1.0], 87 | [cos(5 * theta), sin(5 * theta)], 88 | [-1.0, 0.0], 89 | [cos(7 * theta), sin(7 * theta)] 90 | ] 91 | ) 92 | actual = model_test(test_argument, 0.0, 0.0) 93 | assert actual == 
pytest.approx(0.0) 94 | 95 | def test_on_one_dimensional_array(self): 96 | test_argument = np.array([1.0, 2.0, 3.0, 4.0]) 97 | with pytest.raises(ValueError) as exc_info: 98 | model_test(test_argument, 1.0, 1.0) 99 | expected_error_msg = "Argument testing_set must be two dimensional. Got 1 dimensional array instead!" 100 | assert exc_info.match(expected_error_msg) 101 | 102 | 103 | 104 | -------------------------------------------------------------------------------- /tests/visualization/baseline/test_plot_for_almost_linear_data.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gutfeeling/univariate-linear-regression/cf8ba5f4ce949d2d823e3a51cb429c46c61fc9ce/tests/visualization/baseline/test_plot_for_almost_linear_data.png -------------------------------------------------------------------------------- /tests/visualization/baseline/test_plot_for_linear_data.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gutfeeling/univariate-linear-regression/cf8ba5f4ce949d2d823e3a51cb429c46c61fc9ce/tests/visualization/baseline/test_plot_for_linear_data.png -------------------------------------------------------------------------------- /tests/visualization/test_plots.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import numpy as np 3 | 4 | from visualization.plots import get_plot_for_best_fit_line 5 | 6 | 7 | class TestGetPlotForBestFitLine(object): 8 | @pytest.mark.mpl_image_compare 9 | def test_plot_for_linear_data(self): 10 | slope = 2.0 11 | intercept = 1.0 12 | x_array = np.array([1.0, 2.0, 3.0]) 13 | y_array = np.array([3.0, 5.0, 7.0]) 14 | title = "Test plot for linear data" 15 | return get_plot_for_best_fit_line(slope, intercept, x_array, y_array, title) 16 | 17 | @pytest.mark.mpl_image_compare 18 | def test_plot_for_almost_linear_data(self): 19 | slope = -2.0 20 | intercept 
= 10.0 21 | x_array = np.array([1.0, 2.0, 3.0]) 22 | y_array = np.array([8.0, 6.0, 5.0]) 23 | title = "Test plot for almost linear data" 24 | return get_plot_for_best_fit_line(slope, intercept, x_array, y_array, title) 25 | 26 | --------------------------------------------------------------------------------