├── .gitignore ├── README.md ├── _datasets ├── Action_words.csv ├── Cab Charges May.xlsx ├── Chinook.sqlite ├── Information_gain_job_advertisements.csv ├── Interactions.csv ├── LIGO_data.hdf5 ├── NEONDS.hdf5 ├── Shift Roster.xlsx ├── Trump Tweets(2017).xlsx ├── WDIData_min.csv ├── airline.sas7bdat ├── albeck_gene_expression.mat ├── amis.csv ├── auto.csv ├── battledeath.xlsx ├── boston.csv ├── cars.csv ├── company-stock-movements-2010-2015-incl.csv ├── data.pk1 ├── data_1024.csv ├── diabetes.csv ├── digits.csv ├── disarea.dta ├── eurovision-2016.csv ├── fish.csv ├── gm_2008_region.csv ├── house-votes-84.csv ├── ind_pop_data.csv ├── industries.json ├── lcd-digits.csv ├── mnist_kaggle_some_rows.csv ├── moby_dick.txt ├── sales.sas7bdat ├── seaslug.txt ├── seeds-width-vs-length.csv ├── seeds.csv ├── titanic.csv ├── titanic_corrupt.txt ├── tweets.csv ├── white-wine.csv ├── wine.csv └── winequality-red.csv ├── certifications ├── Datacamp_course1.pdf ├── Datacamp_course17.pdf ├── Datacamp_course18.pdf ├── Datacamp_course2.pdf ├── Datacamp_course3.pdf ├── Datacamp_course4.pdf └── Datacamp_course5.pdf ├── notes ├── Datacamp │ ├── Data Science With Python Course 5.docx │ ├── Iterators and Generators-Python.docx │ ├── K-Means Clustering.rtf │ └── Regression Models.txt ├── Google ML │ ├── Google's ML Crash Course Notes.docx │ └── Tensorflow Estimators - documentation.docx └── R │ ├── Exploratory Data Analysis Dimension Reduction.txt │ ├── Probability.txt │ ├── Regression Models.txt │ ├── Rplot.png │ ├── Rplot01.png │ ├── Rplot02.png │ ├── Rplot03.png │ ├── Rplot04.png │ ├── Rplot05.png │ ├── Statistial Interference.txt │ ├── dendo1.png │ ├── prob-1.png │ └── ways to cluster.txt └── src ├── __pycache__ └── helper.cpython-36.pyc ├── case_studies ├── case_study_1.1.py ├── case_study_1.2.py ├── case_study_1.3.py ├── case_study_pipelining_and_scaling.py ├── case_study_trumps_twitter_RTs.py ├── case_study_urban_population_trends.py ├── case_study_webscraping_imdb.py └── top250names.txt ├── core ├── py_comprehensions.py ├── py_enumeration_example.py ├── py_filter_example.py ├── py_generators.py ├── py_iterable_and_iterator.py └── py_regex.py ├── db ├── __init__.py ├── py_mongo_integration.py ├── py_sql.py └── py_sql_with_pandas.py ├── file_operations ├── py_corrupt_file_read.py ├── py_default_file_read_1.py ├── py_numpy_file_read_1.py ├── py_numpy_file_read_2.py ├── py_pandas_excel_read.py ├── py_pandas_file_read_1.py ├── py_pandas_read_csv.py ├── py_pickle_read_test.py ├── py_read_hdf5_file.py ├── py_read_matlab_file.py ├── py_read_sas_file.py ├── py_read_stata_file.py ├── py_test_loops_algo.py ├── read_in_chunks.py └── read_tweets.py ├── grains_data_from_dataset.py ├── helper.py ├── misc ├── __init__.py ├── py_test_loops_algo.py ├── py_zip_example.py ├── random.py └── tensorflow_starter.py ├── ml-supervised ├── __init__.py ├── course-description.png ├── course-description.rtf ├── k-fold_cross_validation.py ├── ml_centering_and_scaling.py ├── ml_manually_remove_missing_data.py ├── ml_pipeline_with_hyperparameters.py ├── ml_pipelines.py ├── py_hyperparamter_tuning_hold-out_set_with_GridSearchCV-1.py ├── py_hyperparamter_tuning_hold-out_set_with_GridSearchCV-2.py ├── py_hyperparamter_tuning_with_GridSearchCV.py ├── py_hyperparamter_tuning_with_RandomizedSearchCV.py ├── py_knn_classifier_modal.py ├── py_knn_classifier_modal_train_test.py ├── py_knn_classifiers_performance_metrics.py ├── py_lasso_regularized_linear_regression.py ├── py_linear_regression_modal.py ├── py_linear_regression_modal_train_test.py ├── 
py_logistic_regression_modal.py ├── py_ridge_regularized_linear_regression.py └── py_sklearn_digits_dataset.py ├── ml-unsupervised ├── 01-clustering-for-dataset-exploration │ ├── 01-how-many-clusters.py │ ├── 02-clustering-2d-points.py │ ├── 03-inspect-your-clustering.py │ ├── 04-how-many-clusters-of-grain.py │ ├── 05-evaluating-the-grain-clustering.py │ ├── 06-07-scaling-&-clustering-the-fish-data.py │ ├── 08-09-scaling-&-clustering-which-stocks-move-together.py │ ├── ch1_slides.pdf │ ├── chapter-details.png │ └── chapter-details.rtf ├── 02-visualization-with-hierarchical-clustering-and-t-sne │ ├── 01-hierarchical-clustering-of-the-grain-data.py │ ├── 02-hierarchies-of-stocks.py │ ├── 03-different-linkage-different-hierarchical-clustering.py │ ├── 04-extracting-the-cluster-labels.py │ ├── 05-tsne-visualization-of-grain-dataset.py │ ├── 06-a-tsne-map-of-the-stock-market.py │ ├── ch2_slides.pdf │ ├── chapter-details.png │ └── chapter-details.rtf ├── 03-decorrelating-your-data-and-dimension-reduction │ ├── 01-correlated-data-in-nature.py │ ├── 02-decorrelating-the-grain-measurements-with-pca.py │ ├── 03-the-first-principal-component.py │ ├── 04-variance-of-the-pca-features.py │ ├── 05-dimension-reduction-of-the-fish-measuremenys.py │ ├── 06-a-tfidf-word-frequency-array.py │ ├── 07-clustering-wikipedia-part-1.py │ ├── 08-clustering-wikipedia-part-2.py │ ├── ch3_slides.pdf │ ├── chapter-details.png │ └── chapter-details.rtf ├── 04-discovering-interpretable-features │ ├── 01-nmf-applied-to-wikipedia-articles.py │ ├── 02-nmf-features-of-the-wikipedia-articles.py │ ├── 03-nmf-learns-topics-of-documents.py │ ├── 04-explore-the-led-digits-dataset.py │ ├── 05-nmf-learns-the-parts-of-images.py │ ├── 06-pca-doesnt-learn-parts.py │ ├── 07-which-articles-are-similar-to-cristiano-ronaldo.py │ ├── 08-recommend-musical-artists-part-1.py │ ├── 09-recommend-musical-artists-part-2.py │ ├── ch4_slides.pdf │ ├── chapter-details.png │ └── chapter-details.rtf ├── __init__.py ├── course-description.png └── k-means_clustering.py └── python_core ├── __init__.py └── output_questions ├── 1.py └── __init__.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.idea 2 | **/venv 3 | **/.idea -------------------------------------------------------------------------------- /_datasets/Action_words.csv: -------------------------------------------------------------------------------- 1 | Accomplished 2 | Achieved 3 | Adapted 4 | Arranged 5 | Attained 6 | Built 7 | Captured 8 | Commandeered 9 | Completed 10 | Converted 11 | Crafted 12 | Created 13 | Cut 14 | Delivered 15 | Demonstrated 16 | Designed 17 | Developed 18 | Devised 19 | Directed 20 | Distributed 21 | Doubled 22 | Drove 23 | Earned 24 | Eliminated 25 | Encouraged 26 | Enforced 27 | Engineered 28 | Ensured 29 | Established 30 | Expanded 31 | Expedited 32 | Founded 33 | Generated 34 | Guided 35 | Identified 36 | Implemented 37 | Improved 38 | Improvised 39 | Increased 40 | Initiated 41 | Inspired 42 | Installed 43 | Instigated 44 | Instructed 45 | Interpreted 46 | Introduced 47 | Launched 48 | Led 49 | Liaised 50 | Modernized 51 | Motivated 52 | Negotiated 53 | Organized 54 | Promoted 55 | Redesigned 56 | Revitalized 57 | Started 58 | Streamlined 59 | Strengthened 60 | Structured 61 | Supervised 62 | Transformed 63 | Uncovered 64 | Widened 65 | Won -------------------------------------------------------------------------------- /_datasets/Cab Charges May.xlsx: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/saranshbansal/data-science-with-python/651e67d93f8aa65ce386978684523f7e14cd3b2a/_datasets/Cab Charges May.xlsx -------------------------------------------------------------------------------- /_datasets/Chinook.sqlite: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saranshbansal/data-science-with-python/651e67d93f8aa65ce386978684523f7e14cd3b2a/_datasets/Chinook.sqlite -------------------------------------------------------------------------------- /_datasets/LIGO_data.hdf5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saranshbansal/data-science-with-python/651e67d93f8aa65ce386978684523f7e14cd3b2a/_datasets/LIGO_data.hdf5 -------------------------------------------------------------------------------- /_datasets/NEONDS.hdf5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saranshbansal/data-science-with-python/651e67d93f8aa65ce386978684523f7e14cd3b2a/_datasets/NEONDS.hdf5 -------------------------------------------------------------------------------- /_datasets/Shift Roster.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saranshbansal/data-science-with-python/651e67d93f8aa65ce386978684523f7e14cd3b2a/_datasets/Shift Roster.xlsx -------------------------------------------------------------------------------- /_datasets/Trump Tweets(2017).xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saranshbansal/data-science-with-python/651e67d93f8aa65ce386978684523f7e14cd3b2a/_datasets/Trump Tweets(2017).xlsx -------------------------------------------------------------------------------- /_datasets/airline.sas7bdat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saranshbansal/data-science-with-python/651e67d93f8aa65ce386978684523f7e14cd3b2a/_datasets/airline.sas7bdat -------------------------------------------------------------------------------- /_datasets/albeck_gene_expression.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saranshbansal/data-science-with-python/651e67d93f8aa65ce386978684523f7e14cd3b2a/_datasets/albeck_gene_expression.mat -------------------------------------------------------------------------------- /_datasets/battledeath.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saranshbansal/data-science-with-python/651e67d93f8aa65ce386978684523f7e14cd3b2a/_datasets/battledeath.xlsx -------------------------------------------------------------------------------- /_datasets/cars.csv: -------------------------------------------------------------------------------- 1 | Unnamed: 0 cars_per_cap country drives_right 2 | 0 US 809 United States True 3 | 1 AUS 731 Australia False 4 | 2 JAP 588 Japan False 5 | 3 IN 18 India False 6 | 4 RU 200 Russia True 7 | 5 MOR 70 Morocco True 8 | 6 EG 45 Egypt True -------------------------------------------------------------------------------- /_datasets/data.pk1: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/saranshbansal/data-science-with-python/651e67d93f8aa65ce386978684523f7e14cd3b2a/_datasets/data.pk1 -------------------------------------------------------------------------------- /_datasets/disarea.dta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saranshbansal/data-science-with-python/651e67d93f8aa65ce386978684523f7e14cd3b2a/_datasets/disarea.dta -------------------------------------------------------------------------------- /_datasets/fish.csv: -------------------------------------------------------------------------------- 1 | Bream,242.0,23.2,25.4,30.0,38.4,13.4 2 | Bream,290.0,24.0,26.3,31.2,40.0,13.8 3 | Bream,340.0,23.9,26.5,31.1,39.8,15.1 4 | Bream,363.0,26.3,29.0,33.5,38.0,13.3 5 | Bream,430.0,26.5,29.0,34.0,36.6,15.1 6 | Bream,450.0,26.8,29.7,34.7,39.2,14.2 7 | Bream,500.0,26.8,29.7,34.5,41.1,15.3 8 | Bream,390.0,27.6,30.0,35.0,36.2,13.4 9 | Bream,450.0,27.6,30.0,35.1,39.9,13.8 10 | Bream,500.0,28.5,30.7,36.2,39.3,13.7 11 | Bream,475.0,28.4,31.0,36.2,39.4,14.1 12 | Bream,500.0,28.7,31.0,36.2,39.7,13.3 13 | Bream,500.0,29.1,31.5,36.4,37.8,12.0 14 | Bream,600.0,29.4,32.0,37.2,40.2,13.9 15 | Bream,600.0,29.4,32.0,37.2,41.5,15.0 16 | Bream,700.0,30.4,33.0,38.3,38.8,13.8 17 | Bream,700.0,30.4,33.0,38.5,38.8,13.5 18 | Bream,610.0,30.9,33.5,38.6,40.5,13.3 19 | Bream,650.0,31.0,33.5,38.7,37.4,14.8 20 | Bream,575.0,31.3,34.0,39.5,38.3,14.1 21 | Bream,685.0,31.4,34.0,39.2,40.8,13.7 22 | Bream,620.0,31.5,34.5,39.7,39.1,13.3 23 | Bream,680.0,31.8,35.0,40.6,38.1,15.1 24 | Bream,700.0,31.9,35.0,40.5,40.1,13.8 25 | Bream,725.0,31.8,35.0,40.9,40.0,14.8 26 | Bream,720.0,32.0,35.0,40.6,40.3,15.0 27 | Bream,714.0,32.7,36.0,41.5,39.8,14.1 28 | Bream,850.0,32.8,36.0,41.6,40.6,14.9 29 | Bream,1000.0,33.5,37.0,42.6,44.5,15.5 30 | Bream,920.0,35.0,38.5,44.1,40.9,14.3 31 | Bream,955.0,35.0,38.5,44.0,41.1,14.3 32 | Bream,925.0,36.2,39.5,45.3,41.4,14.9 33 | Bream,975.0,37.4,41.0,45.9,40.6,14.7 34 | Bream,950.0,38.0,41.0,46.5,37.9,13.7 35 | Roach,40.0,12.9,14.1,16.2,25.6,14.0 36 | Roach,69.0,16.5,18.2,20.3,26.1,13.9 37 | Roach,78.0,17.5,18.8,21.2,26.3,13.7 38 | Roach,87.0,18.2,19.8,22.2,25.3,14.3 39 | Roach,120.0,18.6,20.0,22.2,28.0,16.1 40 | Roach,0.0,19.0,20.5,22.8,28.4,14.7 41 | Roach,110.0,19.1,20.8,23.1,26.7,14.7 42 | Roach,120.0,19.4,21.0,23.7,25.8,13.9 43 | Roach,150.0,20.4,22.0,24.7,23.5,15.2 44 | Roach,145.0,20.5,22.0,24.3,27.3,14.6 45 | Roach,160.0,20.5,22.5,25.3,27.8,15.1 46 | Roach,140.0,21.0,22.5,25.0,26.2,13.3 47 | Roach,160.0,21.1,22.5,25.0,25.6,15.2 48 | Roach,169.0,22.0,24.0,27.2,27.7,14.1 49 | Roach,161.0,22.0,23.4,26.7,25.9,13.6 50 | Roach,200.0,22.1,23.5,26.8,27.6,15.4 51 | Roach,180.0,23.6,25.2,27.9,25.4,14.0 52 | Roach,290.0,24.0,26.0,29.2,30.4,15.4 53 | Roach,272.0,25.0,27.0,30.6,28.0,15.6 54 | Roach,390.0,29.5,31.7,35.0,27.1,15.3 55 | Smelt,6.7,9.3,9.8,10.8,16.1,9.7 56 | Smelt,7.5,10.0,10.5,11.6,17.0,10.0 57 | Smelt,7.0,10.1,10.6,11.6,14.9,9.9 58 | Smelt,9.7,10.4,11.0,12.0,18.3,11.5 59 | Smelt,9.8,10.7,11.2,12.4,16.8,10.3 60 | Smelt,8.7,10.8,11.3,12.6,15.7,10.2 61 | Smelt,10.0,11.3,11.8,13.1,16.9,9.8 62 | Smelt,9.9,11.3,11.8,13.1,16.9,8.9 63 | Smelt,9.8,11.4,12.0,13.2,16.7,8.7 64 | Smelt,12.2,11.5,12.2,13.4,15.6,10.4 65 | Smelt,13.4,11.7,12.4,13.5,18.0,9.4 66 | Smelt,12.2,12.1,13.0,13.8,16.5,9.1 67 | Smelt,19.7,13.2,14.3,15.2,18.9,13.6 68 | Smelt,19.9,13.8,15.0,16.2,18.1,11.6 69 | Pike,200.0,30.0,32.3,34.8,16.0,9.7 70 | Pike,300.0,31.7,34.0,37.8,15.1,11.0 71 | 
Pike,300.0,32.7,35.0,38.8,15.3,11.3 72 | Pike,300.0,34.8,37.3,39.8,15.8,10.1 73 | Pike,430.0,35.5,38.0,40.5,18.0,11.3 74 | Pike,345.0,36.0,38.5,41.0,15.6,9.7 75 | Pike,456.0,40.0,42.5,45.5,16.0,9.5 76 | Pike,510.0,40.0,42.5,45.5,15.0,9.8 77 | Pike,540.0,40.1,43.0,45.8,17.0,11.2 78 | Pike,500.0,42.0,45.0,48.0,14.5,10.2 79 | Pike,567.0,43.2,46.0,48.7,16.0,10.0 80 | Pike,770.0,44.8,48.0,51.2,15.0,10.5 81 | Pike,950.0,48.3,51.7,55.1,16.2,11.2 82 | Pike,1250.0,52.0,56.0,59.7,17.9,11.7 83 | Pike,1600.0,56.0,60.0,64.0,15.0,9.6 84 | Pike,1550.0,56.0,60.0,64.0,15.0,9.6 85 | Pike,1650.0,59.0,63.4,68.0,15.9,11.0 86 | -------------------------------------------------------------------------------- /_datasets/moby_dick.txt: -------------------------------------------------------------------------------- 1 | CHAPTER 1. Loomings. 2 | 3 | Call me Ishmael. Some years ago--never mind how long precisely--having 4 | little or no money in my purse, and nothing particular to interest me on 5 | shore, I thought I would sail about a little and see the watery part of 6 | the world. It is a way I have of driving off the spleen and regulating 7 | the circulation. Whenever I find myself growing grim about the mouth; 8 | whenever it is a damp, drizzly November in my soul; whenever I find 9 | myself involuntarily pausing before coffin warehouses, and bringing up 10 | the rear of every funeral I meet; and especially whenever my hypos get 11 | such an upper hand of me, that it requires a strong moral principle to 12 | prevent me from deliberately stepping into the street, and methodically 13 | knocking people's hats off--then, I account it high time to get to sea 14 | as soon as I can. This is my substitute for pistol and ball. With a 15 | philosophical flourish Cato throws himself upon his sword; I quietly 16 | take to the ship. There is nothing surprising in this. If they but knew 17 | it, almost all men in their degree, some time or other, cherish very 18 | nearly the same feelings towards the ocean with me. 
-------------------------------------------------------------------------------- /_datasets/sales.sas7bdat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saranshbansal/data-science-with-python/651e67d93f8aa65ce386978684523f7e14cd3b2a/_datasets/sales.sas7bdat -------------------------------------------------------------------------------- /_datasets/seaslug.txt: -------------------------------------------------------------------------------- 1 | Time Percent 2 | 99 0.067 3 | 99 0.133 4 | 99 0.067 5 | 99 0 6 | 99 0 7 | 0 0.5 8 | 0 0.467 9 | 0 0.857 10 | 0 0.5 11 | 0 0.357 12 | 0 0.533 13 | 5 0.467 14 | 5 0.467 15 | 5 0.125 16 | 5 0.4 17 | 5 0.214 18 | 5 0.4 19 | 10 0.067 20 | 10 0.067 21 | 10 0.333 22 | 10 0.333 23 | 10 0.133 24 | 10 0.133 25 | 15 0.267 26 | 15 0.286 27 | 15 0.333 28 | 15 0.214 29 | 15 0 30 | 15 0 31 | 20 0.267 32 | 20 0.2 33 | 20 0.267 34 | 20 0.437 35 | 20 0.077 36 | 20 0.067 37 | 25 0.133 38 | 25 0.267 39 | 25 0.412 40 | 25 0 41 | 25 0.067 42 | 25 0.133 43 | 30 0 44 | 30 0.071 45 | 30 0 46 | 30 0.067 47 | 30 0.067 48 | 30 0.133 -------------------------------------------------------------------------------- /_datasets/seeds-width-vs-length.csv: -------------------------------------------------------------------------------- 1 | 3.312,5.763 2 | 3.333,5.554 3 | 3.337,5.291 4 | 3.379,5.324 5 | 3.562,5.658 6 | 3.312,5.386 7 | 3.259,5.563 8 | 3.302,5.42 9 | 3.465,6.053 10 | 3.505,5.884 11 | 3.242,5.714 12 | 3.201,5.438 13 | 3.199,5.439 14 | 3.156,5.479 15 | 3.114,5.482 16 | 3.333,5.351 17 | 3.383,5.119 18 | 3.514,5.527 19 | 3.466,5.205 20 | 3.049,5.226 21 | 3.129,5.658 22 | 3.168,5.52 23 | 3.507,5.618 24 | 2.936,5.099 25 | 3.245,5.789 26 | 3.421,5.833 27 | 3.026,5.395 28 | 2.956,5.395 29 | 3.221,5.541 30 | 3.065,5.516 31 | 2.975,5.454 32 | 3.371,5.757 33 | 3.186,5.717 34 | 3.15,5.585 35 | 3.328,5.712 36 | 3.485,5.709 37 | 3.464,5.826 38 | 3.683,5.832 39 | 3.288,5.656 40 | 3.298,5.397 41 | 3.156,5.348 42 | 3.158,5.351 43 | 3.201,5.138 44 | 3.396,5.877 45 | 3.462,5.579 46 | 3.155,5.376 47 | 3.393,5.701 48 | 3.377,5.57 49 | 3.291,5.545 50 | 3.258,5.678 51 | 3.272,5.585 52 | 3.434,5.674 53 | 3.113,5.715 54 | 3.199,5.504 55 | 3.113,5.741 56 | 3.212,5.702 57 | 3.377,5.388 58 | 3.412,5.384 59 | 3.419,5.662 60 | 3.032,5.159 61 | 2.85,5.008 62 | 2.879,4.902 63 | 3.042,5.076 64 | 3.07,5.395 65 | 3.026,5.262 66 | 3.119,5.139 67 | 3.19,5.63 68 | 3.158,5.609 69 | 3.153,5.569 70 | 2.882,5.412 71 | 3.561,6.191 72 | 3.484,5.998 73 | 3.594,5.978 74 | 3.93,6.154 75 | 3.486,6.017 76 | 3.438,5.927 77 | 3.403,6.064 78 | 3.814,6.579 79 | 3.639,6.445 80 | 3.566,5.85 81 | 3.467,5.875 82 | 3.857,6.006 83 | 3.864,6.285 84 | 3.772,6.384 85 | 3.801,6.366 86 | 3.651,6.173 87 | 3.764,6.084 88 | 3.67,6.549 89 | 4.033,6.573 90 | 4.032,6.45 91 | 3.785,6.581 92 | 3.796,6.172 93 | 3.693,6.272 94 | 3.86,6.037 95 | 3.485,6.666 96 | 3.463,6.139 97 | 3.81,6.341 98 | 3.552,6.449 99 | 3.512,6.271 100 | 3.684,6.219 101 | 3.525,5.718 102 | 3.694,5.89 103 | 3.892,6.113 104 | 3.681,6.369 105 | 3.755,6.248 106 | 3.786,6.037 107 | 3.806,6.152 108 | 3.573,6.033 109 | 3.763,6.675 110 | 3.674,6.153 111 | 3.769,6.107 112 | 3.791,6.303 113 | 3.902,6.183 114 | 3.737,6.259 115 | 3.991,6.563 116 | 3.719,6.416 117 | 3.897,6.051 118 | 3.815,6.245 119 | 3.769,6.227 120 | 3.857,6.493 121 | 3.962,6.315 122 | 3.563,6.059 123 | 3.387,5.762 124 | 3.771,5.98 125 | 3.582,5.363 126 | 3.869,6.111 127 | 3.594,6.285 128 | 3.687,5.979 129 | 3.773,6.513 130 | 3.69,5.791 131 | 
3.755,5.979 132 | 3.825,6.144 133 | 3.268,5.884 134 | 3.395,5.845 135 | 3.408,5.776 136 | 3.465,5.477 137 | 3.574,6.145 138 | 3.231,5.92 139 | 3.286,5.832 140 | 3.472,5.872 141 | 2.994,5.472 142 | 3.073,5.541 143 | 3.074,5.389 144 | 2.967,5.224 145 | 2.777,5.314 146 | 2.687,5.279 147 | 2.719,5.176 148 | 2.967,5.267 149 | 2.911,5.386 150 | 2.648,5.317 151 | 2.84,5.263 152 | 2.776,5.405 153 | 2.833,5.408 154 | 2.693,5.22 155 | 2.755,5.175 156 | 2.675,5.25 157 | 2.849,5.053 158 | 2.745,5.394 159 | 2.678,5.444 160 | 2.695,5.304 161 | 2.879,5.451 162 | 2.81,5.35 163 | 2.847,5.267 164 | 2.968,5.333 165 | 2.794,5.011 166 | 2.941,5.105 167 | 2.897,5.319 168 | 2.837,5.417 169 | 2.668,5.176 170 | 2.715,5.09 171 | 2.701,5.325 172 | 2.845,5.167 173 | 2.763,5.088 174 | 2.763,5.136 175 | 2.641,5.278 176 | 2.821,4.981 177 | 2.71,5.186 178 | 2.642,5.145 179 | 2.758,5.18 180 | 2.893,5.357 181 | 2.775,5.09 182 | 3.017,5.236 183 | 2.909,5.24 184 | 2.85,5.108 185 | 3.026,5.495 186 | 2.683,5.363 187 | 2.716,5.413 188 | 2.675,5.088 189 | 2.821,5.089 190 | 2.787,4.899 191 | 2.717,5.046 192 | 2.804,5.091 193 | 2.953,5.132 194 | 2.63,5.18 195 | 2.975,5.236 196 | 3.126,5.16 197 | 3.054,5.224 198 | 3.128,5.32 199 | 2.911,5.41 200 | 3.155,5.073 201 | 2.989,5.219 202 | 3.135,4.984 203 | 2.81,5.009 204 | 3.091,5.183 205 | 2.96,5.204 206 | 2.981,5.137 207 | 2.795,5.14 208 | 3.232,5.236 209 | 2.836,5.175 210 | 2.974,5.243 211 | -------------------------------------------------------------------------------- /_datasets/seeds.csv: -------------------------------------------------------------------------------- 1 | 15.26,14.84,0.871,5.763,3.312,2.221,5.22,1 2 | 14.88,14.57,0.8811,5.554,3.333,1.018,4.956,1 3 | 14.29,14.09,0.905,5.291,3.337,2.699,4.825,1 4 | 13.84,13.94,0.8955,5.324,3.379,2.259,4.805,1 5 | 16.14,14.99,0.9034,5.658,3.562,1.355,5.175,1 6 | 14.38,14.21,0.8951,5.386,3.312,2.462,4.956,1 7 | 14.69,14.49,0.8799,5.563,3.259,3.586,5.219,1 8 | 14.11,14.1,0.8911,5.42,3.302,2.7,5,1 9 | 16.63,15.46,0.8747,6.053,3.465,2.04,5.877,1 10 | 16.44,15.25,0.888,5.884,3.505,1.969,5.533,1 11 | 15.26,14.85,0.8696,5.714,3.242,4.543,5.314,1 12 | 14.03,14.16,0.8796,5.438,3.201,1.717,5.001,1 13 | 13.89,14.02,0.888,5.439,3.199,3.986,4.738,1 14 | 13.78,14.06,0.8759,5.479,3.156,3.136,4.872,1 15 | 13.74,14.05,0.8744,5.482,3.114,2.932,4.825,1 16 | 14.59,14.28,0.8993,5.351,3.333,4.185,4.781,1 17 | 13.99,13.83,0.9183,5.119,3.383,5.234,4.781,1 18 | 15.69,14.75,0.9058,5.527,3.514,1.599,5.046,1 19 | 14.7,14.21,0.9153,5.205,3.466,1.767,4.649,1 20 | 12.72,13.57,0.8686,5.226,3.049,4.102,4.914,1 21 | 14.16,14.4,0.8584,5.658,3.129,3.072,5.176,1 22 | 14.11,14.26,0.8722,5.52,3.168,2.688,5.219,1 23 | 15.88,14.9,0.8988,5.618,3.507,0.7651,5.091,1 24 | 12.08,13.23,0.8664,5.099,2.936,1.415,4.961,1 25 | 15.01,14.76,0.8657,5.789,3.245,1.791,5.001,1 26 | 16.19,15.16,0.8849,5.833,3.421,0.903,5.307,1 27 | 13.02,13.76,0.8641,5.395,3.026,3.373,4.825,1 28 | 12.74,13.67,0.8564,5.395,2.956,2.504,4.869,1 29 | 14.11,14.18,0.882,5.541,3.221,2.754,5.038,1 30 | 13.45,14.02,0.8604,5.516,3.065,3.531,5.097,1 31 | 13.16,13.82,0.8662,5.454,2.975,0.8551,5.056,1 32 | 15.49,14.94,0.8724,5.757,3.371,3.412,5.228,1 33 | 14.09,14.41,0.8529,5.717,3.186,3.92,5.299,1 34 | 13.94,14.17,0.8728,5.585,3.15,2.124,5.012,1 35 | 15.05,14.68,0.8779,5.712,3.328,2.129,5.36,1 36 | 16.12,15,0.9,5.709,3.485,2.27,5.443,1 37 | 16.2,15.27,0.8734,5.826,3.464,2.823,5.527,1 38 | 17.08,15.38,0.9079,5.832,3.683,2.956,5.484,1 39 | 14.8,14.52,0.8823,5.656,3.288,3.112,5.309,1 40 | 
14.28,14.17,0.8944,5.397,3.298,6.685,5.001,1 41 | 13.54,13.85,0.8871,5.348,3.156,2.587,5.178,1 42 | 13.5,13.85,0.8852,5.351,3.158,2.249,5.176,1 43 | 13.16,13.55,0.9009,5.138,3.201,2.461,4.783,1 44 | 15.5,14.86,0.882,5.877,3.396,4.711,5.528,1 45 | 15.11,14.54,0.8986,5.579,3.462,3.128,5.18,1 46 | 13.8,14.04,0.8794,5.376,3.155,1.56,4.961,1 47 | 15.36,14.76,0.8861,5.701,3.393,1.367,5.132,1 48 | 14.99,14.56,0.8883,5.57,3.377,2.958,5.175,1 49 | 14.79,14.52,0.8819,5.545,3.291,2.704,5.111,1 50 | 14.86,14.67,0.8676,5.678,3.258,2.129,5.351,1 51 | 14.43,14.4,0.8751,5.585,3.272,3.975,5.144,1 52 | 15.78,14.91,0.8923,5.674,3.434,5.593,5.136,1 53 | 14.49,14.61,0.8538,5.715,3.113,4.116,5.396,1 54 | 14.33,14.28,0.8831,5.504,3.199,3.328,5.224,1 55 | 14.52,14.6,0.8557,5.741,3.113,1.481,5.487,1 56 | 15.03,14.77,0.8658,5.702,3.212,1.933,5.439,1 57 | 14.46,14.35,0.8818,5.388,3.377,2.802,5.044,1 58 | 14.92,14.43,0.9006,5.384,3.412,1.142,5.088,1 59 | 15.38,14.77,0.8857,5.662,3.419,1.999,5.222,1 60 | 12.11,13.47,0.8392,5.159,3.032,1.502,4.519,1 61 | 11.42,12.86,0.8683,5.008,2.85,2.7,4.607,1 62 | 11.23,12.63,0.884,4.902,2.879,2.269,4.703,1 63 | 12.36,13.19,0.8923,5.076,3.042,3.22,4.605,1 64 | 13.22,13.84,0.868,5.395,3.07,4.157,5.088,1 65 | 12.78,13.57,0.8716,5.262,3.026,1.176,4.782,1 66 | 12.88,13.5,0.8879,5.139,3.119,2.352,4.607,1 67 | 14.34,14.37,0.8726,5.63,3.19,1.313,5.15,1 68 | 14.01,14.29,0.8625,5.609,3.158,2.217,5.132,1 69 | 14.37,14.39,0.8726,5.569,3.153,1.464,5.3,1 70 | 12.73,13.75,0.8458,5.412,2.882,3.533,5.067,1 71 | 17.63,15.98,0.8673,6.191,3.561,4.076,6.06,2 72 | 16.84,15.67,0.8623,5.998,3.484,4.675,5.877,2 73 | 17.26,15.73,0.8763,5.978,3.594,4.539,5.791,2 74 | 19.11,16.26,0.9081,6.154,3.93,2.936,6.079,2 75 | 16.82,15.51,0.8786,6.017,3.486,4.004,5.841,2 76 | 16.77,15.62,0.8638,5.927,3.438,4.92,5.795,2 77 | 17.32,15.91,0.8599,6.064,3.403,3.824,5.922,2 78 | 20.71,17.23,0.8763,6.579,3.814,4.451,6.451,2 79 | 18.94,16.49,0.875,6.445,3.639,5.064,6.362,2 80 | 17.12,15.55,0.8892,5.85,3.566,2.858,5.746,2 81 | 16.53,15.34,0.8823,5.875,3.467,5.532,5.88,2 82 | 18.72,16.19,0.8977,6.006,3.857,5.324,5.879,2 83 | 20.2,16.89,0.8894,6.285,3.864,5.173,6.187,2 84 | 19.57,16.74,0.8779,6.384,3.772,1.472,6.273,2 85 | 19.51,16.71,0.878,6.366,3.801,2.962,6.185,2 86 | 18.27,16.09,0.887,6.173,3.651,2.443,6.197,2 87 | 18.88,16.26,0.8969,6.084,3.764,1.649,6.109,2 88 | 18.98,16.66,0.859,6.549,3.67,3.691,6.498,2 89 | 21.18,17.21,0.8989,6.573,4.033,5.78,6.231,2 90 | 20.88,17.05,0.9031,6.45,4.032,5.016,6.321,2 91 | 20.1,16.99,0.8746,6.581,3.785,1.955,6.449,2 92 | 18.76,16.2,0.8984,6.172,3.796,3.12,6.053,2 93 | 18.81,16.29,0.8906,6.272,3.693,3.237,6.053,2 94 | 18.59,16.05,0.9066,6.037,3.86,6.001,5.877,2 95 | 18.36,16.52,0.8452,6.666,3.485,4.933,6.448,2 96 | 16.87,15.65,0.8648,6.139,3.463,3.696,5.967,2 97 | 19.31,16.59,0.8815,6.341,3.81,3.477,6.238,2 98 | 18.98,16.57,0.8687,6.449,3.552,2.144,6.453,2 99 | 18.17,16.26,0.8637,6.271,3.512,2.853,6.273,2 100 | 18.72,16.34,0.881,6.219,3.684,2.188,6.097,2 101 | 16.41,15.25,0.8866,5.718,3.525,4.217,5.618,2 102 | 17.99,15.86,0.8992,5.89,3.694,2.068,5.837,2 103 | 19.46,16.5,0.8985,6.113,3.892,4.308,6.009,2 104 | 19.18,16.63,0.8717,6.369,3.681,3.357,6.229,2 105 | 18.95,16.42,0.8829,6.248,3.755,3.368,6.148,2 106 | 18.83,16.29,0.8917,6.037,3.786,2.553,5.879,2 107 | 18.85,16.17,0.9056,6.152,3.806,2.843,6.2,2 108 | 17.63,15.86,0.88,6.033,3.573,3.747,5.929,2 109 | 19.94,16.92,0.8752,6.675,3.763,3.252,6.55,2 110 | 18.55,16.22,0.8865,6.153,3.674,1.738,5.894,2 111 | 
18.45,16.12,0.8921,6.107,3.769,2.235,5.794,2 112 | 19.38,16.72,0.8716,6.303,3.791,3.678,5.965,2 113 | 19.13,16.31,0.9035,6.183,3.902,2.109,5.924,2 114 | 19.14,16.61,0.8722,6.259,3.737,6.682,6.053,2 115 | 20.97,17.25,0.8859,6.563,3.991,4.677,6.316,2 116 | 19.06,16.45,0.8854,6.416,3.719,2.248,6.163,2 117 | 18.96,16.2,0.9077,6.051,3.897,4.334,5.75,2 118 | 19.15,16.45,0.889,6.245,3.815,3.084,6.185,2 119 | 18.89,16.23,0.9008,6.227,3.769,3.639,5.966,2 120 | 20.03,16.9,0.8811,6.493,3.857,3.063,6.32,2 121 | 20.24,16.91,0.8897,6.315,3.962,5.901,6.188,2 122 | 18.14,16.12,0.8772,6.059,3.563,3.619,6.011,2 123 | 16.17,15.38,0.8588,5.762,3.387,4.286,5.703,2 124 | 18.43,15.97,0.9077,5.98,3.771,2.984,5.905,2 125 | 15.99,14.89,0.9064,5.363,3.582,3.336,5.144,2 126 | 18.75,16.18,0.8999,6.111,3.869,4.188,5.992,2 127 | 18.65,16.41,0.8698,6.285,3.594,4.391,6.102,2 128 | 17.98,15.85,0.8993,5.979,3.687,2.257,5.919,2 129 | 20.16,17.03,0.8735,6.513,3.773,1.91,6.185,2 130 | 17.55,15.66,0.8991,5.791,3.69,5.366,5.661,2 131 | 18.3,15.89,0.9108,5.979,3.755,2.837,5.962,2 132 | 18.94,16.32,0.8942,6.144,3.825,2.908,5.949,2 133 | 15.38,14.9,0.8706,5.884,3.268,4.462,5.795,2 134 | 16.16,15.33,0.8644,5.845,3.395,4.266,5.795,2 135 | 15.56,14.89,0.8823,5.776,3.408,4.972,5.847,2 136 | 15.38,14.66,0.899,5.477,3.465,3.6,5.439,2 137 | 17.36,15.76,0.8785,6.145,3.574,3.526,5.971,2 138 | 15.57,15.15,0.8527,5.92,3.231,2.64,5.879,2 139 | 15.6,15.11,0.858,5.832,3.286,2.725,5.752,2 140 | 16.23,15.18,0.885,5.872,3.472,3.769,5.922,2 141 | 13.07,13.92,0.848,5.472,2.994,5.304,5.395,3 142 | 13.32,13.94,0.8613,5.541,3.073,7.035,5.44,3 143 | 13.34,13.95,0.862,5.389,3.074,5.995,5.307,3 144 | 12.22,13.32,0.8652,5.224,2.967,5.469,5.221,3 145 | 11.82,13.4,0.8274,5.314,2.777,4.471,5.178,3 146 | 11.21,13.13,0.8167,5.279,2.687,6.169,5.275,3 147 | 11.43,13.13,0.8335,5.176,2.719,2.221,5.132,3 148 | 12.49,13.46,0.8658,5.267,2.967,4.421,5.002,3 149 | 12.7,13.71,0.8491,5.386,2.911,3.26,5.316,3 150 | 10.79,12.93,0.8107,5.317,2.648,5.462,5.194,3 151 | 11.83,13.23,0.8496,5.263,2.84,5.195,5.307,3 152 | 12.01,13.52,0.8249,5.405,2.776,6.992,5.27,3 153 | 12.26,13.6,0.8333,5.408,2.833,4.756,5.36,3 154 | 11.18,13.04,0.8266,5.22,2.693,3.332,5.001,3 155 | 11.36,13.05,0.8382,5.175,2.755,4.048,5.263,3 156 | 11.19,13.05,0.8253,5.25,2.675,5.813,5.219,3 157 | 11.34,12.87,0.8596,5.053,2.849,3.347,5.003,3 158 | 12.13,13.73,0.8081,5.394,2.745,4.825,5.22,3 159 | 11.75,13.52,0.8082,5.444,2.678,4.378,5.31,3 160 | 11.49,13.22,0.8263,5.304,2.695,5.388,5.31,3 161 | 12.54,13.67,0.8425,5.451,2.879,3.082,5.491,3 162 | 12.02,13.33,0.8503,5.35,2.81,4.271,5.308,3 163 | 12.05,13.41,0.8416,5.267,2.847,4.988,5.046,3 164 | 12.55,13.57,0.8558,5.333,2.968,4.419,5.176,3 165 | 11.14,12.79,0.8558,5.011,2.794,6.388,5.049,3 166 | 12.1,13.15,0.8793,5.105,2.941,2.201,5.056,3 167 | 12.44,13.59,0.8462,5.319,2.897,4.924,5.27,3 168 | 12.15,13.45,0.8443,5.417,2.837,3.638,5.338,3 169 | 11.35,13.12,0.8291,5.176,2.668,4.337,5.132,3 170 | 11.24,13,0.8359,5.09,2.715,3.521,5.088,3 171 | 11.02,13,0.8189,5.325,2.701,6.735,5.163,3 172 | 11.55,13.1,0.8455,5.167,2.845,6.715,4.956,3 173 | 11.27,12.97,0.8419,5.088,2.763,4.309,5,3 174 | 11.4,13.08,0.8375,5.136,2.763,5.588,5.089,3 175 | 10.83,12.96,0.8099,5.278,2.641,5.182,5.185,3 176 | 10.8,12.57,0.859,4.981,2.821,4.773,5.063,3 177 | 11.26,13.01,0.8355,5.186,2.71,5.335,5.092,3 178 | 10.74,12.73,0.8329,5.145,2.642,4.702,4.963,3 179 | 11.48,13.05,0.8473,5.18,2.758,5.876,5.002,3 180 | 12.21,13.47,0.8453,5.357,2.893,1.661,5.178,3 181 | 
11.41,12.95,0.856,5.09,2.775,4.957,4.825,3 182 | 12.46,13.41,0.8706,5.236,3.017,4.987,5.147,3 183 | 12.19,13.36,0.8579,5.24,2.909,4.857,5.158,3 184 | 11.65,13.07,0.8575,5.108,2.85,5.209,5.135,3 185 | 12.89,13.77,0.8541,5.495,3.026,6.185,5.316,3 186 | 11.56,13.31,0.8198,5.363,2.683,4.062,5.182,3 187 | 11.81,13.45,0.8198,5.413,2.716,4.898,5.352,3 188 | 10.91,12.8,0.8372,5.088,2.675,4.179,4.956,3 189 | 11.23,12.82,0.8594,5.089,2.821,7.524,4.957,3 190 | 10.59,12.41,0.8648,4.899,2.787,4.975,4.794,3 191 | 10.93,12.8,0.839,5.046,2.717,5.398,5.045,3 192 | 11.27,12.86,0.8563,5.091,2.804,3.985,5.001,3 193 | 11.87,13.02,0.8795,5.132,2.953,3.597,5.132,3 194 | 10.82,12.83,0.8256,5.18,2.63,4.853,5.089,3 195 | 12.11,13.27,0.8639,5.236,2.975,4.132,5.012,3 196 | 12.8,13.47,0.886,5.16,3.126,4.873,4.914,3 197 | 12.79,13.53,0.8786,5.224,3.054,5.483,4.958,3 198 | 13.37,13.78,0.8849,5.32,3.128,4.67,5.091,3 199 | 12.62,13.67,0.8481,5.41,2.911,3.306,5.231,3 200 | 12.76,13.38,0.8964,5.073,3.155,2.828,4.83,3 201 | 12.38,13.44,0.8609,5.219,2.989,5.472,5.045,3 202 | 12.67,13.32,0.8977,4.984,3.135,2.3,4.745,3 203 | 11.18,12.72,0.868,5.009,2.81,4.051,4.828,3 204 | 12.7,13.41,0.8874,5.183,3.091,8.456,5,3 205 | 12.37,13.47,0.8567,5.204,2.96,3.919,5.001,3 206 | 12.19,13.2,0.8783,5.137,2.981,3.631,4.87,3 207 | 11.23,12.88,0.8511,5.14,2.795,4.325,5.003,3 208 | 13.2,13.66,0.8883,5.236,3.232,8.315,5.056,3 209 | 11.84,13.21,0.8521,5.175,2.836,3.598,5.044,3 210 | 12.3,13.34,0.8684,5.243,2.974,5.637,5.063,3 211 | -------------------------------------------------------------------------------- /certifications/Datacamp_course1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saranshbansal/data-science-with-python/651e67d93f8aa65ce386978684523f7e14cd3b2a/certifications/Datacamp_course1.pdf -------------------------------------------------------------------------------- /certifications/Datacamp_course17.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saranshbansal/data-science-with-python/651e67d93f8aa65ce386978684523f7e14cd3b2a/certifications/Datacamp_course17.pdf -------------------------------------------------------------------------------- /certifications/Datacamp_course18.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saranshbansal/data-science-with-python/651e67d93f8aa65ce386978684523f7e14cd3b2a/certifications/Datacamp_course18.pdf -------------------------------------------------------------------------------- /certifications/Datacamp_course2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saranshbansal/data-science-with-python/651e67d93f8aa65ce386978684523f7e14cd3b2a/certifications/Datacamp_course2.pdf -------------------------------------------------------------------------------- /certifications/Datacamp_course3.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saranshbansal/data-science-with-python/651e67d93f8aa65ce386978684523f7e14cd3b2a/certifications/Datacamp_course3.pdf -------------------------------------------------------------------------------- /certifications/Datacamp_course4.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/saranshbansal/data-science-with-python/651e67d93f8aa65ce386978684523f7e14cd3b2a/certifications/Datacamp_course4.pdf -------------------------------------------------------------------------------- /certifications/Datacamp_course5.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saranshbansal/data-science-with-python/651e67d93f8aa65ce386978684523f7e14cd3b2a/certifications/Datacamp_course5.pdf -------------------------------------------------------------------------------- /notes/Datacamp/Data Science With Python Course 5.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saranshbansal/data-science-with-python/651e67d93f8aa65ce386978684523f7e14cd3b2a/notes/Datacamp/Data Science With Python Course 5.docx -------------------------------------------------------------------------------- /notes/Datacamp/Iterators and Generators-Python.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saranshbansal/data-science-with-python/651e67d93f8aa65ce386978684523f7e14cd3b2a/notes/Datacamp/Iterators and Generators-Python.docx -------------------------------------------------------------------------------- /notes/Datacamp/Regression Models.txt: -------------------------------------------------------------------------------- 1 | Regression: Applied over continuous stream of value to predict next value. 2 | Classification: Applied over discrete set of values to identify which 'class' or category the target value is going to fall under. 3 | 4 | -------------------------------------------------------------------------------- /notes/Google ML/Google's ML Crash Course Notes.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saranshbansal/data-science-with-python/651e67d93f8aa65ce386978684523f7e14cd3b2a/notes/Google ML/Google's ML Crash Course Notes.docx -------------------------------------------------------------------------------- /notes/Google ML/Tensorflow Estimators - documentation.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saranshbansal/data-science-with-python/651e67d93f8aa65ce386978684523f7e14cd3b2a/notes/Google ML/Tensorflow Estimators - documentation.docx -------------------------------------------------------------------------------- /notes/R/Exploratory Data Analysis Dimension Reduction.txt: -------------------------------------------------------------------------------- 1 | In other words, we'd like to find the best matrix created with fewer variables (that is, a lower rank 2 | | matrix) that explains the original data. This is related to data compression. 3 | 4 | Two related solutions to these problems are PCA which stands for Principal Component Analysis and SVD, 5 | | Singular Value Decomposition. This latter simply means that we express a matrix X of observations 6 | | (rows) and variables (columns) as the product of 3 other matrices, i.e., X=UDV^t. This last term (V^t) 7 | | represents the transpose of the matrix V. 8 | 9 | 1. principal component analysis (PCA) 10 | 11 | 2. 
singular value decomposition (SVD) -------------------------------------------------------------------------------- /notes/R/Probability.txt: -------------------------------------------------------------------------------- 1 | | If you had a ruler of infinite precision, would measuring the height of adults around the world be 2 | | continuous or discrete? 3 | 4 | 1: continuous 5 | 2: discrete 6 | 7 | Selection: 2 8 | 9 | | Not quite! Try again. 10 | 11 | | The ruler of infinite precision is the hint. Can you list all possible heights? 12 | 13 | 1: discrete 14 | 2: continuous 15 | 16 | Selection: 2 17 | 18 | | You are quite good my friend! 19 | 20 | |=========== | 11% 21 | 22 | | Is the drawing of a hand of cards continuous or discrete? 23 | 24 | 1: continuous 25 | 2: discrete 26 | 27 | Selection: 2 28 | 29 | | All that hard work is paying off! 30 | 31 | |============== | 14% 32 | 33 | | Continuous random variables are usually associated with measurements of time, distance, or some 34 | | biological process since they can take on any value, often within some specified range. Limitations of 35 | | precision in taking the measurements may imply that the values are discrete; we in fact consider them 36 | | continuous. 37 | 38 | A probability mass function (PMF) gives the probability that a discrete random variable is exactly 39 | | equal to some value. 40 | 41 | | For instance, suppose we have a coin which may or may not be fair. Let x=0 represent a 'heads' outcome 42 | | and x=1 represent a 'tails' outcome of a coin toss. If p is the probability of 'heads' which of the 43 | | following represents the PMF of the coin toss? The variable x is either 0 (heads) or 1 (tails). 44 | 45 | 1: (p^x)*(1-p)^(1-x) 46 | 2: (p^(1-x))*(1-p)^x 47 | 48 | A probability density function [PDF] is associated with a continuous random variable. To quote from 49 | | Wikipedia, it "is a function that describes the relative likelihood for this random variable to take 50 | | on a given value. The probability of the random variable falling within a particular range of values 51 | | is given by ... the area under the density function but above the horizontal axis and between the 52 | | lowest and greatest values of the range. 53 | 54 | | We'll repeat two requirements of a probability density function [PDF]. It must be non-negative everywhere, 55 | | and the area under it must equal one." 56 | 57 | The cumulative distribution function (CDF) of a random variable X, either discrete or continuous, is 58 | | the function F(x) equal to the probability that X is less than or equal to x. In the example above, 59 | | the area of the blue triangle represents the probability that the random variable was less than or 60 | | equal to the value 1.6. 61 | 62 | | When the random variable is continuous, as in the example, the PDF is the derivative of the CDF. So integrating the PDF (the line represented by the diagonal) 63 | | yields the CDF. When you evaluate the CDF at the limits of integration the result is an area. 64 | 65 | Now use the R function integrate to integrate mypdf with the parameters lower equal to 0 and upper equal to 1.6. See if you get the same area (probability) you 66 | | got before. 67 | 68 | > integrate(mypdf, 0, 1.6) 69 | 0.64 with absolute error < 7.1e-15 70 | 71 | | You are really on a roll! 
72 | 73 | |========================================================================================================== | 69% 74 | 75 | | The survivor function S(x) of a random variable X is defined as the function of x equal to the probability that the random variable X is greater than the value x. 76 | | This is the complement of the CDF F(x), in our example, the portion of the lower triangle that is not shaded. 77 | 78 | 79 | | We'll close by repeating some important points. 80 | 81 | ... 82 | 83 | |========================================================================================================================================= | 89% 84 | 85 | | A probability model connects data to a population using assumptions. 86 | 87 | ... 88 | 89 | |============================================================================================================================================== | 91% 90 | 91 | | Be careful to distinguish between population medians and sample medians. 92 | 93 | ... 94 | 95 | |================================================================================================================================================== | 94% 96 | 97 | | A sample median is an estimator of a population median (the estimand). 98 | 99 | 100 | | We represent the conditional probability of an event A given that B has occurred with the notation 101 | | P(A|B). More specifically, we define the conditional probability of event A, given that B has occurred 102 | | with the following. 103 | 104 | ... 105 | 106 | |============== | 14% 107 | 108 | | P(A|B) = P(A & B)/ P(B) . P(A|B) is the probability that BOTH A and B occur divided by the probability 109 | | that B occurs. 110 | 111 | | From the definition of P(A|B), we can write P(A&B) = P(A|B) * P(B), right? Let's use this to express 112 | | P(B|A). 113 | | P(B|A) = P(B&A)/P(A) = P(A|B) * P(B)/P(A). This is a simple form of Bayes' Rule which relates the two 114 | | conditional probabilities. 115 | Suppose we don't know P(A) itself, but only know its conditional probabilities, that is, the 116 | | probability that it occurs if B occurs and the probability that it occurs if B doesn't occur. These 117 | | are P(A|B) and P(A|~B), respectively. We use ~B to represent 'not B' or 'B complement'. 118 | 119 | ... 120 | 121 | |=========================== | 29% 122 | 123 | | We can then express P(A) = P(A|B) * P(B) + P(A|~B) * P(~B) and substitute this is into the denominator 124 | | of Bayes' Formula. 
125 | 126 | | P(B|A) = P(A|B) * P(B) / ( P(A|B) * P(B) + P(A|~B) * P(~B) ) -------------------------------------------------------------------------------- /notes/R/Regression Models.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saranshbansal/data-science-with-python/651e67d93f8aa65ce386978684523f7e14cd3b2a/notes/R/Regression Models.txt -------------------------------------------------------------------------------- /notes/R/Rplot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saranshbansal/data-science-with-python/651e67d93f8aa65ce386978684523f7e14cd3b2a/notes/R/Rplot.png -------------------------------------------------------------------------------- /notes/R/Rplot01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saranshbansal/data-science-with-python/651e67d93f8aa65ce386978684523f7e14cd3b2a/notes/R/Rplot01.png -------------------------------------------------------------------------------- /notes/R/Rplot02.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saranshbansal/data-science-with-python/651e67d93f8aa65ce386978684523f7e14cd3b2a/notes/R/Rplot02.png -------------------------------------------------------------------------------- /notes/R/Rplot03.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saranshbansal/data-science-with-python/651e67d93f8aa65ce386978684523f7e14cd3b2a/notes/R/Rplot03.png -------------------------------------------------------------------------------- /notes/R/Rplot04.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saranshbansal/data-science-with-python/651e67d93f8aa65ce386978684523f7e14cd3b2a/notes/R/Rplot04.png -------------------------------------------------------------------------------- /notes/R/Rplot05.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saranshbansal/data-science-with-python/651e67d93f8aa65ce386978684523f7e14cd3b2a/notes/R/Rplot05.png -------------------------------------------------------------------------------- /notes/R/Statistial Interference.txt: -------------------------------------------------------------------------------- 1 | We want to emphasize a couple of important points here. First, a statistic (singular) is a number 2 | | computed from a sample of data. We use statistics to infer information about a population. Second, a 3 | | random variable is an outcome from an experiment. Deterministic processes, such as computing means or 4 | | variances, applied to random variables, produce additional random variables which have their own 5 | | distributions. It's important to keep straight which distributions you're talking about. 6 | 7 | Finally, there are two broad flavors of inference. The first is frequency, which uses "long run 8 | | proportion of times an event occurs in independent, identically distributed repetitions." The second 9 | | is Bayesian in which the probability estimate for a hypothesis is updated as additional evidence is 10 | | acquired. Both flavors require an understanding of probability so that's what the next lessons will 11 | | cover. 
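A quick Python sketch of the two results quoted in the probability notes above (Bayes' rule, and obtaining a probability by integrating a density). The screening-test numbers below are purely hypothetical, and mypdf is assumed here to be the triangular density f(x) = x/2 on [0, 2], an assumption that is at least consistent with the 0.64 the notes report for integrate(mypdf, 0, 1.6), since 1.6^2 / 4 = 0.64.

from scipy.integrate import quad

def bayes_posterior(p_a_given_b, p_b, p_a_given_not_b):
    # P(B|A) = P(A|B)P(B) / (P(A|B)P(B) + P(A|~B)P(~B))
    numerator = p_a_given_b * p_b
    return numerator / (numerator + p_a_given_not_b * (1 - p_b))

# Hypothetical screening test: sensitivity 0.99, prevalence 0.001, false-positive rate 0.05
print(bayes_posterior(0.99, 0.001, 0.05))  # about 0.019: one positive result is still weak evidence

# Assumed density f(x) = x/2 on [0, 2]; the area from 0 to 1.6 reproduces the 0.64 shown in the notes
area, _ = quad(lambda x: x / 2, 0, 1.6)
print(round(area, 2))  # 0.64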
-------------------------------------------------------------------------------- /notes/R/dendo1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saranshbansal/data-science-with-python/651e67d93f8aa65ce386978684523f7e14cd3b2a/notes/R/dendo1.png -------------------------------------------------------------------------------- /notes/R/prob-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saranshbansal/data-science-with-python/651e67d93f8aa65ce386978684523f7e14cd3b2a/notes/R/prob-1.png -------------------------------------------------------------------------------- /notes/R/ways to cluster.txt: -------------------------------------------------------------------------------- 1 | What is Clustering? 2 | Clustering organizes data points that are close together into groups in order to find a relation or pattern. 3 | 4 | Types of Distances: 5 | 1. Euclidean - straight-line distance, computed with the Pythagorean theorem. 6 | 2. Continuous - correlation similarity 7 | 3. Manhattan - the sum of the absolute differences along each coordinate, like a taxi driving through the blocks of Manhattan from point A to B. 8 | 9 | Ways of examining and organizing multi-dimensional data: 10 | 1. Hierarchical Clustering 11 | 2. K-Means Clustering - R documentation tells us that the k-means method "aims to partition the points into k groups such that 12 | the sum of squares from points to the assigned cluster centres is minimized." 13 | 14 | Hierarchical Clustering techniques: 15 | 1. Complete linkage 16 | 2. Average linkage 17 | 3. Heat maps 18 | 19 | K-Means clustering techniques: 20 | 21 | -------------------------------------------------------------------------------- /src/__pycache__/helper.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saranshbansal/data-science-with-python/651e67d93f8aa65ce386978684523f7e14cd3b2a/src/__pycache__/helper.cpython-36.pyc -------------------------------------------------------------------------------- /src/case_studies/case_study_1.1.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | # If you need to use real values 4 | df = pd.DataFrame(pd.read_csv('../_datasets/WDIData_min.csv')) 5 | cols = df.iloc[0, 0:5] 6 | rows = df.iloc[1:10, 0:5].values 7 | 8 | # Dummy subset 9 | feature_names = ['CountryName', 10 | 'CountryCode', 11 | 'IndicatorName', 12 | 'IndicatorCode', 13 | 'Year', 14 | 'Value'] 15 | 16 | row_val = ['Arab World', 17 | 'ARB', 18 | 'Adolescent fertility rate (births per 1,000 women ages 15-19)', 19 | 'SP.ADO.TFRT', 20 | '1960', 21 | '133.56090740552298'] 22 | 23 | row_vals = [['Arab World', 24 | 'ARB', 25 | 'Adolescent fertility rate (births per 1,000 women ages 15-19)', 26 | 'SP.ADO.TFRT', 27 | '1960', 28 | '133.56090740552298'], 29 | ['Arab World', 30 | 'ARB', 31 | 'Age dependency ratio (% of working-age population)', 32 | 'SP.POP.DPND', 33 | '1960', 34 | '87.7976011532547'], 35 | ['Arab World', 36 | 'ARB', 37 | 'Age dependency ratio, old (% of working-age population)', 38 | 'SP.POP.DPND.OL', 39 | '1960', 40 | '6.634579191565161'], 41 | ['Arab World', 42 | 'ARB', 43 | 'Age dependency ratio, young (% of working-age population)', 44 | 'SP.POP.DPND.YG', 45 | '1960', 46 | '81.02332950839141'], 47 | ['Arab World', 48 | 'ARB', 49 | 'Arms exports (SIPRI trend indicator values)', 50 | 'MS.MIL.XPRT.KD', 51 | '1960', 52 | '3000000.0'], 53 | ['Arab World', 54 | 'ARB', 55 | 
'Arms imports (SIPRI trend indicator values)', 56 | 'MS.MIL.MPRT.KD', 57 | '1960', 58 | '538000000.0'], 59 | ['Arab World', 60 | 'ARB', 61 | 'Birth rate, crude (per 1,000 people)', 62 | 'SP.DYN.CBRT.IN', 63 | '1960', 64 | '47.697888095096395'], 65 | ['Arab World', 66 | 'ARB', 67 | 'CO2 emissions (kt)', 68 | 'EN.ATM.CO2E.KT', 69 | '1960', 70 | '59563.9892169935'], 71 | ['Arab World', 72 | 'ARB', 73 | 'CO2 emissions (metric tons per capita)', 74 | 'EN.ATM.CO2E.PC', 75 | '1960', 76 | '0.6439635478877049'], 77 | ['Arab World', 78 | 'ARB', 79 | 'CO2 emissions from gaseous fuel consumption (% of total)', 80 | 'EN.ATM.CO2E.GF.ZS', 81 | '1960', 82 | '5.041291753975099'], 83 | ['Arab World', 84 | 'ARB', 85 | 'CO2 emissions from liquid fuel consumption (% of total)', 86 | 'EN.ATM.CO2E.LF.ZS', 87 | '1960', 88 | '84.8514729446567'], 89 | ['Arab World', 90 | 'ARB', 91 | 'CO2 emissions from liquid fuel consumption (kt)', 92 | 'EN.ATM.CO2E.LF.KT', 93 | '1960', 94 | '49541.707291032304'], 95 | ['Arab World', 96 | 'ARB', 97 | 'CO2 emissions from solid fuel consumption (% of total)', 98 | 'EN.ATM.CO2E.SF.ZS', 99 | '1960', 100 | '4.72698138789597'], 101 | ['Arab World', 102 | 'ARB', 103 | 'Death rate, crude (per 1,000 people)', 104 | 'SP.DYN.CDRT.IN', 105 | '1960', 106 | '19.7544519237187'], 107 | ['Arab World', 108 | 'ARB', 109 | 'Fertility rate, total (births per woman)', 110 | 'SP.DYN.TFRT.IN', 111 | '1960', 112 | '6.92402738655897'], 113 | ['Arab World', 114 | 'ARB', 115 | 'Fixed telephone subscriptions', 116 | 'IT.MLT.MAIN', 117 | '1960', 118 | '406833.0'], 119 | ['Arab World', 120 | 'ARB', 121 | 'Fixed telephone subscriptions (per 100 people)', 122 | 'IT.MLT.MAIN.P2', 123 | '1960', 124 | '0.6167005703199'], 125 | ['Arab World', 126 | 'ARB', 127 | 'Hospital beds (per 1,000 people)', 128 | 'SH.MED.BEDS.ZS', 129 | '1960', 130 | '1.9296220724398703'], 131 | ['Arab World', 132 | 'ARB', 133 | 'International migrant stock (% of population)', 134 | 'SM.POP.TOTL.ZS', 135 | '1960', 136 | '2.9906371279862403'], 137 | ['Arab World', 138 | 'ARB', 139 | 'International migrant stock, total', 140 | 'SM.POP.TOTL', 141 | '1960', 142 | '3324685.0']] 143 | 144 | # Zip lists: zipped_lists 145 | zipped_lists = zip(feature_names, row_val) 146 | 147 | # Create a dictionary: rs_dict 148 | rs_dict = dict(zipped_lists) 149 | 150 | # Print the dictionary 151 | print('--------- 1 --------') 152 | print(rs_dict) 153 | 154 | 155 | # Suppose you needed to repeat the same process done in the previous exercise to many, 156 | # many rows of data. Rewriting your code again and again could become very tedious, repetitive, 157 | # and unmaintainable. 158 | # 159 | # In this exercise, you will create a function to house the code you wrote earlier to make things 160 | # easier and much more concise. Why? This way, you only need to call the function and supply the appropriate 161 | # lists to create your dictionaries! 
Again, the lists feature_names and row_vals are preloaded and 162 | # these contain the header names of the dataset and rows of actual values from the dataset, respectively. 163 | 164 | # Define lists2dict() 165 | def lists2dict(list1, list2): 166 | """Return a dictionary where list1 provides 167 | the keys and list2 provides the values.""" 168 | 169 | # Zip lists: zipped_lists 170 | zipped_lists = zip(list1, list2) 171 | 172 | # Create a dictionary: rs_dict 173 | rs_dict = dict(zipped_lists) 174 | 175 | # Return the dictionary 176 | return rs_dict 177 | 178 | 179 | # Call lists2dict on a single row: rs_fxn 180 | rs_fxn = lists2dict(feature_names, row_val) 181 | 182 | # Print rs_fxn 183 | print('--------- 2 --------') 184 | print(rs_fxn) 185 | 186 | # Using a list comprehension 187 | # 188 | # This time, you're going to use the lists2dict() function you defined in the last exercise to turn a bunch of 189 | # lists into a list of dictionaries with the help of a list comprehension. 190 | # 191 | # The lists2dict() function has already been preloaded, together with a couple of lists, feature_names and row_vals. 192 | # feature_names contains the header names of the World Bank dataset and row_vals is a list of lists, where each 193 | # sublist is a list of actual values of a row from the dataset. 194 | # 195 | # Your goal is to use a list comprehension to generate a list of dicts, where the keys are the header names and the 196 | # values are the row entries. 197 | 198 | # Print the first two lists in row_vals 199 | print('--------- 3 --------') 200 | print(row_vals[0]) 201 | print(row_vals[1]) 202 | 203 | # Turn list of lists into list of dicts: list_of_dicts 204 | list_of_dicts = [lists2dict(feature_names, sublist) for sublist in row_vals] 205 | 206 | # Print the first dictionary in list_of_dicts 207 | print('--------- 4 --------') 208 | print(list_of_dicts[0]) 209 | 210 | # Turning this all into a DataFrame 211 | # 212 | # You've zipped lists together, created a function to house your code, and even used the function in a list 213 | # comprehension to generate a list of dictionaries. That was a lot of work and you did a great job! 214 | # 215 | # You will now use all of these to convert the list of dictionaries into a pandas DataFrame. You will see how 216 | # convenient it is to generate a DataFrame from dictionaries with the DataFrame() function from the pandas package. 217 | # 218 | # The lists2dict() function, feature_names list, and row_vals list have been preloaded for this exercise. 219 | # 220 | # Go for it! 221 | 222 | # Turn list of dicts into a DataFrame: df 223 | df = pd.DataFrame(list_of_dicts) 224 | 225 | # Print the head of the DataFrame 226 | print('--------- 5 --------') 227 | print(df.head()) 228 | -------------------------------------------------------------------------------- /src/case_studies/case_study_1.2.py: -------------------------------------------------------------------------------- 1 | # Processing data in chunks (1) 2 | # 3 | # Sometimes, data sources can be so large in size that storing the entire dataset in memory becomes too 4 | # resource-intensive. In this exercise, you will process the first 1000 rows of a file line by line, to create a 5 | # dictionary of the counts of how many times each country appears in a column in the dataset. 6 | # 7 | # The csv file 'world_dev_ind.csv' is in your current directory for your use. To begin, you need to open a 8 | # connection to this file using what is known as a context manager.
For example, the command with open('datacamp.csv') 9 | # as datacamp binds the csv file 'datacamp.csv' as datacamp in the context manager. Here, the with statement is 10 | # the context manager, and its purpose is to ensure that resources are efficiently allocated when opening a 11 | # connection to a file. 12 | # 13 | # If you'd like to learn more about context managers, refer to the DataCamp course on Importing Data in Python 14 | # (https://www.datacamp.com/courses/importing-data-in-python-part-1). 15 | 16 | # Open a connection to the file 17 | with open('../_datasets/WDIData_min.csv') as file: 18 | # Skip the column names 19 | file.readline() 20 | 21 | # Initialize an empty dictionary: counts_dict 22 | counts_dict = {} 23 | 24 | # Process only the first 1000 rows 25 | for j in range(0, 1000): 26 | 27 | # Split the current line into a list: line 28 | line = file.readline().split(',') 29 | 30 | # Get the value for the first column: first_col 31 | first_col = line[0] 32 | 33 | # If the column value is in the dict, increment its value 34 | if first_col in counts_dict.keys(): 35 | counts_dict[first_col] += 1 36 | 37 | # Else, add to the dict and set value to 1 38 | else: 39 | counts_dict[first_col] = 1 40 | 41 | # Print the resulting dictionary 42 | print(counts_dict) 43 | 44 | 45 | # 46 | # Writing a generator to load data in chunks (2) 47 | # 48 | # In the previous exercise, you processed a file line by line for a given number of lines. What if, however, you want 49 | # to do this for the entire file? 50 | # 51 | # In this case, it would be useful to use generators. Generators allow users to lazily evaluate data. This concept of 52 | # lazy evaluation is useful when you have to deal with very large datasets because it lets you generate values in an 53 | # efficient manner by yielding only chunks of data at a time instead of the whole thing at once. 54 | # 55 | # In this exercise, you will define a generator function read_large_file() that produces a generator object which 56 | # yields a single line from a file each time next() is called on it. The csv file 'world_dev_ind.csv' is in your 57 | # current directory for your use. 58 | # 59 | # Note that when you open a connection to a file, the resulting file object is already a generator! So out in the 60 | # wild, you won't have to explicitly create generator objects in cases such as this. However, for pedagogical reasons, 61 | # we are having you practice how to do this here with the read_large_file() function. Go for it! 62 | # Define read_large_file() 63 | 64 | def read_large_file(file_object): 65 | """A generator function to read a large file lazily.""" 66 | 67 | # Cap the read at 100 lines (instead of looping until the end of the file) so this demo does not grind through the whole, very large file 68 | for i in range(0, 100): 69 | 70 | # Read a line from the file: data 71 | data = file_object.readline() 72 | 73 | # Break if this is the end of the file 74 | if not data: 75 | break 76 | 77 | # Yield the line of data 78 | yield data 79 | 80 | # Create a generator object for the file: gen_file (reopen the file, since the earlier with block has already closed it) 81 | with open('../_datasets/WDIData_min.csv') as file: 82 | gen_file = read_large_file(file) 83 | # Print the first three lines of the file 84 | print(next(gen_file)) 85 | print(next(gen_file)) 86 | print(next(gen_file)) 87 | 88 | 89 | # Writing a generator to load data in chunks (3) 90 | # 91 | # Great! You've just created a generator function that you can use to help you process large files. 92 | # 93 | # Now let's use your generator function to process the World Bank dataset like you did previously.
94 | # You will process the file line by line, to create a dictionary of the counts of how many times each country 95 | # appears in a column in the dataset. For this exercise, however, you won't process just 1000 rows of data, you'll 96 | # process the entire dataset! 97 | # 98 | # The generator function read_large_file() and the csv file 'world_dev_ind.csv' are preloaded and ready for your use. 99 | # Go for it! 100 | # Initialize an empty dictionary: counts_dict 101 | counts_dict = {} 102 | with open('../_datasets/WDIData_min.csv') as file: 103 | # Iterate over the generator from read_large_file() 104 | for line in read_large_file(file): 105 | 106 | row = line.split(',') 107 | first_col = row[0] 108 | 109 | if first_col in counts_dict.keys(): 110 | counts_dict[first_col] += 1 111 | else: 112 | counts_dict[first_col] = 1 113 | 114 | # Print 115 | print(counts_dict) 116 | -------------------------------------------------------------------------------- /src/case_studies/case_study_1.3.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import matplotlib.pyplot as plt 3 | 4 | # Writing an iterator to load data in chunks (1) 5 | # 6 | # Another way to read data too large to store in memory in chunks is to read the file in as DataFrames of a certain 7 | # length, say, 100. For example, with the pandas package (imported as pd), you can do 8 | # pd.read_csv(filename, chunksize=100). This creates an iterable reader object, which means that you can use next() 9 | # on it. 10 | # 11 | # In this exercise, you will read a file in small DataFrame chunks with read_csv(). You're going to use the World 12 | # Bank Indicators data 'ind_pop_data.csv', available in your current directory, to look at the urban population indicator 13 | # for numerous countries and years. 14 | # Import the pandas package 15 | 16 | # Initialize reader object: df_reader 17 | df_reader = pd.read_csv('../_datasets/WDIData_min.csv', chunksize=10) 18 | 19 | # Print two chunks 20 | print(next(df_reader)) 21 | print(next(df_reader)) 22 | 23 | # Writing an iterator to load data in chunks (2) 24 | # 25 | # In the previous exercise, you used read_csv() to read in DataFrame chunks from a large dataset. In this exercise, 26 | # you will read in a file using a bigger DataFrame chunk size and then process the data from the first chunk. 27 | # 28 | # To process the data, you will create another DataFrame composed of only the rows from a specific country. You will 29 | # then zip together two of the columns from the new DataFrame, 'Total Population' and 'Urban population (% of 30 | # total)'. Finally, you will create a list of tuples from the zip object, where each tuple is composed of a value 31 | # from each of the two columns mentioned. 32 | # 33 | # You're going to use the data from 'ind_pop_data.csv', available in your current directory. Pandas has been imported 34 | # as pd. 
35 | 36 | # Initialize reader object: urb_pop_reader 37 | urb_pop_reader = pd.read_csv('../_datasets/ind_pop_data.csv', chunksize=1000) 38 | 39 | # Get the first DataFrame chunk: df_urb_pop 40 | df_urb_pop = next(urb_pop_reader) 41 | 42 | # Check out the head of the DataFrame 43 | print(df_urb_pop.head()) 44 | 45 | # Check out specific country: df_pop_ceb 46 | df_pop_ceb = df_urb_pop[df_urb_pop['CountryCode'] == 'CEB'] 47 | 48 | # Zip DataFrame columns of interest: pops 49 | pops = zip(df_pop_ceb['Total Population'], df_pop_ceb['Urban population (% of total)']) 50 | 51 | # Turn zip object into list: pops_list 52 | pops_list = list(pops) 53 | 54 | # Print pops_list 55 | print(pops_list) 56 | 57 | # Writing an iterator to load data in chunks (3) 58 | # You're getting used to reading and processing data in chunks by now. 59 | # Let's push your skills a little further by adding a column to a DataFrame. 60 | # 61 | # In this exercise, you will be using a list comprehension to create the values for a new column 'Total Urban 62 | # Population' from the list of tuples that you generated earlier. Recall from the previous exercise that the first 63 | # and second elements of each tuple consist of, respectively, values from the columns 'Total Population' and 'Urban 64 | # population (% of total)'. The values in this new column 'Total Urban Population', therefore, are the product of the 65 | # first and second element in each tuple. Furthermore, because the 2nd element is a percentage, you need to divide 66 | # the entire result by 100, or alternatively, multiply it by 0.01. 67 | # 68 | # You will also plot the data from this new column to create a visualization of the urban population data. 69 | # 70 | # You're going to use the data from 'ind_pop_data.csv', available in your current directory. The packages pandas and 71 | # matplotlib.pyplot have been imported as pd and plt respectively for your use. 72 | 73 | # Initialize reader object: urb_pop_reader(see above) 74 | 75 | # Get the first DataFrame chunk: df_urb_pop(see above) 76 | 77 | # Check out specific country: df_pop_ceb(see above) 78 | 79 | # Zip DataFrame columns of interest: pops(see above) 80 | 81 | # Turn zip object into list: pops_list(see above) 82 | 83 | # Use list comprehension to create new DataFrame column 'Total Urban Population' 84 | df_pop_ceb['Total Urban Population'] = [int(tup[0] * tup[1] * 0.01) for tup in pops_list] 85 | print(df_pop_ceb['Total Urban Population']) 86 | 87 | # Plot urban population data 88 | df_pop_ceb.plot(kind="scatter", x='Year', y='Total Urban Population') 89 | plt.show() 90 | 91 | # Writing an iterator to load data in chunks (4) 92 | # 93 | # In the previous exercises, you've only processed the data from the 94 | # first DataFrame chunk. This time, you will aggregate the results over all the DataFrame chunks in the dataset. This 95 | # basically means you will be processing the entire dataset now. This is neat because you're going to be able to 96 | # process the entire large dataset by just working on smaller pieces of it! 97 | # 98 | # You're going to use the data from 'ind_pop_data.csv', available in your current directory. The packages pandas and 99 | # matplotlib.pyplot have been imported as pd and plt respectively for your use. 
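# Illustrative sketch (a hypothetical helper, not part of the exercise): the per-chunk
# steps that the "(see above)" placeholders below stand for, assuming the same
# 'CountryCode', 'Total Population' and 'Urban population (% of total)' columns.
# Defined here for reference only; it is not called anywhere in this script.
def process_chunk_sketch(df_chunk, country_code='CEB'):
    df_country = df_chunk[df_chunk['CountryCode'] == country_code]
    pops_list = list(zip(df_country['Total Population'],
                         df_country['Urban population (% of total)']))
    df_country['Total Urban Population'] = [int(tup[0] * tup[1] * 0.01) for tup in pops_list]
    return df_country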
100 | 101 | # Initialize reader object: urb_pop_reader(see above) 102 | 103 | # Initialize empty DataFrame: data 104 | data = pd.DataFrame() 105 | 106 | # Iterate over each DataFrame chunk 107 | for df_urb_pop in urb_pop_reader: 108 | # Check out specific country: df_pop_ceb(see above) 109 | 110 | # Zip DataFrame columns of interest: pops(see above) 111 | 112 | # Turn zip object into list: pops_list(see above) 113 | 114 | # Use list comprehension to create new DataFrame column 'Total Urban Population1'(similar to above) 115 | df_pop_ceb['Total Urban Population1'] = [int(tup[0] * tup[1]) for tup in pops_list] 116 | 117 | # Append DataFrame chunk to data: data 118 | data = data.append(df_pop_ceb) 119 | 120 | # Plot urban population data 121 | data.plot(kind='scatter', x='Year', y='Total Urban Population1') 122 | plt.show() -------------------------------------------------------------------------------- /src/case_studies/case_study_pipelining_and_scaling.py: -------------------------------------------------------------------------------- 1 | # Bringing it all together II: 2 | # 3 | # Pipeline for regression For this final exercise, you will return to the Gapminder 4 | # dataset. Guess what? Even this dataset has missing values that we dealt with for you in earlier chapters! Now, 5 | # you have all the tools to take care of them yourself! 6 | # 7 | # Your job is to build a pipeline that imputes the missing data, scales the features, and fits an ElasticNet to the 8 | # Gapminder data. You will then tune the l1_ratio of your ElasticNet using GridSearchCV. 9 | # 10 | # All the necessary modules have been imported, and the feature and target variable arrays have been pre-loaded as X 11 | # and y. 12 | 13 | import numpy as np 14 | import pandas as pd 15 | from sklearn.model_selection import GridSearchCV 16 | from sklearn.model_selection import train_test_split 17 | from sklearn.pipeline import Pipeline 18 | from sklearn.preprocessing import Imputer 19 | from sklearn.preprocessing import StandardScaler 20 | from sklearn.linear_model import ElasticNet 21 | 22 | from helper import path 23 | 24 | # Read 'gm_2008_region.csv' into a DataFrame: df 25 | df = pd.read_csv(path + 'gm_2008_region.csv') 26 | 27 | X = df.drop('life', axis=1) 28 | y = df['life'] 29 | 30 | # Setup the pipeline steps: steps 31 | steps = [('imputation', Imputer(missing_values='NaN', strategy='mean', axis=0)), 32 | ('scaler', StandardScaler()), 33 | ('elasticnet', ElasticNet())] 34 | 35 | # Create the pipeline: pipeline 36 | pipeline = Pipeline(steps) 37 | 38 | # Specify the hyperparameter space 39 | parameters = {'elasticnet__l1_ratio': np.linspace(0, 1, 30)} 40 | 41 | # Create train and test sets 42 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42) 43 | 44 | # Create the GridSearchCV object: gm_cv 45 | gm_cv = GridSearchCV(pipeline, param_grid=parameters) 46 | 47 | # Fit to the training set 48 | gm_cv.fit(X_train, y_train) 49 | 50 | # Compute and print the metrics 51 | r2 = gm_cv.score(X_test, y_test) 52 | print("Tuned ElasticNet Alpha: {}".format(gm_cv.best_params_)) 53 | print("Tuned ElasticNet R squared: {}".format(r2)) 54 | -------------------------------------------------------------------------------- /src/case_studies/case_study_trumps_twitter_RTs.py: -------------------------------------------------------------------------------- 1 | # Import pandas as pd 2 | import pandas as pd 3 | 4 | # Import twitter data 5 | tweets_df = pd.DataFrame(pd.read_excel("../_datasets/Trump 
Tweets(2017).xlsx")) 6 | 7 | 8 | # Define count_entries() 9 | def count_entries(df, *args): 10 | """Return a dictionary with counts of 11 | occurrences as value for each key.""" 12 | 13 | # Initialize an empty dictionary: cols_count 14 | cols_count = {} 15 | 16 | # Iterate over column names in args 17 | for col_name in args: 18 | 19 | # Extract column from DataFrame: col 20 | col = df[col_name] 21 | 22 | # Iterate over the column in DataFrame 23 | for entry in col: 24 | 25 | # If entry is in cols_count, add 1 26 | if entry in cols_count.keys(): 27 | cols_count[entry] += 1 28 | 29 | # Else add the entry to cols_count, set the value to 1 30 | else: 31 | cols_count[entry] = 1 32 | 33 | # Return the cols_count dictionary 34 | return cols_count 35 | 36 | 37 | # Call count_entries(): result2 38 | result = count_entries(tweets_df, 'Tweet') 39 | 40 | # Filter our Retweets 41 | retweets = (lambda x: x[0:2] == 'RT', tweets_df['Tweet']) 42 | 43 | # Print result 44 | 45 | # print(list(result)) 46 | for tweet in retweets: 47 | print(tweet) 48 | -------------------------------------------------------------------------------- /src/case_studies/case_study_urban_population_trends.py: -------------------------------------------------------------------------------- 1 | # Case Study: Plot Urban population trends in various countries over the years based on publically available data set. 2 | # 3 | # In this case study, I have to define the function plot_pop() which takes two arguments: the filename of the file to 4 | # be processed, and the country code of the rows we want to process in the dataset. 5 | # 6 | # calling the function already does the following: 7 | # 8 | # Loading of the file chunk by chunk, 9 | # Creating the new column of urban population values, 10 | # and Plotting the urban population data. 11 | # 12 | # The function makes it convenient to repeat the same process for whatever file and country code we want to process 13 | # and visualize! 14 | # 15 | # We are using the data from 'ind_pop_data.csv', available in /_datasets/ directory. 16 | # The packages pandas and matplotlib.pyplot has been imported as pd and plt respectively. 17 | # 18 | # If you have enjoyed working with this data, you can continue exploring it using the pre-processed version available 19 | # on Kaggle. 
20 | 21 | import pandas as pd 22 | import matplotlib.pyplot as plt 23 | 24 | def_file_path = '../../_datasets/' 25 | 26 | # Define plot_pop() 27 | def plot_pop(filename, country_code): 28 | # Initialize reader object: urb_pop_reader 29 | urb_pop_reader = pd.read_csv(filename, chunksize=1000) 30 | 31 | # Initialize empty DataFrame: data 32 | data = pd.DataFrame() 33 | 34 | # Iterate over each DataFrame chunk 35 | for df_urb_pop in urb_pop_reader: 36 | # Check out specific country: df_pop_ceb 37 | df_pop_ceb = df_urb_pop[df_urb_pop['CountryCode'] == country_code] 38 | 39 | # Zip DataFrame columns of interest: pops 40 | pops = zip(df_pop_ceb['Total Population'], 41 | df_pop_ceb['Urban population (% of total)']) 42 | 43 | # Turn zip object into list: pops_list 44 | pops_list = list(pops) 45 | 46 | # Use list comprehension to create new DataFrame column 'Total Urban Population' 47 | df_pop_ceb['Total Urban Population'] = [int(tup[0] * tup[1]) for tup in pops_list] 48 | 49 | # Append DataFrame chunk to data: data 50 | data = data.append(df_pop_ceb) 51 | 52 | # Plot urban population data 53 | data.plot(kind='scatter', x='Year', y='Total Urban Population') 54 | plt.show() 55 | 56 | 57 | # Set the filename: fn 58 | fn = 'ind_pop_data.csv' 59 | 60 | # Call plot_pop for country code 'CEB' 61 | plot_pop(def_file_path + fn, 'CEB') 62 | 63 | # Call plot_pop for country code 'ARB' 64 | plot_pop(def_file_path + fn, 'ARB') 65 | -------------------------------------------------------------------------------- /src/case_studies/case_study_webscraping_imdb.py: -------------------------------------------------------------------------------- 1 | """ 2 | @author: Saransh Bansal 3 | Purpose: Scrape top 250 movies in IMDB and visualize the frequency of these top films released in specific years 4 | """ 5 | import os 6 | import re 7 | import sys 8 | 9 | import numpy as np 10 | import requests 11 | 12 | import matplotlib.pyplot as plt 13 | import pandas as pd 14 | from bs4 import BeautifulSoup 15 | 16 | os.getcwd() # current working directory 17 | 18 | # get the current encoding 19 | type = sys.getfilesystemencoding() 20 | 21 | # request the webpage 22 | req = requests.get("https://www.imdb.com/chart/top") 23 | page = req.text 24 | 25 | soup = BeautifulSoup(page, 'html.parser') 26 | print(soup.prettify()) 27 | 28 | # get top 250 movie names and years, may take ~30 seconds 29 | movie_names = [] 30 | movie_year = [0] * 250 31 | 32 | j = 0 33 | for i in range(250): 34 | title = str(soup.findAll('td', {'class': 'titleColumn'})[i]) 35 | movie_names.append(re.findall('>(.*?)', title)[0]) 36 | 37 | year = str(soup.findAll('span', {'class': 'secondaryInfo'})[i]) 38 | movie_year[i] = int(re.findall(r"\(([0-9_]+)\)", year)[0]) 39 | 40 | # keep track of the progress 41 | print('Extracted movie :: ' + movie_names[i] + ' (' + str(movie_year[i]) + ') ') 42 | j = j + 1 43 | 44 | print(movie_names) 45 | print(movie_year) 46 | 47 | 48 | def encode_title(item): 49 | return str(item.encode('utf-8')) 50 | 51 | 52 | # export to the text file 53 | open("top250names.txt", "w").write("\n".join(encode_title(item) for item in movie_names)) 54 | 55 | # compute the frequency table 56 | y = np.bincount(movie_year) 57 | ii = np.nonzero(y)[0] 58 | out = list(zip(ii, y[ii])) 59 | # create a dataframe 60 | df = pd.DataFrame(out, columns=['Year', 'Freq'], index=ii) 61 | # drop the first Year column since I already assign valid index 62 | df.drop(df.columns[0], axis=1) 63 | # plot 64 | plt.plot(ii, df['Freq']) 65 | plt.show() 66 | 
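# Quick illustration (added for clarity, not part of the scrape above): how the
# bincount/nonzero pair turns a list of years into (year, count) pairs. np.bincount
# indexes by value, so position 1994 holds the number of occurrences of 1994, etc.
_demo_years = [1994, 1994, 1999, 2008]
_demo_counts = np.bincount(_demo_years)
_demo_years_seen = np.nonzero(_demo_counts)[0]
print(list(zip(_demo_years_seen, _demo_counts[_demo_years_seen])))  # [(1994, 2), (1999, 1), (2008, 1)]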
-------------------------------------------------------------------------------- /src/case_studies/top250names.txt: -------------------------------------------------------------------------------- 1 | b'The Shawshank Redemption' 2 | b'The Godfather' 3 | b'The Godfather: Part II' 4 | b'The Dark Knight' 5 | b'12 Angry Men' -------------------------------------------------------------------------------- /src/core/py_comprehensions.py: -------------------------------------------------------------------------------- 1 | # Create a list of strings: fellowship 2 | fellowship = ['frodo', 'samwise', 'merry', 'aragorn', 'legolas', 'boromir', 'gimli'] 3 | 4 | # Create list comprehension: new_fellowship with condition in predicate expression 5 | new_fellowship = [member for member in fellowship if (len(member) >= 7)] 6 | 7 | # Create list comprehension: new_fellowship with condition in predicate expression 8 | new_fellowship_1 = [member if (len(member) >= 7) else '' for member in fellowship] 9 | 10 | # Print the new list 11 | print(new_fellowship) 12 | 13 | # Print the new list 14 | print(new_fellowship_1) 15 | 16 | # -----------------------------# 17 | 18 | # Create dict comprehension: new_fellowship_dict 19 | new_fellowship_dict = {member: len(member) for member in fellowship} 20 | 21 | # Print the new list 22 | print(new_fellowship_dict) 23 | -------------------------------------------------------------------------------- /src/core/py_enumeration_example.py: -------------------------------------------------------------------------------- 1 | # Create a list of strings: mutants 2 | mutants = ['charles xavier', 3 | 'bobby drake', 4 | 'kurt wagner', 5 | 'max eisenhardt', 6 | 'kitty pride'] 7 | 8 | # Create a list of tuples: mutant_list 9 | mutant_list = enumerate(mutants) 10 | 11 | # Print the list of tuples 12 | print(list(mutant_list)) 13 | 14 | print('----------\n') 15 | 16 | # Unpack and print the tuple pairs 17 | for index1, value1 in enumerate(mutants): 18 | print(index1, value1) 19 | 20 | print('----------\n') 21 | 22 | # Change the start index 23 | for index2, value2 in enumerate(mutants, start=1): 24 | print(index2, value2) 25 | -------------------------------------------------------------------------------- /src/core/py_filter_example.py: -------------------------------------------------------------------------------- 1 | # Import reduce from functools 2 | from functools import reduce 3 | 4 | # Create a list of strings: stark 5 | stark = ['robb', 'sansa', 'arya', 'eddard', 'jon'] 6 | 7 | # Use reduce() to apply a lambda function over stark: result 8 | result = reduce(lambda item1, item2: item1 + item2, stark) 9 | 10 | # Print the result 11 | print(result) 12 | -------------------------------------------------------------------------------- /src/core/py_generators.py: -------------------------------------------------------------------------------- 1 | # Create a list of strings 2 | lannister = ['cersei', 'jaime', 'tywin', 'tyrion', 'joffrey'] 3 | 4 | 5 | # Define generator function get_lengths 6 | def get_lengths(input_list): 7 | """Generator function that yields the 8 | length of the strings in input_list.""" 9 | 10 | # Yield the length of a string 11 | for person in input_list: 12 | yield len(person) 13 | 14 | 15 | # Print the values generated by get_lengths() 16 | for value in get_lengths(lannister): 17 | print(value) 18 | -------------------------------------------------------------------------------- /src/core/py_iterable_and_iterator.py: 
-------------------------------------------------------------------------------- 1 | # An ITERABLE is: 2 | # 3 | # anything that can be looped over (i.e. you can loop over a string or file) or 4 | # anything that can appear on the right-side of a for-loop: for x in iterable: ... or 5 | # anything you can call with iter() that will return an ITERATOR: iter(obj) or 6 | # an object that defines __iter__ that returns a fresh ITERATOR, 7 | # or it may have a __getitem__ method suitable for indexed lookup. 8 | # 9 | # An ITERATOR is an object: 10 | # 11 | # with state that remembers where it is during iteration, 12 | # with a __next__ method that: 13 | # returns the next value in the iteration 14 | # updates the state to point at the next value 15 | # signals when it is done by raising StopIteration 16 | # and that is self-iterable (meaning that it has an __iter__ method that returns self). 17 | # 18 | # Notes: 19 | # 20 | # The __next__ method in Python 3 is spelt next in Python 2, and 21 | # The builtin function next() calls that method on the object passed to it. 22 | # 23 | # EXAMPLES :: 24 | 25 | # s is a str object that is immutable 26 | # s has no state 27 | # s has a __getitem__() method 28 | s = 'cat' # s is an ITERABLE 29 | print(next(s)) # TypeError: 'str' object is not an iterator 30 | 31 | # t has state (it starts by pointing at the "c" 32 | # t has a next() method and an __iter__() method 33 | t = iter(s) # t is an ITERATOR 34 | 35 | next(t) # the next() function returns the next value and advances the state 36 | next(t) # the next() function returns the next value and advances 37 | next(t) # the next() function returns the next value and advances 38 | next(t) # next() raises StopIteration to signal that iteration is complete 39 | 40 | # >>> iter(t) is t # the iterator is self-iterable 41 | -------------------------------------------------------------------------------- /src/core/py_regex.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | mytext = str([ 4 | '20080620033027/http://www.mrvc.indianrail.gov.in/overview.htm). _Official webpage of Mumbai Railway Vikas Corporation_. Archived from [the original](http://www.mrvc.indianrail.gov.in/overview.htm) on 2008-06-20. 
Retrieved 2008-12-11.']) 5 | 6 | myregex = r'(?:[a-zA-Z0-9](?:[a-zA-Z0-9\-]{,61}[a-zA-Z0-9])?\.)+[a-zA-Z]{2,6}' 7 | x = re.findall(myregex, mytext) 8 | 9 | result = [] 10 | for res in x: 11 | result.append(res.replace("www.", "").split('//')[-1].split('/')[0]) 12 | 13 | print(';'.join(result)) 14 | -------------------------------------------------------------------------------- /src/db/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saranshbansal/data-science-with-python/651e67d93f8aa65ce386978684523f7e14cd3b2a/src/db/__init__.py -------------------------------------------------------------------------------- /src/db/py_mongo_integration.py: -------------------------------------------------------------------------------- 1 | import pprint 2 | 3 | import pymongo 4 | from bson import ObjectId 5 | from pymongo import MongoClient 6 | 7 | MONGODB_HOST = 'localhost' 8 | MONGODB_PORT = 27017 9 | DB_NAME = 'testdb' 10 | COLLECTION_TEST = 'collection_1' 11 | COLLECTION_PROFILES = 'profiles' 12 | 13 | test_data = { 14 | 'title': 'My First MongoDB document', 15 | 'author': 'Saransh Bansal', 16 | 'likes': 100, 17 | } 18 | 19 | user_profiles = [{'user_id': 211, 'name': 'Luke'}, {'user_id': 212, 'name': 'Ziltoid'}] 20 | 21 | 22 | class MongoUtil(): 23 | db = None 24 | 25 | def __init__(self): 26 | client = MongoClient(MONGODB_HOST, MONGODB_PORT) 27 | self.db = client.DB_NAME 28 | 29 | def connect_to_mongo(self): 30 | return self.db 31 | 32 | def print_collection(self, coll_name): 33 | mycol = self.db[coll_name] 34 | print(mycol) 35 | 36 | def insert_document(self, coll_name, document=None): 37 | mycol = self.db[coll_name] 38 | mycol.insert_one(document) 39 | 40 | def update_document(self, coll_name, obj_id): 41 | mycol = self.db[coll_name] 42 | mycol.update_one({'_id': ObjectId(obj_id)}, {"$set": {"title": "abc"}}) 43 | 44 | def print_all(self, coll_name): 45 | results = self.db[coll_name].find() 46 | for node in results: 47 | pprint.pprint(node) 48 | 49 | def count_documents(self, coll_name): 50 | mycol = self.db[coll_name] 51 | count = mycol.count_documents({}) 52 | print(count) 53 | 54 | def create_profiles(self, coll_name): 55 | self.db[coll_name].insert_many(user_profiles) 56 | 57 | def create_index(self, coll_name, index_col): 58 | self.db[coll_name].create_index([(index_col, pymongo.ASCENDING)], 59 | unique=True) 60 | print(sorted(list(self.db[coll_name].index_information()))) 61 | 62 | 63 | instance = MongoUtil() 64 | 65 | instance.print_collection(COLLECTION_TEST) 66 | 67 | # instance.insert_document(COLLECTION_TEST, test_data) 68 | 69 | # instance.create_profiles(COLLECTION_PROFILES) 70 | 71 | instance.print_all(COLLECTION_TEST) 72 | 73 | print('\n') 74 | 75 | instance.print_all(COLLECTION_PROFILES) 76 | 77 | print('\n') 78 | 79 | instance.count_documents(COLLECTION_TEST) 80 | 81 | # instance.update_document(COLLECTION_TEST, '5d2c87e03c30f6680050c521') 82 | 83 | instance.create_index(COLLECTION_PROFILES, 'user_id') 84 | -------------------------------------------------------------------------------- /src/db/py_sql.py: -------------------------------------------------------------------------------- 1 | # Import necessary module 2 | import pandas as pd 3 | from sqlalchemy import create_engine 4 | 5 | from helper import path 6 | 7 | # Create engine: engine 8 | engine = create_engine('sqlite:///' + path + 'Chinook.sqlite') 9 | 10 | # Save the table names to a list: table_names 11 | table_names = engine.table_names() 12 
| 13 | # Print the table names to the shell 14 | print(table_names) 15 | 16 | # Open engine connection: con 17 | con = engine.connect() 18 | 19 | # Perform query: rs 20 | rs = con.execute('select * from Album') 21 | 22 | # Save results of the query to DataFrame: df 23 | df1 = pd.DataFrame(rs.fetchall()) 24 | 25 | # Close connection 26 | con.close() 27 | 28 | # Print head of DataFrame df 29 | print(df1.head()) 30 | 31 | # --------------------------------------- 32 | 33 | # Perform query and save results to DataFrame: df 34 | with engine.connect() as con: 35 | rs = con.execute('select LastName, Title from Employee') 36 | df2 = pd.DataFrame(rs.fetchmany(size=3)) 37 | df2.columns = rs.keys() 38 | 39 | # Print the length of the DataFrame df 40 | print(len(df2)) 41 | 42 | # Print the head of the DataFrame df 43 | print(df2.head()) 44 | 45 | # --------------------------------------- 46 | # Open engine in context manager 47 | # Perform query and save results to DataFrame: df 48 | with engine.connect() as con: 49 | rs = con.execute('select * from Employee where EmployeeId >= 6') 50 | df3 = pd.DataFrame(rs.fetchall()) 51 | df3.columns = rs.keys() 52 | 53 | # Print the head of the DataFrame df 54 | print(df3.head()) 55 | -------------------------------------------------------------------------------- /src/db/py_sql_with_pandas.py: -------------------------------------------------------------------------------- 1 | # Import necessary module 2 | import random 3 | from datetime import datetime 4 | 5 | import pandas as pd 6 | from sqlalchemy import create_engine 7 | 8 | from helper import path 9 | 10 | # Create engine: engine 11 | engine = create_engine('sqlite:///' + path + 'Chinook.sqlite'); 12 | 13 | # Execute query and store records in DataFrame: df 14 | df1 = pd.read_sql_query('select * from Album', engine) 15 | 16 | # Execute query and store records in DataFrame: df 17 | df2 = pd.read_sql_query('select * from Employee where EmployeeId >= 6 order by BirthDate', engine) 18 | 19 | df3 = pd.read_sql_query('select Title, Name from Album al inner join Artist ar on al.ArtistID=ar.ArtistID', engine) 20 | 21 | df4 = pd.read_sql_query( 22 | 'select * from PlaylistTrack INNER JOIN Track on PlaylistTrack.TrackId = Track.TrackId where Milliseconds < 250000', 23 | engine) 24 | 25 | rand_dates = [datetime(random.randrange(2000, 2001), random.randrange(1, 6), random.randrange(1, 3)) for d in 26 | range(0, len(df4))] 27 | df4['dates'] = rand_dates 28 | 29 | df4 = df4.loc[:, ~df4.columns.duplicated()] 30 | # Print head of DataFrame 31 | # print(df1.head()) 32 | # print(df2.head()) 33 | # print(df3.head()) 34 | # print(df4.head()) 35 | df4 = df4.groupby([df4.dates.dt.year.rename('year'), df4.dates.dt.month.rename('month')]).size() 36 | print(df4) 37 | # df4.groupby(df4.dates).agg({'count'}) 38 | # print(df4.columns) 39 | # print(df4['dates'].value_counts()) 40 | 41 | -------------------------------------------------------------------------------- /src/file_operations/py_corrupt_file_read.py: -------------------------------------------------------------------------------- 1 | # Import matplotlib.pyplot as plt 2 | import matplotlib.pyplot as plt 3 | import pandas as pd 4 | 5 | from helper import path 6 | 7 | # Assign filename: file 8 | file = 'titanic_corrupt.txt' 9 | 10 | # Import file: data 11 | data = pd.read_csv(path + file, sep='\t', comment='#', na_values=['NA', 'NaN', "Nothing"]) 12 | 13 | # Print the head of the DataFrame 14 | print(data.head()) 15 | 16 | # Plot 'Age' variable in a histogram 17 | 
pd.DataFrame.hist(data[['Age']]) 18 | plt.xlabel('Age (years)') 19 | plt.ylabel('count') 20 | plt.show() 21 | -------------------------------------------------------------------------------- /src/file_operations/py_default_file_read_1.py: -------------------------------------------------------------------------------- 1 | from helper import path 2 | 3 | # Open a file: file 4 | file = open(path + 'moby_dick.txt', mode='r') 5 | 6 | # Print it 7 | print(file.read()) 8 | 9 | # Check whether file is closed 10 | print(file.closed) 11 | 12 | # Close file 13 | file.close() 14 | 15 | # Check whether file is closed 16 | print(file.closed) 17 | 18 | # Importing text file_operations line by line 19 | 20 | # Read & print the first 3 lines 21 | with open(path + 'moby_dick.txt') as file: 22 | print(file.readline()) 23 | print(file.readline()) 24 | print(file.readline()) 25 | 26 | 27 | -------------------------------------------------------------------------------- /src/file_operations/py_numpy_file_read_1.py: -------------------------------------------------------------------------------- 1 | # Import package 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | from helper import path 6 | 7 | # Assign filename to variable: file 8 | file = 'digits.csv' 9 | 10 | # Load file as array: digits 11 | digits = np.loadtxt(path + file, delimiter=',', dtype=str) 12 | 13 | # Print datatype of digits 14 | print(type(digits)) 15 | 16 | # Select and reshape a row 17 | im = digits[21, 1:] 18 | im_sq = np.reshape(im, (8, 98)) 19 | 20 | # Plot reshaped data (matplotlib.pyplot already loaded as plt) 21 | plt.imshow(im_sq, cmap='Greys', interpolation='nearest') 22 | plt.show() 23 | -------------------------------------------------------------------------------- /src/file_operations/py_numpy_file_read_2.py: -------------------------------------------------------------------------------- 1 | # Import package 2 | import numpy as np 3 | import pandas as pd 4 | 5 | from helper import path 6 | 7 | # Assign the filename: file 8 | file = 'amis.csv' 9 | 10 | # Load the data: data 11 | data = np.loadtxt(path + file, delimiter=',', skiprows=1, usecols=[1, 3]) 12 | 13 | # Print data 14 | print(data) 15 | 16 | # Read flat file to data frame 17 | 18 | # Read the first 5 rows of the file into a DataFrame: data 19 | data = pd.read_csv(path + file, nrows=5, header=None) 20 | 21 | # Build a numpy array from the DataFrame: data_array 22 | data_array = np.array(data) 23 | 24 | # Print the datatype of data_array to the shell 25 | print(type(data_array)) 26 | -------------------------------------------------------------------------------- /src/file_operations/py_pandas_excel_read.py: -------------------------------------------------------------------------------- 1 | # Listing sheets in Excel file_operations 2 | # 3 | # Whether you like it or not, any working data scientist will need to deal with Excel spreadsheets at some point in 4 | # time. You won't always want to do so in Excel, however! 5 | # 6 | # Here, you'll learn how to use pandas to import Excel spreadsheets and how to list the names of the sheets in any 7 | # loaded .xlsx file. 8 | # 9 | # Recall from the video that, given an Excel file imported into a variable spreadsheet, you can retrieve a list of 10 | # the sheet names using the attribute spreadsheet.sheet_names. 11 | # 12 | # Specifically, you'll be loading and checking out the spreadsheet 'battledeath.xlsx', modified from the Peace 13 | # Research Institute Oslo's (PRIO) dataset. 
This data contains age-adjusted mortality rates due to war in various 14 | # countries over several years. 15 | 16 | # Import pandas 17 | import pandas as pd 18 | 19 | from helper import path 20 | 21 | # Assign spreadsheet filename: file 22 | file = 'battledeath.xlsx' 23 | 24 | # Load spreadsheet: xl 25 | xl = pd.ExcelFile(path + file) 26 | 27 | # Print sheet names 28 | print(xl.sheet_names) 29 | 30 | # ------------------------------------------- 31 | # Load a sheet into a DataFrame by name: df1 32 | df1 = xl.parse('2004') 33 | 34 | # Print the head of the DataFrame df1 35 | print(df1.head()) 36 | 37 | # Load a sheet into a DataFrame by index: df2 38 | # parse_args :: sheet index/name | skiprows | custom column names | parse_cols columns to show 39 | df2 = xl.parse(0, skiprows=[0]) 40 | 41 | # Print the head of the DataFrame df2 42 | print(df2.head()) 43 | 44 | # PS: both are ~ same! 45 | -------------------------------------------------------------------------------- /src/file_operations/py_pandas_file_read_1.py: -------------------------------------------------------------------------------- 1 | # Import pandas as pd 2 | import pandas as pd 3 | from helper import path 4 | # Assign the filename: file 5 | file = 'digits.csv' 6 | 7 | # Read the file into a DataFrame: df 8 | df = pd.read_csv(file) 9 | 10 | # View the head of the DataFrame 11 | print(df.head()) 12 | -------------------------------------------------------------------------------- /src/file_operations/py_pandas_read_csv.py: -------------------------------------------------------------------------------- 1 | # Import pandas as pd 2 | import pandas as pd 3 | 4 | # Import the cars.csv data: cars 5 | cars = pd.DataFrame(pd.read_csv("../_datasets/cars.csv")) 6 | tweets = pd.DataFrame(pd.read_csv("../_datasets/tweets.csv")) 7 | jobs = pd.DataFrame(pd.read_csv("../_datasets/Information_gain_job_advertisements.csv")) 8 | industries = pd.DataFrame(pd.read_json("../_datasets/industries.json")) 9 | 10 | # Print out cars 11 | print(cars.describe()) 12 | 13 | # Print out tweets 14 | print(tweets.keys()) 15 | 16 | # Print all columns of industries 17 | print(list(industries.resultList)) 18 | -------------------------------------------------------------------------------- /src/file_operations/py_pickle_read_test.py: -------------------------------------------------------------------------------- 1 | # Save a dictionary into a pickle file. 2 | import pickle 3 | 4 | from helper import path 5 | 6 | d = {'Aug': '85', 'Airline': '8', 'June': '69.4', 'Mar': '84.4'} 7 | pickle.dump(d, open(path + "data.pk1", "wb")) 8 | 9 | # Load the dictionary back from the pickle file. 
10 | d = pickle.load(open(path + 'data.pk1', "rb")) 11 | 12 | print(d) 13 | 14 | print(type(d)) 15 | -------------------------------------------------------------------------------- /src/file_operations/py_read_hdf5_file.py: -------------------------------------------------------------------------------- 1 | import h5py 2 | 3 | from helper import path 4 | 5 | # Assign filename: file 6 | file = 'NEONDS.hdf5' 7 | 8 | # Load file: data 9 | data = h5py.File(path + file, 'r') 10 | 11 | # Print the datatype of the loaded file 12 | print(type(data)) 13 | 14 | # Print the keys of the file 15 | for key in data.keys(): 16 | print(data[key]) 17 | -------------------------------------------------------------------------------- /src/file_operations/py_read_matlab_file.py: -------------------------------------------------------------------------------- 1 | # Import package 2 | import scipy.io 3 | import matplotlib.pyplot as plt 4 | import numpy as np 5 | 6 | from helper import path 7 | 8 | # Load MATLAB file: mat 9 | mat = scipy.io.loadmat(path + 'albeck_gene_expression.mat') 10 | 11 | # Print the datatype type of mat 12 | print(type(mat)) 13 | 14 | # Print the keys of the MATLAB dictionary 15 | print(mat.keys()) 16 | 17 | # Print the type of the value corresponding to the key 'CYratioCyt' 18 | print(type(mat['fret'])) 19 | 20 | # Print the shape of the value corresponding to the key 'CYratioCyt' 21 | print(np.shape(mat['fret'])) 22 | 23 | # Subset the array and plot it 24 | data = mat['fret'][25, 5:] 25 | fig = plt.figure() 26 | plt.plot(data) 27 | plt.xlabel('time (min.)') 28 | plt.ylabel('normalized fluorescence (measure of expression)') 29 | plt.show() 30 | 31 | -------------------------------------------------------------------------------- /src/file_operations/py_read_sas_file.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import matplotlib.pyplot as plt 3 | 4 | # Import sas7bdat package 5 | from sas7bdat import SAS7BDAT 6 | 7 | from helper import path 8 | 9 | # Save file to a DataFrame: df_sas 10 | with SAS7BDAT(path+'sales.sas7bdat') as file: 11 | df_sas = file.to_data_frame() 12 | 13 | # Print head of DataFrame 14 | print((df_sas.head())) 15 | 16 | # Plot histogram of DataFrame features (pandas and pyplot already imported) 17 | pd.DataFrame.hist(df_sas[['P']]) 18 | plt.ylabel('count') 19 | plt.show() 20 | -------------------------------------------------------------------------------- /src/file_operations/py_read_stata_file.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import pandas as pd 3 | 4 | # Import sas7bdat package 5 | 6 | from helper import path 7 | 8 | df = pd.read_stata(path + 'disarea.dta', 'rb') 9 | 10 | # Print head of DataFrame 11 | print(df.head()) 12 | 13 | 14 | # Plot histogram of DataFrame features (pandas and pyplot already imported) 15 | def plot(key): 16 | if key not in ['wbcode', 'country']: 17 | pd.DataFrame.hist(df[[key]]) 18 | plt.ylabel('count') 19 | plt.show() 20 | 21 | for key in df.keys(): 22 | plot(key) 23 | 24 | 25 | -------------------------------------------------------------------------------- /src/file_operations/py_test_loops_algo.py: -------------------------------------------------------------------------------- 1 | # Here are some comparisons of the performances for in, set and bisect. Note the time (in second) is in log scale. 
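# Added note (not from the original benchmark): the expected asymptotics behind this
# comparison -- membership in a plain list is O(n) per lookup, membership in a set is
# O(1) on average (hash lookup), and bisect_left on a pre-sorted list is O(log n) per
# lookup after a one-off O(n log n) sort, which is why the three curves separate as the
# list size grows.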
2 | 3 | import bisect 4 | import math 5 | import random 6 | import time 7 | 8 | import matplotlib.pyplot as plt 9 | 10 | 11 | def method_in(a, b, c): 12 | start_time = time.time() 13 | for i, x in enumerate(a): 14 | if x in b: 15 | c[i] = 1 16 | return time.time() - start_time 17 | 18 | 19 | def method_set_in(a, b, c): 20 | start_time = time.time() 21 | s = set(b) 22 | for i, x in enumerate(a): 23 | if x in s: 24 | c[i] = 1 25 | return time.time() - start_time 26 | 27 | 28 | def method_bisect(a, b, c): 29 | start_time = time.time() 30 | b.sort() 31 | for i, x in enumerate(a): 32 | index = bisect.bisect_left(b, x) 33 | if index < len(a): 34 | if x == b[index]: 35 | c[i] = 1 36 | return time.time() - start_time 37 | 38 | 39 | def profile(): 40 | time_method_in = [] 41 | time_method_set_in = [] 42 | time_method_bisect = [] 43 | 44 | Nls = [x for x in range(1000, 20000, 1000)] 45 | for N in Nls: 46 | a = [x for x in range(0, N)] 47 | random.shuffle(a) 48 | b = [x for x in range(0, N)] 49 | random.shuffle(b) 50 | c = [0 for x in range(0, N)] 51 | 52 | time_method_in.append(math.log(method_in(a, b, c))) 53 | time_method_set_in.append(math.log(method_set_in(a, b, c))) 54 | time_method_bisect.append(math.log(method_bisect(a, b, c))) 55 | 56 | plt.plot(Nls, time_method_in, marker='o', color='r', linestyle='-', label='in') 57 | plt.plot(Nls, time_method_set_in, marker='o', color='b', linestyle='-', label='set') 58 | plt.plot(Nls, time_method_bisect, marker='o', color='g', linestyle='-', label='bisect') 59 | plt.xlabel('list size', fontsize=18) 60 | plt.ylabel('log(time)', fontsize=18) 61 | plt.legend(loc='upper left') 62 | plt.show() 63 | 64 | 65 | profile() 66 | -------------------------------------------------------------------------------- /src/file_operations/read_in_chunks.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from helper import path 3 | 4 | 5 | # Define count_entries() 6 | def count_entries(csv_file, c_size, colname): 7 | """Return a dictionary with counts of 8 | occurrences as value for each key.""" 9 | 10 | # Initialize an empty dictionary: counts_dict 11 | counts_dict = {} 12 | 13 | # Iterate over the file chunk by chunk 14 | for chunk in pd.read_csv(csv_file, chunksize=c_size): 15 | 16 | # Iterate over the column in DataFrame 17 | for entry in chunk[colname]: 18 | if entry in counts_dict.keys(): 19 | counts_dict[entry] += 1 20 | else: 21 | counts_dict[entry] = 1 22 | 23 | # Return counts_dict 24 | return counts_dict 25 | 26 | 27 | # Call count_entries(): result_counts 28 | result_counts = count_entries(path + 'Information_gain_job_advertisements.csv', 10, 'Term') 29 | 30 | # Print result_counts 31 | print(result_counts) 32 | -------------------------------------------------------------------------------- /src/file_operations/read_tweets.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | import csv 5 | 6 | import tweepy # https://github.com/tweepy/tweepy 7 | 8 | # Twitter API credentials 9 | consumer_key = "" 10 | consumer_secret = "" 11 | access_key = "" 12 | access_secret = "" 13 | 14 | 15 | def get_all_tweets(screen_name): 16 | # Twitter only allows access to a users most recent 3240 tweets with this method 17 | 18 | # authorize twitter, initialize tweepy 19 | auth = tweepy.OAuthHandler(consumer_key, consumer_secret) 20 | auth.set_access_token(access_key, access_secret) 21 | api = tweepy.API(auth) 22 | 23 | # initialize a 
list to hold all the tweepy Tweets 24 | alltweets = [] 25 | 26 | # make initial request for most recent tweets (200 is the maximum allowed count) 27 | new_tweets = api.user_timeline(screen_name=screen_name, count=200) 28 | 29 | # save most recent tweets 30 | alltweets.extend(new_tweets) 31 | 32 | # save the id of the oldest tweet less one 33 | oldest = alltweets[-1].id - 1 34 | 35 | # keep grabbing tweets until there are no tweets left to grab 36 | while len(new_tweets) > 0: 37 | print 38 | "getting tweets before %s" % oldest 39 | 40 | # all subsiquent requests use the max_id param to prevent duplicates 41 | new_tweets = api.user_timeline(screen_name=screen_name, count=200, max_id=oldest) 42 | 43 | # save most recent tweets 44 | alltweets.extend(new_tweets) 45 | 46 | # update the id of the oldest tweet less one 47 | oldest = alltweets[-1].id - 1 48 | 49 | print 50 | "...%s tweets downloaded so far" % (len(alltweets)) 51 | 52 | # transform the tweepy tweets into a 2D array that will populate the csv 53 | outtweets = [[tweet.id_str, tweet.created_at, tweet.text.encode("utf-8")] for tweet in alltweets] 54 | 55 | # write the csv 56 | with open('%s_tweets.csv' % screen_name, 'wb') as f: 57 | writer = csv.writer(f) 58 | writer.writerow(["id", "created_at", "text"]) 59 | writer.writerows(outtweets) 60 | 61 | pass 62 | 63 | 64 | if __name__ == '__main__': 65 | # pass in the username of the account you want to download 66 | get_all_tweets("J_tsar") 67 | -------------------------------------------------------------------------------- /src/grains_data_from_dataset.py: -------------------------------------------------------------------------------- 1 | import csv 2 | 3 | import numpy as np 4 | 5 | from helper import path 6 | 7 | with open('../' + path + 'seeds-width-vs-length.csv', 'r') as f: 8 | grains = list(csv.reader(f, delimiter=',')) 9 | grains = np.array(grains).astype(np.float) -------------------------------------------------------------------------------- /src/misc/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saranshbansal/data-science-with-python/651e67d93f8aa65ce386978684523f7e14cd3b2a/src/misc/__init__.py -------------------------------------------------------------------------------- /src/misc/py_test_loops_algo.py: -------------------------------------------------------------------------------- 1 | # Here are some comparisons of the performances for in, set and bisect. Note the time (in second) is in log scale. 
2 | 3 | import bisect 4 | import math 5 | import random 6 | import time 7 | 8 | import matplotlib.pyplot as plt 9 | 10 | 11 | def method_in(a, b, c): 12 | start_time = time.time() 13 | for i, x in enumerate(a): 14 | if x in b: 15 | c[i] = 1 16 | return time.time() - start_time 17 | 18 | 19 | def method_set_in(a, b, c): 20 | start_time = time.time() 21 | s = set(b) 22 | for i, x in enumerate(a): 23 | if x in s: 24 | c[i] = 1 25 | return time.time() - start_time 26 | 27 | 28 | def method_bisect(a, b, c): 29 | start_time = time.time() 30 | b.sort() 31 | for i, x in enumerate(a): 32 | index = bisect.bisect_left(b, x) 33 | if index < len(a): 34 | if x == b[index]: 35 | c[i] = 1 36 | return time.time() - start_time 37 | 38 | 39 | def profile(): 40 | time_method_in = [] 41 | time_method_set_in = [] 42 | time_method_bisect = [] 43 | 44 | Nls = [x for x in range(1000, 20000, 1000)] 45 | for N in Nls: 46 | a = [x for x in range(0, N)] 47 | random.shuffle(a) 48 | b = [x for x in range(0, N)] 49 | random.shuffle(b) 50 | c = [0 for x in range(0, N)] 51 | 52 | time_method_in.append(math.log(method_in(a, b, c))) 53 | time_method_set_in.append(math.log(method_set_in(a, b, c))) 54 | time_method_bisect.append(math.log(method_bisect(a, b, c))) 55 | 56 | plt.plot(Nls, time_method_in, marker='o', color='r', linestyle='-', label='in') 57 | plt.plot(Nls, time_method_set_in, marker='o', color='b', linestyle='-', label='set') 58 | plt.plot(Nls, time_method_bisect, marker='o', color='g', linestyle='-', label='bisect') 59 | plt.xlabel('list size', fontsize=18) 60 | plt.ylabel('log(time)', fontsize=18) 61 | plt.legend(loc='upper left') 62 | plt.show() 63 | 64 | 65 | profile() 66 | -------------------------------------------------------------------------------- /src/misc/py_zip_example.py: -------------------------------------------------------------------------------- 1 | # Using zip 2 | # 3 | # Another interesting function that you've learned is zip(), which takes any number of iterables and 4 | # returns a zip object that is an iterator of tuples. If you wanted to print the values of a zip object, 5 | # you can convert it into a list and then print it. Printing just a zip object will not return the values unless you 6 | # unpack it first. In this exercise, you will explore this for yourself. 7 | # 8 | # Three lists of strings are pre-loaded: mutants, aliases, and powers. First, you will use list() and zip() on these 9 | # lists to generate a list of tuples. Then, you will create a zip object using zip(). Finally, you will unpack this 10 | # zip object in a for loop to print the values in each tuple. Observe the different output generated by printing the 11 | # list of tuples, then the zip object, and finally, the tuple values in the for loop. 
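# Added mini-example (not part of the exercise): a zip object is a one-shot iterator, so
# it is exhausted after a single pass, and zip(*pairs) "unzips" a list of tuples back
# into separate sequences.
pairs = list(zip(['a', 'b'], [1, 2]))  # [('a', 1), ('b', 2)]
letters, numbers = zip(*pairs)         # ('a', 'b') and (1, 2)
print(pairs, letters, numbers)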
12 | 13 | mutants = ['charles xavier', 14 | 'bobby drake', 15 | 'kurt wagner', 16 | 'max eisenhardt', 17 | 'kitty pride'] 18 | 19 | aliases = ['prof x', 'iceman', 'nightcrawler', 'magneto', 'shadowcat'] 20 | 21 | powers = ['telepathy', 22 | 'thermokinesis', 23 | 'teleportation', 24 | 'magnetokinesis', 25 | 'intangibility'] 26 | 27 | # Create a list of tuples: mutant_data 28 | mutant_data = list(zip(mutants, aliases, powers)) 29 | 30 | # Print the list of tuples 31 | print(mutant_data) 32 | 33 | # Create a zip object using the three lists: mutant_zip 34 | mutant_zip = zip(mutants, aliases, powers) 35 | 36 | # Print the zip object 37 | print(mutant_zip) 38 | 39 | # Unpack the zip object and print the tuple values 40 | for value1, value2, value3 in mutant_zip: 41 | print(value1, value2, value3) 42 | -------------------------------------------------------------------------------- /src/misc/random.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | 3 | a = [x for x in range(3, 13)] 4 | print(a) 5 | value = [] 6 | print(value[0] if value else 0) 7 | 8 | val = '2018,1' 9 | date_object = datetime.strptime(val, '%Y,%m') 10 | 11 | print(date_object.strftime("%b %y")) 12 | 13 | print('sdada' + '123') 14 | 15 | 16 | class B(object): 17 | def __init__(self): 18 | body = "aaa" 19 | self.context = { 20 | 'body': body, 21 | } 22 | 23 | 24 | class A(B): 25 | def __init__(self): 26 | super().__init__() 27 | self.context['body'] = self.context['body'] + "BBB" 28 | self.context = { 29 | **self.context, 30 | } 31 | print(self.context['body']) 32 | 33 | 34 | class C(B): 35 | def __init__(self): 36 | super().__init__() 37 | 38 | print(self.context['body']) 39 | 40 | 41 | b = B() 42 | a = A() 43 | c = C() 44 | 45 | data = { 46 | 'key': 100 47 | } 48 | 49 | print('{}\\xE2\\x80\\xAD\\xE2\\x80\\xAD'.format(data)) 50 | 51 | print('+381652522560') 52 | 53 | name = 'Larry Lam' 54 | if name: 55 | names = name.split(' ') 56 | given = names[0] 57 | family = names[len(names) - 1] 58 | print('{}, {}'.format(given, family)) 59 | 60 | 61 | print(' '.strip() or 'NA') 62 | 63 | -------------------------------------------------------------------------------- /src/misc/tensorflow_starter.py: -------------------------------------------------------------------------------- 1 | import math 2 | from builtins import print 3 | 4 | import numpy as np 5 | import pandas as pd 6 | import tensorflow as tf 7 | from sklearn import metrics 8 | from tensorflow.python.data import Dataset 9 | 10 | tf.logging.set_verbosity(tf.logging.ERROR) 11 | pd.options.display.max_rows = 10 12 | pd.options.display.float_format = '{:.1f}'.format 13 | 14 | california_housing_dataframe = pd.read_csv('https://storage.googleapis.com/mledu-datasets/california_housing_train.csv', 15 | sep=",") 16 | california_housing_dataframe = california_housing_dataframe.reindex( 17 | np.random.permutation(california_housing_dataframe.index)) 18 | california_housing_dataframe["median_house_value"] /= 1000.0 19 | california_housing_dataframe 20 | 21 | # Define the input feature: total_rooms. 22 | my_feature = california_housing_dataframe[["total_rooms"]] 23 | 24 | # Configure a numeric feature column for total_rooms. 25 | feature_columns = [tf.feature_column.numeric_column("total_rooms")] 26 | 27 | # Define the label. 28 | targets = california_housing_dataframe["median_house_value"] 29 | 30 | # Use gradient descent as the optimizer for training the model. 
31 | my_optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.0000001) 32 | my_optimizer = tf.contrib.estimator.clip_gradients_by_norm(my_optimizer, 5.0) 33 | 34 | # Configure the linear regression model with our feature columns and optimizer. 35 | # Set a learning rate of 0.0000001 for Gradient Descent. 36 | linear_regressor = tf.estimator.LinearRegressor( 37 | feature_columns=feature_columns, 38 | optimizer=my_optimizer 39 | ) 40 | 41 | 42 | def my_input_fn(features, targets, batch_size=1, shuffle=True, num_epochs=None): 43 | """Trains a linear regression model of one feature. 44 | 45 | Args: 46 | features: pandas DataFrame of features 47 | targets: pandas DataFrame of targets 48 | batch_size: Size of batches to be passed to the model 49 | shuffle: True or False. Whether to shuffle the data. 50 | num_epochs: Number of epochs for which data should be repeated. None = repeat indefinitely 51 | Returns: 52 | Tuple of (features, labels) for next data batch 53 | """ 54 | 55 | # Convert pandas data into a dict of np arrays. 56 | features = {key: np.array(value) for key, value in dict(features).items()} 57 | 58 | # Construct a dataset, and configure batching/repeating. 59 | ds = Dataset.from_tensor_slices((features, targets)) # warning: 2GB limit 60 | ds = ds.batch(batch_size).repeat(num_epochs) 61 | 62 | # Shuffle the data, if specified. 63 | if shuffle: 64 | ds = ds.shuffle(buffer_size=10000) 65 | 66 | # Return the next batch of data. 67 | features, labels = ds.make_one_shot_iterator().get_next() 68 | return features, labels 69 | 70 | 71 | _ = linear_regressor.train( 72 | input_fn=lambda: my_input_fn(my_feature, targets), 73 | steps=100 74 | ) 75 | 76 | # Create an input function for predictions. 77 | # Note: Since we're making just one prediction for each example, we don't 78 | # need to repeat or shuffle the data here. 79 | prediction_input_fn =lambda: my_input_fn(my_feature, targets, num_epochs=1, shuffle=False) 80 | 81 | # Call predict() on the linear_regressor to make predictions. 82 | predictions = linear_regressor.predict(input_fn=prediction_input_fn) 83 | 84 | # Format predictions as a NumPy array, so we can calculate error metrics. 85 | predictions = np.array([item['predictions'][0] for item in predictions]) 86 | 87 | # Print Mean Squared Error and Root Mean Squared Error. 
88 | mean_squared_error = metrics.mean_squared_error(predictions, targets) 89 | root_mean_squared_error = math.sqrt(mean_squared_error) 90 | print("Mean Squared Error (on training data): %0.3f" % mean_squared_error) 91 | print("Root Mean Squared Error (on training data): %0.3f" % root_mean_squared_error) -------------------------------------------------------------------------------- /src/ml-supervised/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saranshbansal/data-science-with-python/651e67d93f8aa65ce386978684523f7e14cd3b2a/src/ml-supervised/__init__.py -------------------------------------------------------------------------------- /src/ml-supervised/course-description.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saranshbansal/data-science-with-python/651e67d93f8aa65ce386978684523f7e14cd3b2a/src/ml-supervised/course-description.png -------------------------------------------------------------------------------- /src/ml-supervised/course-description.rtf: -------------------------------------------------------------------------------- 1 | {\rtf1\ansi\ansicpg1252\cocoartf1561\cocoasubrtf200 2 | {\fonttbl\f0\fswiss\fcharset0 Helvetica;} 3 | {\colortbl;\red255\green255\blue255;\red85\green92\blue98;\red255\green255\blue255;} 4 | {\*\expandedcolortbl;;\cssrgb\c40784\c43529\c45882;\cssrgb\c100000\c100000\c100000;} 5 | \margl1440\margr1440\vieww10800\viewh8400\viewkind0 6 | \deftab720 7 | \pard\pardeftab720\partightenfactor0 8 | 9 | \f0\fs30 \cf2 \cb3 \expnd0\expndtw0\kerning0 10 | At the end of day, the value of Data Scientists rests on their ability to describe the world and to make predictions. Machine Learning is the field of teaching machines and computers to learn from existing data to make predictions on new data - will a given tumor be benign or malignant? Which of your customers will take their business elsewhere? Is a particular email spam or not? In this course, you'll learn how to use Python to perform supervised learning, an essential component of Machine Learning. You'll learn how to build predictive models, how to tune their parameters and how to tell how well they will perform on unseen data, all the while using real world datasets. You'll do so using scikit-learn, one of the most popular and user-friendly machine learning libraries for Python.} -------------------------------------------------------------------------------- /src/ml-supervised/k-fold_cross_validation.py: -------------------------------------------------------------------------------- 1 | # 5-fold cross-validation 2 | # 3 | # Cross-validation is a vital step in evaluating a model. It maximizes the amount of data 4 | # that is used to train the model, as during the course of training, the model is not only trained, but also tested 5 | # on all of the available data. 6 | # 7 | # In this exercise, you will practice 5-fold cross validation on the Gapminder data. By default, scikit-learn's 8 | # cross_val_score() function uses R2 as the metric of choice for regression. Since you are performing 5-fold 9 | # cross-validation, the function will return 5 scores. Your job is to compute these 5 scores and then take their 10 | # average. 11 | # 12 | # The DataFrame has been loaded as df and split into the feature/target variable arrays X and y. The modules pandas 13 | # and numpy have been imported as pd and np, respectively. 
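# Illustrative sketch (not part of the exercise): roughly what cross_val_score() does
# behind the scenes with cv=5 -- split the data into 5 folds, fit on 4 of them, score on
# the held-out fold, and collect the 5 scores. Shown on tiny synthetic data so it runs
# on its own; the underscore-prefixed names are made up for this sketch.
import numpy as _np
from sklearn.linear_model import LinearRegression as _LinearRegression
from sklearn.model_selection import KFold as _KFold

_X = _np.arange(20, dtype=float).reshape(-1, 1)
_y = 3 * _X.ravel() + 1
_scores = []
for _train_idx, _test_idx in _KFold(n_splits=5).split(_X):
    _model = _LinearRegression().fit(_X[_train_idx], _y[_train_idx])
    _scores.append(_model.score(_X[_test_idx], _y[_test_idx]))
print("Manual 5-fold R^2 scores (synthetic data):", _scores)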
14 | 15 | # Import the necessary modules 16 | import numpy as np 17 | import pandas as pd 18 | from sklearn.linear_model import LinearRegression 19 | from sklearn.model_selection import cross_val_score 20 | 21 | from helper import path 22 | 23 | # Read the CSV file into a DataFrame: df 24 | df = pd.read_csv(path + 'gm_2008_region.csv') 25 | 26 | # Create arrays for features and target variable 27 | X = df['fertility'].values 28 | y = df['life'].values 29 | 30 | # Reshape X and y 31 | X = X.reshape(-1, 1) 32 | y = y.reshape(-1, 1) 33 | 34 | # Create a linear regression object: reg 35 | reg = LinearRegression() 36 | 37 | # Compute 5-fold cross-validation scores: cv_scores 38 | cv_scores = cross_val_score(reg, X, y, cv=5) 39 | 40 | # Print the 5-fold cross-validation scores 41 | print(cv_scores) 42 | 43 | print("Average 5-Fold CV Score: {}".format(np.mean(cv_scores))) 44 | 45 | # --------------------------------- 46 | 47 | # Test time for 3-fold & 10 fold operations :: %timeit cross_val_score(reg, X, y, cv = ____) 48 | # Perform 3-fold CV 49 | cvscores_3 = cross_val_score(reg, X, y, cv=3) 50 | print("Average 3-Fold CV Score: {}".format(np.mean(cvscores_3))) 51 | 52 | # Perform 10-fold CV 53 | cvscores_10 = cross_val_score(reg, X, y, cv=10) 54 | print("Average 10-Fold CV Score: {}".format(np.mean(cvscores_10))) 55 | -------------------------------------------------------------------------------- /src/ml-supervised/ml_centering_and_scaling.py: -------------------------------------------------------------------------------- 1 | # Centering and scaling your data 2 | # 3 | # In the video, Hugo demonstrated how significantly the performance of a model can 4 | # improve if the features are scaled. Note that this is not always the case: In the Congressional voting records 5 | # dataset, for example, all of the features are binary. In such a situation, scaling will have minimal impact. 6 | # 7 | # You will now explore scaling for yourself on a new dataset - White Wine Quality! Hugo used the Red Wine Quality 8 | # dataset in the video. We have used the 'quality' feature of the wine to create a binary target variable: If 9 | # 'quality' is less than 5, the target variable is 1, and otherwise, it is 0. 10 | # 11 | # The DataFrame has been pre-loaded as df, along with the feature and target variable arrays X and y. Explore it in 12 | # the IPython Shell. Notice how some features seem to have different units of measurement. 'density', for instance, 13 | # only takes values between 0 and 1, while 'total sulfur dioxide' has a maximum value of 289. As a result, 14 | # it may be worth scaling the features here. Your job in this exercise is to scale the features and compute the mean 15 | # and standard deviation of the unscaled features compared to the scaled features. 
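# Tiny added example (not part of the exercise): what scale() actually does -- subtract
# each column's mean and divide by its standard deviation, so every feature ends up with
# mean ~0 and standard deviation ~1, regardless of its original units.
import numpy as _np
from sklearn.preprocessing import scale as _scale

_toy = _np.array([[1.0, 100.0],
                  [2.0, 200.0],
                  [3.0, 300.0]])
print(_scale(_toy))               # columns standardized despite very different scales
print(_scale(_toy).mean(axis=0))  # ~[0. 0.]
print(_scale(_toy).std(axis=0))   # ~[1. 1.]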
16 | 17 | 18 | # Import scale 19 | import numpy as np 20 | import pandas as pd 21 | from sklearn.preprocessing import scale 22 | 23 | from helper import path 24 | 25 | # Read 'white-wine.csv' into a DataFrame: df 26 | df = pd.read_csv(path + 'white-wine.csv') 27 | 28 | X = df.drop('quality', axis=1) 29 | y = df['quality'] 30 | 31 | # Scale the features: X_scaled 32 | X_scaled = scale(X) 33 | 34 | # Print the mean and standard deviation of the unscaled features 35 | print("Mean of Unscaled Features: {}".format(np.mean(X))) 36 | print("Standard Deviation of Unscaled Features: {}".format(np.std(X))) 37 | 38 | # Print the mean and standard deviation of the scaled features 39 | print("Mean of Scaled Features: {}".format(np.mean(X_scaled))) 40 | print("Standard Deviation of Scaled Features: {}".format(np.std(X_scaled))) 41 | 42 | # ----------- 43 | 44 | 45 | # Import the necessary modules 46 | from sklearn.preprocessing import StandardScaler 47 | from sklearn.pipeline import Pipeline 48 | from sklearn.neighbors import KNeighborsClassifier 49 | from sklearn.model_selection import train_test_split 50 | 51 | # Setup the pipeline steps: steps 52 | steps = [('scaler', StandardScaler()), 53 | ('knn', KNeighborsClassifier())] 54 | 55 | # Create the pipeline: pipeline 56 | pipeline = Pipeline(steps) 57 | 58 | # Create train and test sets 59 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42) 60 | 61 | # Fit the pipeline to the training set: knn_scaled 62 | knn_scaled = pipeline.fit(X_train, y_train) 63 | 64 | # Instantiate and fit a k-NN classifier to the unscaled data 65 | knn_unscaled = KNeighborsClassifier().fit(X_train, y_train) 66 | 67 | # Compute and print metrics 68 | print('Accuracy with Scaling: {}'.format(knn_scaled.score(X_test, y_test))) 69 | print('Accuracy without Scaling: {}'.format(knn_unscaled.score(X_test, y_test))) 70 | -------------------------------------------------------------------------------- /src/ml-supervised/ml_manually_remove_missing_data.py: -------------------------------------------------------------------------------- 1 | # Regression with categorical features 2 | # 3 | # Having created the dummy variables from the 'Region' feature, you can build 4 | # regression models as you did before. Here, you'll use ridge regression to perform 5-fold cross-validation. 5 | # The feature array X and target variable array y have been pre-loaded. 6 | # 7 | # Dropping missing data 8 | # 9 | # The voting dataset from Chapter 1 contained a bunch of missing values that we dealt with for 10 | # you behind the scenes. Now, it's time for you to take care of these yourself! 11 | # 12 | # The unprocessed dataset has been loaded into a DataFrame df. Explore it in the IPython Shell with the .head() 13 | # method. You will see that there are certain data points labeled with a '?'. These denote missing values. As you saw 14 | # in the video, different datasets encode missing values in different ways. Sometimes it may be a '9999', 15 | # other times a 0 - real-world data can be very messy! If you're lucky, the missing values will already be encoded as 16 | # NaN. We use NaN because it is an efficient and simplified way of internally representing missing data, and it lets 17 | # us take advantage of pandas methods such as .dropna() and .fillna(), as well as scikit-learn's Imputation 18 | # transformer Imputer(). 19 | # In this exercise, your job is to convert the '?'s to NaNs, and then drop the rows that contain them from the 20 | # DataFrame. 
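# Illustrative aside: a toy example (hypothetical data, not the voting or Gapminder datasets) of
# marking '?' entries as NaN and then either dropping or imputing the affected rows.
import numpy as np
import pandas as pd

toy = pd.DataFrame({'vote1': ['y', '?', 'n'], 'vote2': ['?', 'n', 'y']})
toy = toy.replace('?', np.nan)        # same effect as toy[toy == '?'] = np.nan
print(toy.dropna().shape)             # (1, 2): every row containing a NaN is removed
print(toy.fillna('n'))                # alternative: impute a value instead of dropping rows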
21 | 22 | import matplotlib.pyplot as plt 23 | import numpy as np 24 | import pandas as pd 25 | 26 | from helper import path 27 | 28 | # Read 'gapminder.csv' into a DataFrame: df 29 | df = pd.read_csv(path + 'gm_2008_region.csv') 30 | 31 | # Create a boxplot of life expectancy per region 32 | df.boxplot('life', 'Region', rot=60) 33 | 34 | # Show the plot 35 | plt.show() 36 | 37 | # ---------------------- 38 | 39 | # Create arrays for features and target variable 40 | X = df['population'].values 41 | y = df['life'].values 42 | 43 | # Reshape X and y 44 | X = X.reshape(-1, 1) 45 | y = y.reshape(-1, 1) 46 | 47 | # Import necessary modules 48 | from sklearn.linear_model import Ridge 49 | from sklearn.model_selection import cross_val_score 50 | 51 | # Instantiate a ridge regressor: ridge 52 | ridge = Ridge(alpha=0.5, normalize=True) 53 | 54 | # Perform 5-fold cross-validation: ridge_cv 55 | ridge_cv = cross_val_score(ridge, X, y, cv=5) 56 | 57 | # Print the cross-validated scores 58 | print(ridge_cv) 59 | 60 | # ------------------------- 61 | 62 | # Convert '?' to NaN 63 | df[df == '?'] = np.nan 64 | 65 | # Print the number of NaNs 66 | print(df.isnull().sum()) 67 | 68 | # Print shape of original DataFrame 69 | print("Shape of Original DataFrame: {}".format(df.shape)) 70 | 71 | # Drop missing values and print shape of new DataFrame 72 | df = df.dropna() 73 | 74 | # Print shape of new DataFrame 75 | print("Shape of DataFrame After Dropping All Rows with Missing Values: {}".format(df.shape)) 76 | -------------------------------------------------------------------------------- /src/ml-supervised/ml_pipeline_with_hyperparameters.py: -------------------------------------------------------------------------------- 1 | # Bringing it all together I: 2 | # 3 | # Pipeline for classification It is time now to piece together everything you have 4 | # learned so far into a pipeline for classification! Your job in this exercise is to build a pipeline that includes 5 | # scaling and hyperparameter tuning to classify wine quality. 6 | # 7 | # You'll return to using the SVM classifier you were briefly introduced to earlier in this chapter. The 8 | # hyperparameters you will tune are C and gamma. C controls the regularization strength. It is analogous to the C you 9 | # tuned for logistic regression in Chapter 3, while gamma controls the kernel coefficient: Do not worry about this 10 | # now as it is beyond the scope of this course. 11 | # 12 | # The following modules have been pre-loaded: Pipeline, svm, train_test_split, GridSearchCV, classification_report, 13 | # accuracy_score. The feature and target variable arrays X and y have also been pre-loaded.e. 
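# Illustrative aside: inside a Pipeline, GridSearchCV reaches a step's hyperparameters through the
# '<step name>__<parameter>' convention, which is why the grid below uses 'SVM__C' and 'SVM__gamma'.
# Minimal sketch on a synthetic dataset; all names here are illustrative only.
from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

X_demo, y_demo = make_classification(n_samples=100, random_state=0)
demo_pipe = Pipeline([('scaler', StandardScaler()), ('SVM', SVC())])
demo_grid = GridSearchCV(demo_pipe, {'SVM__C': [1, 10], 'SVM__gamma': [0.1, 0.01]}, cv=3)
demo_grid.fit(X_demo, y_demo)
print(demo_grid.best_params_)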
14 | 15 | import pandas as pd 16 | from sklearn.metrics import classification_report 17 | from sklearn.model_selection import GridSearchCV 18 | from sklearn.model_selection import train_test_split 19 | from sklearn.pipeline import Pipeline 20 | from sklearn.preprocessing import StandardScaler 21 | from sklearn.svm import SVC 22 | 23 | from helper import path 24 | 25 | # Read 'white-wine.csv' into a DataFrame: df 26 | df = pd.read_csv(path + 'white-wine.csv') 27 | 28 | X = df.drop('quality', axis=1) 29 | y = df['quality'] 30 | 31 | # Setup the pipeline 32 | steps = [('scaler', StandardScaler()), 33 | ('SVM', SVC())] 34 | 35 | pipeline = Pipeline(steps) 36 | 37 | # Specify the hyperparameter space 38 | parameters = {'SVM__C': [1, 10, 100], 39 | 'SVM__gamma': [0.1, 0.01]} 40 | 41 | # Create train and test sets 42 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=21) 43 | 44 | # Instantiate the GridSearchCV object: cv 45 | cv = GridSearchCV(pipeline, param_grid=parameters) 46 | 47 | # Fit to the training set 48 | cv.fit(X_train, y_train) 49 | 50 | # Predict the labels of the test set: y_pred 51 | y_pred = cv.predict(X_test) 52 | 53 | # Compute and print metrics 54 | print("Accuracy: {}".format(cv.score(X_test, y_test))) 55 | print(classification_report(y_test, y_pred)) 56 | print("Tuned Model Parameters: {}".format(cv.best_params_)) 57 | -------------------------------------------------------------------------------- /src/ml-supervised/ml_pipelines.py: -------------------------------------------------------------------------------- 1 | # Imputing missing data in a ML Pipeline I 2 | # 3 | # As you've come to appreciate, there are many steps to building a model, 4 | # from creating training and test sets, to fitting a classifier or regressor, to tuning its parameters, to evaluating 5 | # its performance on new data. Imputation can be seen as the first step of this machine learning process, 6 | # the entirety of which can be viewed within the context of a pipeline. Scikit-learn provides a pipeline constructor 7 | # that allows you to piece together these steps into one process and thereby simplify your workflow. 8 | # 9 | # You'll now practice setting up a pipeline with two steps: the imputation step, followed by the instantiation of a 10 | # classifier. You've seen three classifiers in this course so far: k-NN, logistic regression, and the decision tree. 11 | # You will now be introduced to a fourth one - the Support Vector Machine, or SVM. For now, do not worry about how it 12 | # works under the hood. It works exactly as you would expect of the scikit-learn estimators that you have worked 13 | # with previously, in that it has the same .fit() and .predict() methods as before. 
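# Illustrative aside: the Imputer transformer used below comes from older scikit-learn releases;
# newer versions provide SimpleImputer in sklearn.impute instead. A hedged sketch of the
# equivalent pipeline under that assumption:
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

steps_modern = [('imputation', SimpleImputer(missing_values=np.nan, strategy='most_frequent')),
                ('SVM', SVC())]
pipeline_modern = Pipeline(steps_modern)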
14 | 15 | 16 | # Import the Imputer module 17 | from sklearn.preprocessing import Imputer 18 | from sklearn.svm import SVC 19 | 20 | # Setup the Imputation transformer: imp 21 | imp = Imputer(missing_values='NaN', strategy='most_frequent', axis=0) 22 | 23 | # Instantiate the SVC classifier: clf 24 | clf = SVC() 25 | 26 | # Setup the pipeline with the required steps: steps 27 | steps = [('imputation', imp), 28 | ('SVM', clf)] 29 | 30 | # -------------- 31 | # Import necessary modules 32 | import pandas as pd 33 | from sklearn.preprocessing import Imputer 34 | from sklearn.pipeline import Pipeline 35 | from sklearn.svm import SVC 36 | from sklearn.metrics import classification_report 37 | from sklearn.model_selection import train_test_split 38 | from helper import path 39 | 40 | # Read 'white-wine.csv' into a DataFrame: df 41 | df = pd.read_csv(path + 'white-wine.csv') 42 | 43 | X = df.drop('quality', axis=1) 44 | y = df['quality'] 45 | # Setup the pipeline steps: steps 46 | steps = [('imputation', Imputer(missing_values='NaN', strategy='most_frequent', axis=0)), 47 | ('SVM', SVC())] 48 | 49 | # Create the pipeline: pipeline 50 | pipeline = Pipeline(steps) 51 | 52 | # Create training and test sets 53 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42) 54 | 55 | # Fit the pipeline to the train set 56 | pipeline.fit(X_train, y_train) 57 | 58 | # Predict the labels of the test set 59 | y_pred = pipeline.predict(X_test) 60 | 61 | # Compute metrics 62 | print(classification_report(y_test, y_pred)) 63 | -------------------------------------------------------------------------------- /src/ml-supervised/py_hyperparamter_tuning_hold-out_set_with_GridSearchCV-1.py: -------------------------------------------------------------------------------- 1 | # Hold-out set in practice I: Classification 2 | # 3 | # You will now practice evaluating a model with tuned hyperparameters on a 4 | # hold-out set. The feature array and target variable array from the diabetes dataset have been pre-loaded as X and y. 5 | # 6 | # In addition to C, logistic regression has a 'penalty' hyperparameter which specifies whether to use 'l1' or 'l2' 7 | # regularization. Your job in this exercise is to create a hold-out set, tune the 'C' and 'penalty' hyperparameters 8 | # of a logistic regression classifier using GridSearchCV on the training set, and then evaluate its performance 9 | # against the hold-out set. 
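# Illustrative aside: the exercise text asks for evaluation on the hold-out set, which the script
# below stops short of; assuming logreg_cv, X_test and y_test as defined in that script, the final
# step could look like:
#     print("Hold-out Set Accuracy: {}".format(logreg_cv.score(X_test, y_test)))
# (a refit GridSearchCV scores with the best estimator found on the training data). In newer
# scikit-learn versions the 'l1' penalty also needs a compatible solver, for example
# LogisticRegression(solver='liblinear').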
10 | 11 | import numpy as np 12 | import pandas as pd 13 | from sklearn.linear_model import LogisticRegression 14 | from sklearn.model_selection import GridSearchCV 15 | from sklearn.model_selection import train_test_split 16 | 17 | from helper import path 18 | 19 | # Read the CSV file into a DataFrame: df 20 | df = pd.read_csv(path + 'diabetes.csv') 21 | 22 | # Create arrays for features and target variable 23 | X = df.drop('diabetes', axis=1) 24 | y = df['diabetes'] 25 | 26 | # Create the hyperparameter grid 27 | c_space = np.logspace(-5, 8, 15) 28 | param_grid = {'C': c_space, 'penalty': ['l1', 'l2']} 29 | 30 | # Instantiate the logistic regression classifier: logreg 31 | logreg = LogisticRegression() 32 | 33 | # Create train and test sets 34 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42) 35 | 36 | # Instantiate the GridSearchCV object: logreg_cv 37 | logreg_cv = GridSearchCV(logreg, param_grid, cv=5) 38 | 39 | # Fit it to the training data 40 | logreg_cv.fit(X_train, y_train) 41 | 42 | # Print the optimal parameters and best score 43 | print("Tuned Logistic Regression Parameter: {}".format(logreg_cv.best_params_)) 44 | print("Tuned Logistic Regression Accuracy: {}".format(logreg_cv.best_score_)) 45 | -------------------------------------------------------------------------------- /src/ml-supervised/py_hyperparamter_tuning_hold-out_set_with_GridSearchCV-2.py: -------------------------------------------------------------------------------- 1 | # Hold-out set in practice II: Regression 2 | # 3 | # Remember lasso and ridge regression from the previous chapter? Lasso used 4 | # the L1 penalty to regularize, while ridge used the L2 penalty. There is another type of regularized regression 5 | # known as the elastic net. In elastic net regularization, the penalty term is a linear combination of the L1 and L2 6 | # penalties: 7 | # 8 | # a∗L1+b∗L2 In scikit-learn, this term is represented by the 'l1_ratio' parameter: An 'l1_ratio' of 1 corresponds to 9 | # an L1 penalty, and anything lower is a combination of L1 and L2. 10 | # 11 | # In this exercise, you will GridSearchCV to tune the 'l1_ratio' of an elastic net model trained on the Gapminder 12 | # data. As in the previous exercise, use a hold-out set to evaluate your model's performance. 
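# Illustrative aside: a rough numeric picture of the a*L1 + b*L2 penalty described above, written
# in terms of scikit-learn's alpha / l1_ratio parameterization (approximate form; toy numbers only).
import numpy as np

w = np.array([0.5, -1.0, 2.0])                 # hypothetical model coefficients
alpha, l1_ratio = 1.0, 0.5
penalty = alpha * (l1_ratio * np.abs(w).sum()                 # L1 part
                   + 0.5 * (1 - l1_ratio) * (w ** 2).sum())   # L2 part
print(penalty)   # l1_ratio = 1 gives a pure L1 (lasso-style) penalty; lower values mix in L2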
13 | 14 | import numpy as np 15 | import pandas as pd 16 | from sklearn.linear_model import ElasticNet 17 | from sklearn.metrics import mean_squared_error 18 | from sklearn.model_selection import GridSearchCV 19 | from sklearn.model_selection import train_test_split 20 | 21 | from helper import path 22 | 23 | # Read the CSV file into a DataFrame: df 24 | df = pd.read_csv(path + 'diabetes.csv') 25 | 26 | # Create arrays for features and target variable 27 | X = df.drop('diabetes', axis=1) 28 | y = df['diabetes'] 29 | 30 | # Create train and test sets 31 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42) 32 | 33 | # Create the hyperparameter grid 34 | l1_space = np.linspace(0, 1, 30) 35 | param_grid = {'l1_ratio': l1_space} 36 | 37 | # Instantiate the ElasticNet regressor: elastic_net 38 | elastic_net = ElasticNet() 39 | 40 | # Setup the GridSearchCV object: gm_cv 41 | gm_cv = GridSearchCV(elastic_net, param_grid, cv=5) 42 | 43 | # Fit it to the training data 44 | gm_cv.fit(X_train, y_train) 45 | 46 | # Predict on the test set and compute metrics 47 | y_pred = gm_cv.predict(X_test) 48 | r2 = gm_cv.score(X_test, y_test) 49 | mse = mean_squared_error(y_test, y_pred) 50 | print("Tuned ElasticNet l1 ratio: {}".format(gm_cv.best_params_)) 51 | print("Tuned ElasticNet R squared: {}".format(r2)) 52 | print("Tuned ElasticNet MSE: {}".format(mse)) 53 | -------------------------------------------------------------------------------- /src/ml-supervised/py_hyperparamter_tuning_with_GridSearchCV.py: -------------------------------------------------------------------------------- 1 | # Hyperparameter tuning with GridSearchCV 2 | # 3 | # Hugo demonstrated how to use to tune the n_neighbors parameter of the 4 | # KNeighborsClassifier() using GridSearchCV on the voting dataset. You will now practice this yourself, but by using 5 | # logistic regression on the diabetes dataset instead! 6 | # 7 | # Like the alpha parameter of lasso and ridge regularization that you saw earlier, logistic regression also has a 8 | # regularization parameter: C. C controls the inverse of the regularization strength, and this is what you will tune 9 | # in this exercise. A large C can lead to an overfit model, while a small C can lead to an underfit model. 10 | # 11 | # The hyperparameter space for C has been setup for you. Your job is to use GridSearchCV and logistic regression to 12 | # find the optimal C in this hyperparameter space. The feature array is available as X and target variable array is 13 | # available as y. 14 | # 15 | # You may be wondering why you aren't asked to split the data into training and test sets. Good observation! Here, 16 | # we want you to focus on the process of setting up the hyperparameter grid and performing grid-search 17 | # cross-validation. In practice, you will indeed want to hold out a portion of your data for evaluation purposes, 18 | # and you will learn all about this in the next video! 
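# Illustrative aside: besides best_params_ and best_score_, a fitted GridSearchCV exposes the mean
# cross-validated score of every candidate in cv_results_. Assuming logreg_cv has been fit as in
# the script below, the per-C scores could be inspected with:
#     results = pd.DataFrame(logreg_cv.cv_results_)
#     print(results[['param_C', 'mean_test_score']])   # one row per value of C in c_space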
19 | 20 | import numpy as np 21 | import pandas as pd 22 | # Import necessary modules 23 | from sklearn.linear_model import LogisticRegression 24 | from sklearn.model_selection import GridSearchCV 25 | 26 | from helper import path 27 | 28 | # Read the CSV file into a DataFrame: df 29 | df = pd.read_csv(path + 'diabetes.csv') 30 | 31 | # Create arrays for features and target variable 32 | X = df.drop('diabetes', axis=1) 33 | y = df['diabetes'] 34 | 35 | # Setup the hyperparameter grid 36 | c_space = np.logspace(-5, 8, 15) 37 | param_grid = {'C': c_space} 38 | 39 | # Instantiate a logistic regression classifier: logreg 40 | logreg = LogisticRegression() 41 | 42 | # Instantiate the GridSearchCV object: logreg_cv 43 | logreg_cv = GridSearchCV(logreg, param_grid, cv=5) 44 | 45 | # Fit it to the data 46 | logreg_cv.fit(X, y) 47 | 48 | # Print the tuned parameters and score 49 | print("Tuned Logistic Regression Parameters: {}".format(logreg_cv.best_params_)) 50 | print("Best score is {}".format(logreg_cv.best_score_)) 51 | -------------------------------------------------------------------------------- /src/ml-supervised/py_hyperparamter_tuning_with_RandomizedSearchCV.py: -------------------------------------------------------------------------------- 1 | # Hyperparameter tuning with RandomizedSearchCV 2 | # 3 | # GridSearchCV can be computationally expensive, especially if you are 4 | # searching over a large hyperparameter space and dealing with multiple hyperparameters. A solution to this is to use 5 | # RandomizedSearchCV, in which not all hyperparameter values are tried out. Instead, a fixed number of 6 | # hyperparameter settings is sampled from specified probability distributions. You'll practice using 7 | # RandomizedSearchCV in this exercise and see how this works. 8 | # 9 | # Here, you'll also be introduced to a new model: the Decision Tree. Don't worry about the specifics of how this 10 | # model works. Just like k-NN, linear regression, and logistic regression, decision trees in scikit-learn have .fit() 11 | # and .predict() methods that you can use in exactly the same way as before. Decision trees have many parameters 12 | # that can be tuned, such as max_features, max_depth, and min_samples_leaf: This makes it an ideal use case for 13 | # RandomizedSearchCV. 14 | # 15 | # As before, the feature array X and target variable array y of the diabetes dataset have been pre-loaded. The 16 | # hyperparameter settings have been specified for you. Your goal is to use RandomizedSearchCV to find the optimal 17 | # hyperparameters. Go for it! 
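# Illustrative aside: "sampled from specified probability distributions" means each candidate
# setting is a random draw; scipy's randint(1, 9), for example, yields integers 1 through 8.
# Illustration only, with arbitrary values:
from scipy.stats import randint

max_features_dist = randint(1, 9)
print(max_features_dist.rvs(size=5, random_state=42))   # five random candidate values for max_features
# RandomizedSearchCV draws a fixed number of such settings, controlled by its n_iter argument
# (10 by default), instead of trying every combination as GridSearchCV does.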
18 | 19 | # Import the necessary modules 20 | import pandas as pd 21 | from scipy.stats import randint 22 | from sklearn.model_selection import RandomizedSearchCV 23 | from sklearn.tree import DecisionTreeClassifier 24 | 25 | from helper import path 26 | 27 | # Read the CSV file into a DataFrame: df 28 | df = pd.read_csv(path + 'diabetes.csv') 29 | 30 | # Create arrays for features and target variable 31 | X = df.drop('diabetes', axis=1) 32 | y = df['diabetes'] 33 | 34 | # Setup the parameters and distributions to sample from: param_dist 35 | param_dist = {"max_depth": [3, None], 36 | "max_features": randint(1, 9), 37 | "min_samples_leaf": randint(1, 9), 38 | "criterion": ["gini", "entropy"]} 39 | 40 | # Instantiate a Decision Tree classifier: tree 41 | tree = DecisionTreeClassifier() 42 | 43 | # Instantiate the RandomizedSearchCV object: tree_cv 44 | tree_cv = RandomizedSearchCV(tree, param_dist, cv=5) 45 | 46 | # Fit it to the data 47 | tree_cv.fit(X, y) 48 | 49 | # Print the tuned parameters and score 50 | print("Tuned Decision Tree Parameters: {}".format(tree_cv.best_params_)) 51 | print("Best score is {}".format(tree_cv.best_score_)) 52 | -------------------------------------------------------------------------------- /src/ml-supervised/py_knn_classifier_modal.py: -------------------------------------------------------------------------------- 1 | # 2 | # k-Nearest Neighbors: Fit Having explored the Congressional voting records dataset, it is time now to build your 3 | # first classifier. In this exercise, you will fit a k-Nearest Neighbors classifier to the voting dataset, 4 | # which has once again been pre-loaded for you into a DataFrame df. 5 | # 6 | # In the video, Hugo discussed the importance of ensuring your data adheres to the format required by the 7 | # scikit-learn API. The features need to be in an array where each column is a feature and each row a different 8 | # observation or data point - in this case, a Congressman's voting record. The target needs to be a single column 9 | # with the same number of observations as the feature data. We have done this for you in this exercise. Notice we 10 | # named the feature array X and response variable y: This is in accordance with the common scikit-learn practice. 11 | # 12 | # Your job is to create an instance of a k-NN classifier with 6 neighbors (by specifying the n_neighbors parameter) 13 | # and then fit it to the data. The data has been pre-loaded into a DataFrame called df. # 14 | 15 | import pandas as pd 16 | from sklearn.neighbors import KNeighborsClassifier 17 | 18 | from helper import path 19 | 20 | # this dataset won't work. Can't run this program. 
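# Illustrative aside (hypothetical fix, not part of the original exercise): the raw CSV stores the
# votes as the strings 'y', 'n' and '?', which KNeighborsClassifier cannot fit directly, which is
# why the note above says the script won't run. One way to make it runnable would be to encode the
# features numerically first, along the lines of:
#     df = df.replace({'y': 1, 'n': 0, '?': np.nan}).dropna()
# (with numpy imported as np) before building X and y.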
21 | file = 'house-votes-84.csv' 22 | 23 | df = pd.read_csv(path + file) 24 | 25 | # Explore Data 26 | print(df.describe()) 27 | 28 | # Create arrays for the features(X) and the response variable/target(y) 29 | X = df.drop('party', axis=1).values 30 | y = df['party'].values 31 | 32 | # Create a k-NN classifier with 6 neighbors 33 | knn = KNeighborsClassifier(n_neighbors=60) 34 | 35 | # Fit the classifier to the data 36 | knn.fit(X, y) 37 | 38 | # Predict the labels for the training data X 39 | y_pred = knn.predict(X) 40 | 41 | # This is our prediction based of knn-classifier - Prediction: ['democrat'] 42 | print(y_pred) 43 | -------------------------------------------------------------------------------- /src/ml-supervised/py_knn_classifier_modal_train_test.py: -------------------------------------------------------------------------------- 1 | # Import necessary modules 2 | import matplotlib.pyplot as plt 3 | import numpy as np 4 | from sklearn import datasets 5 | from sklearn.model_selection import train_test_split 6 | from sklearn.neighbors import KNeighborsClassifier 7 | 8 | # Load the digits dataset: digits 9 | digits = datasets.load_digits() 10 | 11 | # Create feature and target arrays 12 | X = digits.data 13 | y = digits.target 14 | 15 | # Split into training and test set 16 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y) 17 | 18 | # Create a k-NN classifier with 7 neighbors: knn 19 | knn = KNeighborsClassifier(n_neighbors=7) 20 | 21 | # Fit the classifier to the training data 22 | knn.fit(X_train, y_train) 23 | 24 | # Print the accuracy 25 | print(knn.score(X_test, y_test)) 26 | 27 | # -------------------------# 28 | 29 | # Test model accuracy on Training data and Test data and plot a graph 30 | 31 | # Setup arrays to store train and test accuracies 32 | neighbors = np.arange(1, 9) 33 | train_accuracy = np.empty(len(neighbors)) 34 | test_accuracy = np.empty(len(neighbors)) 35 | 36 | # Loop over different values of k 37 | for i, k in enumerate(neighbors): 38 | # Setup a k-NN Classifier with k neighbors: knn 39 | knn = KNeighborsClassifier(n_neighbors=k) 40 | 41 | # Fit the classifier to the training data 42 | knn.fit(X_train, y_train) 43 | 44 | # Compute accuracy on the training set 45 | train_accuracy[i] = knn.score(X_train, y_train) 46 | 47 | # Compute accuracy on the testing set 48 | test_accuracy[i] = knn.score(X_test, y_test) 49 | 50 | # Generate plot 51 | plt.title('k-NN: Varying Number of Neighbors') 52 | plt.plot(neighbors, test_accuracy, label='Testing Accuracy') 53 | plt.plot(neighbors, train_accuracy, label='Training Accuracy') 54 | plt.legend() 55 | plt.xlabel('Number of Neighbors') 56 | plt.ylabel('Accuracy') 57 | plt.show() 58 | -------------------------------------------------------------------------------- /src/ml-supervised/py_knn_classifiers_performance_metrics.py: -------------------------------------------------------------------------------- 1 | # Metrics for classification 2 | # 3 | # In Chapter 1, you evaluated the performance of your k-NN classifier based on its 4 | # accuracy. However, as Andy discussed, accuracy is not always an informative metric. In this exercise, you will dive 5 | # more deeply into evaluating the performance of binary classifiers by computing a confusion matrix and generating a 6 | # classification report. 7 | # 8 | # You may have noticed in the video that the classification report consisted of three rows, and an additional support 9 | # column. 
The support gives the number of samples of the true response that lie in that class - so in the video 10 | # example, the support was the number of Republicans or Democrats in the test set on which the classification report 11 | # was computed. The precision, recall, and f1-score columns, then, gave the respective metrics for that particular 12 | # class. 13 | # 14 | # Here, you'll work with the PIMA Indians dataset obtained from the UCI Machine Learning Repository. The goal is to 15 | # predict whether or not a given female patient will contract diabetes based on features such as BMI, age, 16 | # and number of pregnancies. Therefore, it is a binary classification problem. A target value of 0 indicates that the 17 | # patient does not have diabetes, while a value of 1 indicates that the patient does have diabetes. As in Chapters 1 18 | # and 2, the dataset has been preprocessed to deal with missing values. 19 | # 20 | # The dataset has been loaded into a DataFrame df and the feature and target variable arrays X and y have been 21 | # created for you. In addition, sklearn.model_selection.train_test_split and sklearn.neighbors.KNeighborsClassifier 22 | # have already been imported. 23 | # 24 | # Your job is to train a k-NN classifier to the data and evaluate its performance by generating a confusion matrix 25 | # and classification report. 26 | 27 | 28 | # Import numpy and pandas 29 | import pandas as pd 30 | from sklearn.model_selection import train_test_split 31 | # Import necessary modules 32 | from sklearn.neighbors import KNeighborsClassifier 33 | 34 | from helper import path 35 | 36 | # Read the CSV file into a DataFrame: df 37 | df = pd.read_csv(path + 'diabetes.csv') 38 | 39 | # Create arrays for features and target variable 40 | X = df['age'].values 41 | y = df['diabetes'].values 42 | 43 | # Reshape X and y 44 | X = X.reshape(-1, 1) 45 | y = y.reshape(-1, 1) 46 | 47 | # Import necessary modules 48 | from sklearn.metrics import classification_report 49 | from sklearn.metrics import confusion_matrix 50 | 51 | # Create training and test set 52 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42) 53 | 54 | # Instantiate a k-NN classifier: knn 55 | knn = KNeighborsClassifier(n_neighbors=6) 56 | 57 | # Fit the classifier to the training data 58 | knn.fit(X_train, y_train) 59 | 60 | # Predict the labels of the test data: y_pred 61 | y_pred = knn.predict(X_test) 62 | 63 | # Generate the confusion matrix and classification report 64 | print(confusion_matrix(y_test, y_pred)) 65 | print(classification_report(y_test, y_pred)) 66 | -------------------------------------------------------------------------------- /src/ml-supervised/py_lasso_regularized_linear_regression.py: -------------------------------------------------------------------------------- 1 | # What is Lasso Regression? 2 | # 3 | # http://www.statisticshowto.com/lasso-regression/ 4 | # 5 | # Lasso regression is a type of linear 6 | # regression that uses shrinkage. Shrinkage is where data values are shrunk towards a central point, like the mean. 7 | # The lasso procedure encourages simple, sparse models (i.e. models with fewer parameters). This particular type of 8 | # regression is well-suited for models showing high levels of muticollinearity or when you want to automate certain 9 | # parts of model selection, like variable selection/parameter elimination. 10 | # 11 | # The acronym “LASSO” stands for Least Absolute Shrinkage and Selection Operator. 
Lasso regression is what is called 12 | # the Penalized regression method, often used in machine learning to select the subset of variables. It is a 13 | # supervised machine learning method. Specifically, LASSO is a Shrinkage and Variable Selection method for linear 14 | # regression models. LASSO, is actually an acronym for Least Absolute Selection and Shrinkage Operator. 15 | # 16 | # 0:34 17 | # 18 | # The LASSO imposes a constraint on the sum of the absolute values of the model parameters, where the sum has a 19 | # specified constant as an upper bound. This constraint causes regression coefficients for some variables to shrink 20 | # towards zero. This is the shrinkage process. The shrinkage process allows for better interpretation of the model 21 | # and identifies the variables most strongly associated with the target corresponds variable. That is the variable 22 | # selection process. It goes to obtain the subset of predictors that minimizes prediction error. 23 | # 24 | # So why use Lasso instead of just using ordinary least squares multiple regression? 25 | # 26 | # Well, first, it can provide greater prediction accuracy. If the true relationship between the response variable and 27 | # the predictors is approximately linear and you have a large number of observations, then OLS regression parameter 28 | # estimates will have low bias and low variance. However, if you have a relatively small number of observations and a 29 | # large number of predictors, then the variance of the OLS perimeter estimates will be higher. In this case, 30 | # Lasso Regression is useful because shrinking the regression coefficient can reduce variance without a substantial 31 | # increase in bias. 1:43 Second, Lasso Regression can increase model interpretability. Often times, at least some of 32 | # the explanatory variables in an OLS multiple regression analysis are not really associated with the response 33 | # variable. As a result, we often end up with a model that's over fitted and more difficult to interpret. With Lasso 34 | # Regression, the regression coefficients for unimportant variables are reduced to zero which effectively removes 35 | # them from the model and produces a simpler model that selects only the most important predictors. In Lasso 36 | # Regression, a tuning parameter called lambda is applied to the regression model to control the strength of the 37 | # penalty. As lambda increases, more coefficients are reduced to zero that is fewer predictors are selected and there 38 | # is more shrinkage of the non-zero coefficient. With Lasso Regression where lambda is equal to zero then we have an 39 | # OLS regression analysis. Bias increases and variance decreases as lambda increases. To demonstrate how lasso 40 | # regression works, let's use and example from the ad help data set in which our goal is to identify a set of 41 | # variables that best predicts the extent to which students feel connected to their school. We will use the same 42 | # ad-health data set that we used for the decision tree in random forced machine learning applications. The response 43 | # or target variable is a quantitative variable that measures school connectedness. The response values range from 6 44 | # to 38, where higher values indicate a greater connection with the school. There are a total of 23 Categorical and 45 | # Quantitative predictor variables. 
This is a pretty large number of predictor variables, so using OLS multiple 46 | # regression analysis would not be ideal, particularly if the goal is to identify a smaller subset of these 47 | # predictors that most accurately predicts school connectedness. Categorical predictors include gender and race and 48 | # ethnicity. Although Lasso Regression models can handle categorical variables with more than two levels In 49 | # conducting my data management, I created a series of five binary categorical variables for race and ethnicity, 50 | # Hispanic, White, Black, Native American, and Asian. I did this to improve interpratability of the selected model. 51 | # Binary substitutes variables for measure with individual questions of about whether the adolescent had ever used 52 | # alcohol, marijuana, cocaine, or inhalants. Additional categorical variables include the availability of cigarettes 53 | # in the home, whether or not either parent was on public assistance, and any experience with being expelled from 54 | # school. Finally, quantitative predictive variables include age, alcohol problems, and a measure of deviance. That 55 | # includes such behaviors as vandalism, other property damage, lying, stealing, running away, 56 | 57 | import matplotlib.pyplot as plt 58 | # Import the necessary modules 59 | import pandas as pd 60 | # Import Lasso 61 | from sklearn.linear_model import Lasso 62 | 63 | from helper import path 64 | 65 | # Read the CSV file into a DataFrame: df 66 | df = pd.read_csv(path + 'gm_2008_region.csv') 67 | print(df.info()) 68 | print(df.describe()) 69 | print(df.head()) 70 | 71 | # Create arrays for features and target variable 72 | X = df['population'].values 73 | y = df['life'].values 74 | 75 | # Reshape X and y 76 | X = X.reshape(-1, 1) 77 | # y = y.reshape(-1, 1) 78 | 79 | # Instantiate a lasso regressor: lasso 80 | lasso = Lasso(alpha=0.4, normalize=True) 81 | 82 | # Fit the regressor to the data 83 | lasso.fit(X, y) 84 | 85 | # Compute and print the coefficients 86 | lasso_coef = lasso.coef_ 87 | print(lasso_coef) 88 | 89 | df_columns = df.keys() 90 | print(df_columns) 91 | 92 | # Plot the coefficients 93 | plt.plot(range(len(df_columns)), lasso_coef) 94 | plt.xticks(range(len(df_columns)), df_columns.values, rotation=60) 95 | plt.margins(0.02) 96 | plt.show() 97 | -------------------------------------------------------------------------------- /src/ml-supervised/py_linear_regression_modal.py: -------------------------------------------------------------------------------- 1 | # Fit & predict for regression 2 | # 3 | # Now, you will fit a linear regression and predict life expectancy using just one 4 | # feature. You saw Andy do this earlier using the 'RM' feature of the Boston housing dataset. In this exercise, 5 | # you will use the 'fertility' feature of the Gapminder dataset. Since the goal is to predict life expectancy, 6 | # the target variable here is 'life'. The array for the target variable has been pre-loaded as y and the array for 7 | # 'fertility' has been pre-loaded as X_fertility. 8 | # 9 | # A scatter plot with 'fertility' on the x-axis and 'life' on the y-axis has been generated. As you can see, 10 | # there is a strongly negative correlation, so a linear regression should be able to capture this trend. Your job is 11 | # to fit a linear regression and then predict the life expectancy, overlaying these predicted values on the plot to 12 | # generate a regression line. You will also compute and print the R2 score using sckit-learn's .score() method. 
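# Illustrative aside: a fitted LinearRegression is just a slope and an intercept, which is what the
# overlaid regression line draws. Toy sketch with made-up data, not the Gapminder values:
import numpy as np
from sklearn.linear_model import LinearRegression

x_toy = np.array([[1.0], [2.0], [3.0], [4.0]])
y_toy = np.array([[5.0], [4.1], [2.9], [2.0]])        # roughly y = -x + 6, a negative correlation
toy_reg = LinearRegression().fit(x_toy, y_toy)
print(toy_reg.coef_[0][0], toy_reg.intercept_[0])     # slope close to -1, intercept close to 6
print(toy_reg.score(x_toy, y_toy))                    # R^2 near 1 for a nearly linear relationship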
13 | 14 | # Import numpy and pandas 15 | import numpy as np 16 | import pandas as pd 17 | from matplotlib import pyplot as plt 18 | 19 | # Import LinearRegression 20 | from sklearn.linear_model import LinearRegression 21 | 22 | from helper import path 23 | 24 | # Read the CSV file into a DataFrame: df 25 | df = pd.read_csv(path + 'gm_2008_region.csv') 26 | 27 | # Create arrays for features and target variable 28 | X_fertility = df['fertility'].values 29 | y = df['life'].values 30 | 31 | # Print the dimensions of X and y before reshaping 32 | print("Dimensions of y before reshaping: {}".format(y.shape)) 33 | print("Dimensions of X before reshaping: {}".format(X_fertility.shape)) 34 | 35 | # Reshape X and y 36 | y = y.reshape(-1, 1) 37 | X_fertility = X_fertility.reshape(-1, 1) 38 | 39 | # Print the dimensions of X and y after reshaping 40 | print("Dimensions of y after reshaping: {}".format(y.shape)) 41 | print("Dimensions of X after reshaping: {}".format(X_fertility.shape)) 42 | 43 | # Create the regressor: reg 44 | reg = LinearRegression() 45 | 46 | # Create the prediction space 47 | prediction_space = np.linspace(min(X_fertility), max(X_fertility)).reshape(-1, 1) 48 | 49 | # Fit the model to the data 50 | reg.fit(X_fertility, y) 51 | 52 | # Compute predictions over the prediction space: y_pred 53 | y_pred = reg.predict(prediction_space) 54 | 55 | # Print R^2 56 | print(reg.score(X_fertility, y)) 57 | 58 | # Plot regression line over original scatter plot 59 | plt.scatter(X_fertility, y, color='blue') 60 | plt.plot(prediction_space, y_pred, color='black', linewidth=3) 61 | plt.show() 62 | -------------------------------------------------------------------------------- /src/ml-supervised/py_linear_regression_modal_train_test.py: -------------------------------------------------------------------------------- 1 | # Fit & predict for regression 2 | # Import numpy and pandas 3 | import numpy as np 4 | import pandas as pd 5 | # Import necessary modules 6 | from sklearn.linear_model import LinearRegression 7 | from sklearn.metrics import mean_squared_error 8 | from sklearn.model_selection import train_test_split 9 | 10 | from helper import path 11 | 12 | # Read the CSV file into a DataFrame: df 13 | df = pd.read_csv(path + 'gm_2008_region.csv') 14 | 15 | # Create arrays for features and target variable 16 | X_fertility = df['fertility'].values 17 | y = df['life'].values 18 | 19 | # Reshape X and y 20 | X_fertility = X_fertility.reshape(-1, 1) 21 | y = y.reshape(-1, 1) 22 | 23 | # Create training and test sets 24 | X_train, X_test, y_train, y_test = train_test_split(X_fertility, y, test_size=0.3, random_state=42) 25 | 26 | # Create the regressor: reg_all 27 | reg_all = LinearRegression() 28 | 29 | # Fit the regressor to the training data 30 | reg_all.fit(X_train, y_train) 31 | 32 | # Predict on the test data: y_pred 33 | y_pred = reg_all.predict(X_test) 34 | 35 | # Compute and print R^2 and RMSE 36 | print("R^2: {}".format(reg_all.score(X_test, y_test))) 37 | rmse = np.sqrt(mean_squared_error(y_test, y_pred)) 38 | print("Root Mean Squared Error: {}".format(rmse)) 39 | -------------------------------------------------------------------------------- /src/ml-supervised/py_logistic_regression_modal.py: -------------------------------------------------------------------------------- 1 | # Building a logistic regression model 2 | # 3 | # Time to build your first logistic regression model! 
As Hugo showed in the 4 | # video, scikit-learn makes it very easy to try different models, since the Train-Test-Split/Instantiate/Fit/Predict 5 | # paradigm applies to all classifiers and regressors - which are known in scikit-learn as 'estimators'. You'll see 6 | # this now for yourself as you train a logistic regression model on exactly the same data as in the previous 7 | # exercise. Will it outperform k-NN? There's only one way to find out! 8 | # 9 | # The feature and target variable arrays X and y have been pre-loaded, and train_test_split has been imported for you 10 | # from sklearn.model_selection. 11 | 12 | # Import the necessary modules 13 | import pandas as pd 14 | from sklearn.linear_model import LogisticRegression 15 | from sklearn.metrics import confusion_matrix, classification_report 16 | from sklearn.model_selection import train_test_split 17 | 18 | from helper import path 19 | 20 | # Read the CSV file into a DataFrame: df 21 | df = pd.read_csv(path + 'diabetes.csv') 22 | 23 | # Create arrays for features and target variable 24 | X = df['age'].values 25 | y = df['diabetes'].values 26 | 27 | # Reshape X and y 28 | X = X.reshape(-1, 1) 29 | y = y.reshape(-1, 1) 30 | 31 | # Create training and test sets 32 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42) 33 | 34 | # Create the classifier: logreg 35 | logreg = LogisticRegression() 36 | 37 | # Fit the classifier to the training data 38 | logreg.fit(X_train, y_train) 39 | 40 | # Predict the labels of the test set: y_pred 41 | y_pred = logreg.predict(X_test) 42 | 43 | # Compute and print the confusion matrix and classification report 44 | print(confusion_matrix(y_test, y_pred)) 45 | print(classification_report(y_test, y_pred)) 46 | 47 | # ------------------------- 48 | 49 | # Plotting an ROC curve 50 | # 51 | # Classification reports and confusion matrices are great methods to quantitatively evaluate model performance, 52 | # while ROC curves provide a way to visually evaluate models. As Hugo demonstrated in the video, most classifiers in 53 | # scikit-learn have a .predict_proba() method which returns the probability of a given sample being in a particular 54 | # class. Having built a logistic regression model, you'll now evaluate its performance by plotting an ROC curve. In 55 | # doing so, you'll make use of the .predict_proba() method and become familiar with its functionality. 56 | # 57 | # Here, you'll continue working with the PIMA Indians diabetes dataset. 
The classifier has already been fit to the 58 | # training data and is available as logreg 59 | 60 | # Import necessary modules 61 | from sklearn.metrics import roc_curve 62 | from matplotlib import pyplot as plt 63 | 64 | # Compute predicted probabilities: y_pred_prob 65 | y_pred_prob = logreg.predict_proba(X_test)[:, 1] 66 | 67 | # Generate ROC curve values: fpr, tpr, thresholds 68 | fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob) 69 | 70 | # Plot ROC curve 71 | plt.plot([0, 1], [0, 1], 'k--') 72 | plt.plot(fpr, tpr) 73 | plt.xlabel('False Positive Rate') 74 | plt.ylabel('True Positive Rate') 75 | plt.title('ROC Curve') 76 | plt.show() 77 | 78 | # ------------------------- 79 | 80 | # Calculating ROC AUC score 81 | # Larger area under ROC curve = better model 82 | 83 | from sklearn.metrics import roc_auc_score 84 | 85 | print(roc_auc_score(y_test, y_pred_prob)) 86 | -------------------------------------------------------------------------------- /src/ml-supervised/py_ridge_regularized_linear_regression.py: -------------------------------------------------------------------------------- 1 | # Regularization II: Ridge 2 | # 3 | # Lasso is great for feature selection, but when building regression models, 4 | # Ridge regression should be your first choice. 5 | # 6 | # Recall that lasso performs regularization by adding to the loss function a penalty term of the absolute value of 7 | # each coefficient multiplied by some alpha. This is also known as L1 regularization because the regularization term 8 | # is the L1 norm of the coefficients. This is not the only way to regularize, however. 9 | # 10 | # If instead you took the sum of the squared values of the coefficients multiplied by some alpha - like in Ridge 11 | # regression - you would be computing the L2 norm. 
In this exercise, you will practice fitting ridge regression 12 | # models over a range of different alphas, and plot cross-validated R2 scores for each, using this function that we 13 | # have defined for you, which plots the R2 score as well as standard error for each alpha: 14 | 15 | 16 | def display_plot(cv_scores, cv_scores_std): 17 | fig = plt.figure() 18 | ax = fig.add_subplot(1, 1, 1) 19 | ax.plot(alpha_space, cv_scores) 20 | 21 | std_error = cv_scores_std / np.sqrt(10) 22 | 23 | ax.fill_between(alpha_space, cv_scores + std_error, cv_scores - std_error, alpha=0.2) 24 | ax.set_ylabel('CV Score +/- Std Error') 25 | ax.set_xlabel('Alpha') 26 | ax.axhline(np.max(cv_scores), linestyle='--', color='.5') 27 | ax.set_xlim([alpha_space[0], alpha_space[-1]]) 28 | ax.set_xscale('log') 29 | plt.show() 30 | 31 | 32 | import matplotlib.pyplot as plt 33 | import numpy as np 34 | import pandas as pd 35 | from sklearn.linear_model import Ridge 36 | from sklearn.model_selection import cross_val_score 37 | 38 | from helper import path 39 | 40 | # Read the CSV file into a DataFrame: df 41 | df = pd.read_csv(path + 'gm_2008_region.csv') 42 | print(df.info()) 43 | print(df.describe()) 44 | print(df.head()) 45 | 46 | # Create arrays for features and target variable 47 | X = df['population'].values 48 | y = df['life'].values 49 | 50 | # Reshape X and y 51 | X = X.reshape(-1, 1) 52 | # y = y.reshape(-1, 1) 53 | 54 | # Setup the array of alphas and lists to store scores 55 | alpha_space = np.logspace(-4, 0, 50) 56 | ridge_scores = [] 57 | ridge_scores_std = [] 58 | 59 | # Create a ridge regressor: ridge 60 | ridge = Ridge(normalize=True) 61 | 62 | # Compute scores over range of alphas 63 | for alpha in alpha_space: 64 | # Specify the alpha value to use: ridge.alpha 65 | ridge.alpha = alpha 66 | 67 | # Perform 10-fold CV: ridge_cv_scores 68 | ridge_cv_scores = cross_val_score(ridge, X, y, cv=10) 69 | 70 | # Append the mean of ridge_cv_scores to ridge_scores 71 | ridge_scores.append(np.mean(ridge_cv_scores)) 72 | 73 | # Append the std of ridge_cv_scores to ridge_scores_std 74 | ridge_scores_std.append(np.std(ridge_cv_scores)) 75 | 76 | # Display the plot 77 | display_plot(ridge_scores, ridge_scores_std) 78 | -------------------------------------------------------------------------------- /src/ml-supervised/py_sklearn_digits_dataset.py: -------------------------------------------------------------------------------- 1 | # The digits recognition dataset Up until now, you have been performing binary classification, since the target 2 | # variable had two possible outcomes. Hugo, however, got to perform multi-class classification in the videos, 3 | # where the target variable could take on three possible outcomes. Why does he get to have all the fun?! In the 4 | # following exercises, you'll be working with the MNIST digits recognition dataset, which has 10 classes, the digits 5 | # 0 through 9! A reduced version of the MNIST dataset is one of scikit-learn's included datasets, and that is the one 6 | # we will use in this exercise. 7 | # 8 | # Each sample in this scikit-learn dataset is an 8x8 image representing a handwritten digit. Each pixel is 9 | # represented by an integer in the range 0 to 16, indicating varying levels of black. Recall that scikit-learn's 10 | # built-in datasets are of type Bunch, which are dictionary-like objects. Helpfully for the MNIST dataset, 11 | # scikit-learn provides an 'images' key in addition to the 'data' and 'target' keys that you have seen with the Iris 12 | # data. 
Because it is a 2D array of the images corresponding to each sample, this 'images' key is useful for 13 | # visualizing the images, as you'll see in this exercise (for more on plotting 2D arrays, see Chapter 2 of DataCamp's 14 | # course on Data Visualization with Python). On the other hand, the 'data' key contains the feature array - that is, 15 | # the images as a flattened array of 64 pixels. 16 | # 17 | # Notice that you can access the keys of these Bunch objects in two different ways: By using the . notation, 18 | # as in digits.images, or the [] notation, as in digits['images']. 19 | # 20 | # For more on the MNIST data, check out this exercise in Part 1 of DataCamp's Importing Data in Python course. There, 21 | # the full version of the MNIST dataset is used, in which the images are 28x28. It is a famous dataset in machine 22 | # learning and computer vision, and frequently used as a benchmark to evaluate the performance of a new model. 23 | 24 | import matplotlib.pyplot as plt 25 | # Import necessary modules 26 | from sklearn import datasets 27 | 28 | # Load the digits dataset: digits 29 | digits = datasets.load_digits() 30 | 31 | # Print the keys and DESCR of the dataset 32 | print(digits.keys()) 33 | print(digits.DESCR) 34 | 35 | # Print the shape of the images and data keys 36 | print(digits.images.shape) 37 | print(digits.data.shape) 38 | 39 | # Display digit 1010 40 | plt.imshow(digits.images[1010], cmap=plt.cm.gray_r, interpolation='nearest') 41 | plt.show() -------------------------------------------------------------------------------- /src/ml-unsupervised/01-clustering-for-dataset-exploration/01-how-many-clusters.py: -------------------------------------------------------------------------------- 1 | """ 2 | How many clusters? 3 | 4 | You are given an array points of size 300x2, where each row gives the (x, y) co-ordinates of a point on a map. Make a 5 | scatter plot of these points, and use the scatter plot to guess how many clusters there are. 6 | 7 | matplotlib.pyplot has already been imported as plt. In the IPython Shell: 8 | 9 | Create an array called xs that contains the values of points[:,0] - that is, column 0 of points. 10 | Create an array called ys that contains the values of points[:,1] - that is, column 1 of points. 11 | Make a scatter plot by passing xs and ys to the plt.scatter() function. 12 | Call the plt.show() function to show your plot. 13 | How many clusters do you see? 14 | """ 15 | import matplotlib.pyplot as plt 16 | import numpy as np 17 | 18 | from helper import points 19 | 20 | print(type(points)) 21 | points = np.array(points) 22 | 23 | xs = points[:, 0] 24 | 25 | ys = points[:, 1] 26 | 27 | plt.scatter(xs, ys, alpha=0.5) 28 | 29 | plt.show() 30 | -------------------------------------------------------------------------------- /src/ml-unsupervised/01-clustering-for-dataset-exploration/02-clustering-2d-points.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Clustering 2D points 3 | 4 | From the scatter plot of the previous exercise, you saw that the points seem to separate into 3 clusters. You'll now 5 | create a KMeans model to find 3 clusters, and fit it to the data points from the previous exercise. After the model 6 | has been fit, you'll obtain the cluster labels for some new points using the .predict() method. 7 | 8 | You are given the array points from the previous exercise, and also an array new_points. 
9 | 10 | INSTRUCTIONS 11 | 100XP 12 | INSTRUCTIONS 13 | 100XP 14 | Import KMeans from sklearn.cluster. 15 | Using KMeans(), create a KMeans instance called model to find 3 clusters. To specify the number of clusters, use the n_clusters keyword argument. 16 | Use the .fit() method of model to fit the model to the array of points points. 17 | Use the .predict() method of model to predict the cluster labels of new_points, assigning the result to labels. 18 | Hit 'Submit Answer' to see the cluster labels of new_points. 19 | ''' 20 | import numpy as np 21 | # Import KMeans 22 | from sklearn.cluster import KMeans 23 | 24 | from helper import points, new_points, smart_print 25 | 26 | # Convert to np-arrays 27 | points = np.array(points) 28 | new_points = np.array(new_points) 29 | 30 | # Create a KMeans instance with 3 clusters: model 31 | model = KMeans(n_clusters=3) 32 | 33 | # Fit model to points 34 | model.fit(points) 35 | 36 | # Determine the cluster labels of new_points: labels 37 | labels = model.predict(new_points) 38 | 39 | # Print cluster labels of new_points 40 | smart_print(labels) 41 | -------------------------------------------------------------------------------- /src/ml-unsupervised/01-clustering-for-dataset-exploration/03-inspect-your-clustering.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Inspect your clustering 3 | 4 | Let's now inspect the clustering you performed in the previous exercise! 5 | 6 | A solution to the previous exercise has already run, so new_points is an array of points and labels is the array of their cluster labels. 7 | 8 | INSTRUCTIONS 9 | 100XP 10 | Import matplotlib.pyplot as plt. 11 | Assign column 0 of new_points to xs, and column 1 of new_points to ys. 12 | Make a scatter plot of xs and ys, specifying the c=labels keyword arguments to color the points by their cluster label. Also specify alpha=0.5. 13 | Compute the coordinates of the centroids using the .cluster_centers_ attribute of model. 14 | Assign column 0 of centroids to centroids_x, and column 1 of centroids to centroids_y. 15 | Make a scatter plot of centroids_x and centroids_y, using 'D' (a diamond) as a marker by specifying the marker parameter. Set the size of the markers to be 50 using s=50. 
16 | ''' 17 | # Import pyplot 18 | import matplotlib.pyplot as plt 19 | from numpy import array 20 | from sklearn.cluster import KMeans 21 | 22 | from helper import points, new_points 23 | 24 | # Convert to np-arrays 25 | points = array(points) 26 | new_points = array(new_points) 27 | 28 | # Create a KMeans instance with 3 clusters: model 29 | model = KMeans(n_clusters=3) 30 | 31 | # Fit model to points 32 | model.fit(points) 33 | 34 | # Determine the cluster labels of new_points: labels 35 | labels = model.predict(new_points) 36 | # Assign the columns of new_points: xs and ys 37 | xs = new_points[:, 0] 38 | ys = new_points[:, 1] 39 | 40 | # Make a scatter plot of xs and ys, using labels to define the colors 41 | plt.scatter(xs, ys, alpha=0.5, c=labels) 42 | 43 | # Assign the cluster centers: centroids 44 | centroids = model.cluster_centers_ 45 | 46 | # Assign the columns of centroids: centroids_x, centroids_y 47 | centroids_x = centroids[:, 0] 48 | centroids_y = centroids[:, 1] 49 | 50 | # Make a scatter plot of centroids_x and centroids_y 51 | plt.scatter(centroids_x, centroids_y, marker='D', s=50) 52 | plt.show() 53 | -------------------------------------------------------------------------------- /src/ml-unsupervised/01-clustering-for-dataset-exploration/04-how-many-clusters-of-grain.py: -------------------------------------------------------------------------------- 1 | ''' 2 | How many clusters of grain? 3 | 4 | In the video, you learned how to choose a good number of clusters for a dataset using the k-means inertia graph. You 5 | are given an array samples containing the measurements (such as area, perimeter, length, and several others) of 6 | samples of grain. What's a good number of clusters in this case? 7 | 8 | KMeans and PyPlot (plt) have already been imported for you. 9 | 10 | This dataset was sourced from the UCI Machine Learning Repository. 11 | 12 | INSTRUCTIONS 13 | 100XP 14 | For each of the given values of k, perform the following steps: 15 | Create a KMeans instance called model with k clusters. 16 | Fit the model to the grain data samples. 17 | Append the value of the inertia_ attribute of model to the list inertias. 18 | The code to plot ks vs inertias has been written for you, so hit 'Submit Answer' to see the plot! 19 | ''' 20 | # Import pyplot 21 | import matplotlib.pyplot as plt 22 | import numpy as np 23 | from sklearn.cluster import KMeans 24 | 25 | from helper import points 26 | 27 | samples = np.array(points) 28 | 29 | ks = range(1, 6) 30 | inertias = [] 31 | 32 | for k in ks: 33 | # Create a KMeans instance with k clusters: model 34 | model = KMeans(n_clusters=k) 35 | 36 | # Fit model to samples 37 | model.fit(samples) 38 | 39 | # Append the inertia to the list of inertias 40 | inertias.append(model.inertia_) 41 | 42 | # Plot ks vs inertias 43 | plt.plot(ks, inertias, '-o') 44 | plt.xlabel('number of clusters, k') 45 | plt.ylabel('inertia') 46 | plt.xticks(ks) 47 | plt.show() 48 | -------------------------------------------------------------------------------- /src/ml-unsupervised/01-clustering-for-dataset-exploration/05-evaluating-the-grain-clustering.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Evaluating the grain clustering 3 | 4 | In the previous exercise, you observed from the inertia plot that 3 is a good number of clusters for the grain data. 5 | In fact, the grain samples come from a mix of 3 different grain varieties: "Kama", "Rosa" and "Canadian". 
In this 6 | exercise, cluster the grain samples into three clusters, and compare the clusters to the grain varieties using a 7 | cross-tabulation. 8 | 9 | You have the array samples of grain samples, and a list varieties giving the grain variety for each sample. Pandas ( 10 | pd) and KMeans have already been imported for you. 11 | 12 | INSTRUCTIONS 100XP Create a KMeans model called model with 3 clusters. Use the .fit_predict() method of model to fit 13 | it to samples and derive the cluster labels. Using .fit_predict() is the same as using .fit() followed by .predict(). 14 | Create a DataFrame df with two columns named 'labels' and 'varieties', using labels and varieties, respectively, 15 | for the column values. This has been done for you. Use the pd.crosstab() function on df['labels'] and df['varieties'] 16 | to count the number of times each grain variety coincides with each cluster label. Assign the result to ct. Hit 17 | 'Submit Answer' to see the cross-tabulation! ''' 18 | 19 | # Import pyplot 20 | import numpy as np 21 | import pandas as pd 22 | from sklearn.cluster import KMeans 23 | 24 | from helper import points 25 | 26 | samples = np.array(points) 27 | 28 | # Create a KMeans model with 3 clusters: model 29 | model = KMeans(n_clusters=3) 30 | 31 | # Use fit_predict to fit model and obtain cluster labels: labels 32 | labels = model.fit_predict(samples) 33 | 34 | # Create a DataFrame with labels and varieties as columns: df 35 | df = pd.DataFrame({'labels': labels, 'varieties': varieties}) 36 | 37 | # Create crosstab: ct 38 | ct = pd.crosstab(df['labels'], df['varieties']) 39 | 40 | # Display ct 41 | print(ct) 42 | -------------------------------------------------------------------------------- /src/ml-unsupervised/01-clustering-for-dataset-exploration/06-07-scaling-&-clustering-the-fish-data.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Scaling fish data for clustering 3 | 4 | You are given an array samples giving measurements of fish. Each row represents an individual fish. The measurements, such as weight in grams, length in centimeters, and the percentage ratio of height to length, have very different scales. In order to cluster this data effectively, you'll need to standardize these features first. In this exercise, you'll build a pipeline to standardize and cluster the data. 5 | 6 | These fish measurement data were sourced from the Journal of Statistics Education. 7 | 8 | INSTRUCTIONS 9 | 100XP 10 | INSTRUCTIONS 11 | 100XP 12 | Import: 13 | make_pipeline from sklearn.pipeline. 14 | StandardScaler from sklearn.preprocessing. 15 | KMeans from sklearn.cluster. 16 | Create an instance of StandardScaler called scaler. 17 | Create an instance of KMeans with 4 clusters called kmeans. 18 | Create a pipeline called pipeline that chains scaler and kmeans. To do this, you just need to pass them in as arguments to make_pipeline(). 
19 | ''' 20 | from sklearn.cluster import KMeans 21 | # Perform the necessary imports 22 | from sklearn.pipeline import make_pipeline 23 | from sklearn.preprocessing import StandardScaler 24 | 25 | # Create scaler: scaler 26 | scaler = StandardScaler() 27 | 28 | # Create KMeans instance: kmeans 29 | kmeans = KMeans(n_clusters=4) 30 | 31 | # Create pipeline: pipeline 32 | pipeline = make_pipeline(scaler, kmeans) 33 | 34 | ''' 35 | Clustering the fish data 36 | 37 | You'll now use your standardization and clustering pipeline from the previous exercise to cluster the fish by their measurements, and then create a cross-tabulation to compare the cluster labels with the fish species. 38 | 39 | As before, samples is the 2D array of fish measurements. Your pipeline is available as pipeline, and the species of every fish sample is given by the list species. 40 | 41 | INSTRUCTIONS 42 | 100XP 43 | Import pandas as pd. 44 | Fit the pipeline to the fish measurements samples. 45 | Obtain the cluster labels for samples by using the .predict() method of pipeline. 46 | Using pd.DataFrame(), create a DataFrame df with two columns named 'labels' and 'species', using labels and species, respectively, for the column values. 47 | Using pd.crosstab(), create a cross-tabulation ct of df['labels'] and df['species']. 48 | ''' 49 | # Import pandas 50 | import pandas as pd 51 | import numpy as np 52 | 53 | from helper import points 54 | 55 | samples = np.array(points) 56 | 57 | # Fit the pipeline to samples 58 | pipeline.fit(samples) 59 | 60 | # Calculate the cluster labels: labels 61 | labels = pipeline.predict(samples) 62 | 63 | # Create a DataFrame with labels and species as columns: df 64 | df = pd.DataFrame({'labels': labels, 'species': species}) 65 | 66 | # Create crosstab: ct 67 | ct = pd.crosstab(df['labels'], df['species']) 68 | 69 | # Display ct 70 | print(ct) 71 | -------------------------------------------------------------------------------- /src/ml-unsupervised/01-clustering-for-dataset-exploration/08-09-scaling-&-clustering-which-stocks-move-together.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Clustering stocks using KMeans 3 | 4 | In this exercise, you'll cluster companies using their daily stock price movements (i.e. the dollar difference between the closing and opening prices for each trading day). You are given a NumPy array movements of daily price movements from 2010 to 2015 (obtained from Yahoo! Finance), where each row corresponds to a company, and each column corresponds to a trading day. 5 | 6 | Some stocks are more expensive than others. To account for this, include a Normalizer at the beginning of your pipeline. The Normalizer will separately transform each company's stock price to a relative scale before the clustering begins. 7 | 8 | Note that Normalizer() is different to StandardScaler(), which you used in the previous exercise. While StandardScaler() standardizes features (such as the features of the fish data from the previous exercise) by removing the mean and scaling to unit variance, Normalizer() rescales each sample - here, each company's stock price - independently of the other. 9 | 10 | KMeans and make_pipeline have already been imported for you. 11 | 12 | INSTRUCTIONS 13 | 100XP 14 | INSTRUCTIONS 15 | 100XP 16 | Import Normalizer from sklearn.preprocessing. 17 | Create an instance of Normalizer called normalizer. 18 | Create an instance of KMeans called kmeans with 10 clusters. 
19 | Using make_pipeline(), create a pipeline called pipeline that chains normalizer and kmeans. 20 | Fit the pipeline to the movements array. 21 | ''' 22 | from sklearn.cluster import KMeans 23 | # Import Normalizer 24 | from sklearn.preprocessing import Normalizer 25 | 26 | # Create a normalizer: normalizer 27 | normalizer = Normalizer() 28 | 29 | # Create a KMeans model with 10 clusters: kmeans 30 | kmeans = KMeans(n_clusters=10) 31 | 32 | # Make a pipeline chaining normalizer and kmeans: pipeline 33 | pipeline = make_pipeline(normalizer, kmeans) 34 | 35 | # Fit pipeline to the daily price movements 36 | pipeline.fit(movements) 37 | 38 | ''' 39 | Which stocks move together? 40 | 41 | In the previous exercise, you clustered companies by their daily stock price movements. So which company have stock 42 | prices that tend to change in the same way? You'll now inspect the cluster labels from your clustering to find out. 43 | 44 | Your solution to the previous exercise has already been run. Recall that you constructed a Pipeline pipeline 45 | containing a KMeans model and fit it to the NumPy array movements of daily stock movements. In addition, 46 | a list companies of the company names is available. 47 | 48 | INSTRUCTIONS 100XP INSTRUCTIONS 100XP Import pandas as pd. Use the .predict() method of the pipeline to predict the 49 | labels for movements. Align the cluster labels with the list of company names companies by creating a DataFrame df 50 | with labels and companies as columns. This has been done for you. Use the .sort_values() method of df to sort the 51 | DataFrame by the 'labels' column, and print the result. Hit 'Submit Answer' and take a moment to see which companies 52 | are together in each cluster! ''' 53 | # Import pandas 54 | import pandas as pd 55 | 56 | # Predict the cluster labels: labels 57 | labels = pipeline.predict(movements) 58 | 59 | # Create a DataFrame aligning labels and companies: df 60 | df = pd.DataFrame({'labels': labels, 'companies': companies}) 61 | 62 | # Display df sorted by cluster label 63 | print(df.sort_values('labels')) 64 | -------------------------------------------------------------------------------- /src/ml-unsupervised/01-clustering-for-dataset-exploration/ch1_slides.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saranshbansal/data-science-with-python/651e67d93f8aa65ce386978684523f7e14cd3b2a/src/ml-unsupervised/01-clustering-for-dataset-exploration/ch1_slides.pdf -------------------------------------------------------------------------------- /src/ml-unsupervised/01-clustering-for-dataset-exploration/chapter-details.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saranshbansal/data-science-with-python/651e67d93f8aa65ce386978684523f7e14cd3b2a/src/ml-unsupervised/01-clustering-for-dataset-exploration/chapter-details.png -------------------------------------------------------------------------------- /src/ml-unsupervised/01-clustering-for-dataset-exploration/chapter-details.rtf: -------------------------------------------------------------------------------- 1 | {\rtf1\ansi\ansicpg1252\cocoartf1561\cocoasubrtf200 2 | {\fonttbl\f0\fswiss\fcharset0 Helvetica;} 3 | {\colortbl;\red255\green255\blue255;\red85\green92\blue98;\red255\green255\blue255;} 4 | {\*\expandedcolortbl;;\cssrgb\c40784\c43529\c45882;\cssrgb\c100000\c100000\c100000;} 5 | \margl1440\margr1440\vieww10800\viewh8400\viewkind0 6 | \deftab720 7 
| \pard\pardeftab720\partightenfactor0 8 | 9 | \f0\fs30 \cf2 \cb3 \expnd0\expndtw0\kerning0 10 | Learn how to discover the underlying groups (or "clusters") in a dataset. By the end of this chapter, you'll be clustering companies using their stock market prices, and distinguishing different species by clustering their measurements.} -------------------------------------------------------------------------------- /src/ml-unsupervised/02-visualization-with-hierarchical-clustering-and-t-sne/01-hierarchical-clustering-of-the-grain-data.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Hierarchical clustering of the grain data 3 | 4 | In the video, you learned that the SciPy linkage() function performs hierarchical clustering on an array of samples. 5 | Use the linkage() function to obtain a hierarchical clustering of the grain samples, and use dendrogram() to 6 | visualize the result. A sample of the grain measurements is provided in the array samples, while the variety of each 7 | grain sample is given by the list varieties. 8 | 9 | INSTRUCTIONS 100XP Import: linkage and dendrogram from scipy.cluster.hierarchy. matplotlib.pyplot as plt. Perform 10 | hierarchical clustering on samples using the linkage() function with the method='complete' keyword argument. Assign 11 | the result to mergings. Plot a dendrogram using the dendrogram() function on mergings. Specify the keyword arguments 12 | labels=varieties, leaf_rotation=90, and leaf_font_size=6. ''' 13 | 14 | import matplotlib.pyplot as plt 15 | # Perform the necessary imports 16 | from scipy.cluster.hierarchy import linkage, dendrogram 17 | 18 | # Calculate the linkage: mergings 19 | mergings = linkage(samples, method='complete') 20 | 21 | # Plot the dendrogram, using varieties as labels 22 | dendrogram(mergings, 23 | labels=varieties, 24 | leaf_rotation=90, 25 | leaf_font_size=6, 26 | ) 27 | plt.show() 28 | -------------------------------------------------------------------------------- /src/ml-unsupervised/02-visualization-with-hierarchical-clustering-and-t-sne/02-hierarchies-of-stocks.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Hierarchies of stocks 3 | 4 | In chapter 1, you used k-means clustering to cluster companies according to their stock price movements. Now, 5 | you'll perform hierarchical clustering of the companies. You are given a NumPy array of price movements movements, 6 | where the rows correspond to companies, and a list of the company names companies. SciPy hierarchical clustering 7 | doesn't fit into a sklearn pipeline, so you'll need to use the normalize() function from sklearn.preprocessing 8 | instead of Normalizer. 9 | 10 | linkage and dendrogram have already been imported from sklearn.cluster.hierarchy, and PyPlot has been imported as plt. 11 | 12 | INSTRUCTIONS 13 | 100XP 14 | INSTRUCTIONS 15 | 100XP 16 | Import normalize from sklearn.preprocessing. 17 | Rescale the price movements for each stock by using the normalize() function on movements. 18 | Apply the linkage() function to normalized_movements, using 'complete' linkage, to calculate the hierarchical clustering. Assign the result to mergings. 19 | Plot a dendrogram of the hierarchical clustering, using the list companies of company names as the labels. In addition, specify the leaf_rotation=90, and leaf_font_size=6 keyword arguments as you did in the previous exercise. 
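Note: movements and companies are pre-loaded on DataCamp and are not defined in this repository. A hedged sketch for recreating them locally from the bundled _datasets/company-stock-movements-2010-2015-incl.csv is shown here; it assumes the company names sit in the first column and the daily price movements in the remaining columns, which this repo does not confirm:

import pandas as pd

# Assumed layout: column 0 = company name (used as the index),
# remaining columns = daily price movements for 2010-2015.
stocks = pd.read_csv('../../../_datasets/company-stock-movements-2010-2015-incl.csv', index_col=0)
companies = list(stocks.index)
movements = stocks.values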
20 | ''' 21 | # Import normalize 22 | from sklearn.preprocessing import normalize 23 | 24 | # Normalize the movements: normalized_movements 25 | normalized_movements = normalize(movements) 26 | 27 | # Calculate the linkage: mergings 28 | mergings = linkage(normalized_movements, method='complete') 29 | 30 | # Plot the dendrogram 31 | dendrogram(mergings, labels=companies, leaf_rotation=90, leaf_font_size=6) 32 | plt.show() 33 | -------------------------------------------------------------------------------- /src/ml-unsupervised/02-visualization-with-hierarchical-clustering-and-t-sne/03-different-linkage-different-hierarchical-clustering.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Different linkage, different hierarchical clustering! 3 | 4 | In the video, you saw a hierarchical clustering of the voting countries at the Eurovision song contest using 5 | 'complete' linkage. Now, perform a hierarchical clustering of the voting countries with 'single' linkage, and compare 6 | the resulting dendrogram with the one in the video. Different linkage, different hierarchical clustering! 7 | 8 | You are given an array samples. Each row corresponds to a voting country, and each column corresponds to a 9 | performance that was voted for. The list country_names gives the name of each voting country. This dataset was 10 | obtained from Eurovision. 11 | 12 | INSTRUCTIONS 13 | 100XP 14 | INSTRUCTIONS 15 | 100XP 16 | Import: 17 | linkage and dendrogram from scipy.cluster.hierarchy. 18 | matplotlib.pyplot as plt. 19 | Perform hierarchical clustering on samples using the linkage() function with the method='single' keyword argument. Assign the result to mergings. 20 | Plot a dendrogram of the hierarchical clustering, using the list country_names as the labels. In addition, specify the leaf_rotation=90, and leaf_font_size=6 keyword arguments as you have done earlier. 21 | ''' 22 | # Perform the necessary imports 23 | import matplotlib.pyplot as plt 24 | # samples and country_names (the Eurovision voting data and the list of voting country names) are pre-loaded in the exercise environment 25 | from scipy.cluster.hierarchy import linkage, dendrogram 26 | 27 | # Calculate the linkage: mergings 28 | mergings = linkage(samples, method='single') 29 | 30 | # Plot the dendrogram 31 | dendrogram(mergings, labels=country_names, leaf_rotation=90, leaf_font_size=6) 32 | plt.show() 33 | -------------------------------------------------------------------------------- /src/ml-unsupervised/02-visualization-with-hierarchical-clustering-and-t-sne/04-extracting-the-cluster-labels.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Extracting the cluster labels 3 | 4 | In the previous exercise, you saw that the intermediate clustering of the grain samples at height 6 has 3 clusters. 5 | Now, use the fcluster() function to extract the cluster labels for this intermediate clustering, and compare the 6 | labels with the grain varieties using a cross-tabulation. 7 | 8 | The hierarchical clustering has already been performed and mergings is the result of the linkage() function. The list 9 | varieties gives the variety of each grain sample. 10 | 11 | INSTRUCTIONS 12 | 100XP 13 | Import: 14 | pandas as pd. 15 | fcluster from scipy.cluster.hierarchy. 16 | Perform a flat hierarchical clustering by using the fcluster() function on mergings. Specify a maximum height of 6 and the keyword argument criterion='distance'.
17 | Create a DataFrame df with two columns named 'labels' and 'varieties', using labels and varieties, respectively, for the column values. This has been done for you. 18 | Create a cross-tabulation ct between df['labels'] and df['varieties'] to count the number of times each grain variety coincides with each cluster label. 19 | ''' 20 | # Perform the necessary imports 21 | import pandas as pd 22 | from scipy.cluster.hierarchy import fcluster 23 | 24 | # Use fcluster to extract labels: labels 25 | labels = fcluster(mergings, 6, criterion='distance') 26 | 27 | # Create a DataFrame with labels and varieties as columns: df 28 | df = pd.DataFrame({'labels': labels, 'varieties': varieties}) 29 | 30 | # Create crosstab: ct 31 | ct = pd.crosstab(df['labels'], df['varieties']) 32 | 33 | # Display ct 34 | print(ct) 35 | -------------------------------------------------------------------------------- /src/ml-unsupervised/02-visualization-with-hierarchical-clustering-and-t-sne/05-tsne-visualization-of-grain-dataset.py: -------------------------------------------------------------------------------- 1 | ''' 2 | t-SNE visualization of grain dataset 3 | 4 | In the video, you saw t-SNE applied to the iris dataset. In this exercise, you'll apply t-SNE to the grain samples 5 | data and inspect the resulting t-SNE features using a scatter plot. You are given an array samples of grain samples 6 | and a list variety_numbers giving the variety number of each grain sample. 7 | 8 | INSTRUCTIONS 100XP Import TSNE from sklearn.manifold. Create a TSNE instance called model with learning_rate=200. 9 | Apply the .fit_transform() method of model to samples. Assign the result to tsne_features. Select the column 0 of 10 | tsne_features. Assign the result to xs. Select the column 1 of tsne_features. Assign the result to ys. Make a scatter 11 | plot of the t-SNE features xs and ys. To color the points by the grain variety, specify the additional keyword 12 | argument c=variety_numbers. ''' 13 | # Import TSNE 14 | from sklearn.manifold import TSNE 15 | 16 | # Create a TSNE instance: model 17 | model = TSNE(learning_rate=200) 18 | 19 | # Apply fit_transform to samples: tsne_features 20 | tsne_features = model.fit_transform(samples) 21 | 22 | # Select the 0th feature: xs 23 | xs = tsne_features[:, 0] 24 | 25 | # Select the 1st feature: ys 26 | ys = tsne_features[:, 1] 27 | 28 | # Scatter plot, coloring by variety_numbers 29 | plt.scatter(xs, ys, c=variety_numbers) 30 | plt.show() 31 | -------------------------------------------------------------------------------- /src/ml-unsupervised/02-visualization-with-hierarchical-clustering-and-t-sne/06-a-tsne-map-of-the-stock-market.py: -------------------------------------------------------------------------------- 1 | ''' 2 | A t-SNE map of the stock market 3 | 4 | t-SNE provides great visualizations when the individual samples can be labeled. In this exercise, you'll apply t-SNE to the company stock price data. A scatter plot of the resulting t-SNE features, labeled by the company names, gives you a map of the stock market! The stock price movements for each company are available as the array normalized_movements (these have already been normalized for you). The list companies gives the name of each company. PyPlot (plt) has been imported for you. 5 | 6 | INSTRUCTIONS 7 | 100XP 8 | INSTRUCTIONS 9 | 100XP 10 | Import TSNE from sklearn.manifold. 11 | Create a TSNE instance called model with learning_rate=50. 12 | Apply the .fit_transform() method of model to normalized_movements. 
Assign the result to tsne_features. 13 | Select column 0 and column 1 of tsne_features. 14 | Make a scatter plot of the t-SNE features xs and ys. Specify the additional keyword argument alpha=0.5. 15 | Code to label each point with its company name has been written for you using plt.annotate(), so just hit 'Submit Answer' to see the visualization! 16 | ''' 17 | # Import TSNE 18 | from sklearn.manifold import TSNE 19 | 20 | # Create a TSNE instance: model 21 | model = TSNE(learning_rate=50) 22 | 23 | # Apply fit_transform to normalized_movements: tsne_features 24 | tsne_features = model.fit_transform(normalized_movements) 25 | 26 | # Select the 0th feature: xs 27 | xs = tsne_features[:,0] 28 | 29 | # Select the 1th feature: ys 30 | ys = tsne_features[:,1] 31 | 32 | # Scatter plot 33 | plt.scatter(xs, ys, alpha=0.5) 34 | 35 | # Annotate the points 36 | for x, y, company in zip(xs, ys, companies): 37 | plt.annotate(company, (x, y), fontsize=5, alpha=0.75) 38 | plt.show() 39 | -------------------------------------------------------------------------------- /src/ml-unsupervised/02-visualization-with-hierarchical-clustering-and-t-sne/ch2_slides.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saranshbansal/data-science-with-python/651e67d93f8aa65ce386978684523f7e14cd3b2a/src/ml-unsupervised/02-visualization-with-hierarchical-clustering-and-t-sne/ch2_slides.pdf -------------------------------------------------------------------------------- /src/ml-unsupervised/02-visualization-with-hierarchical-clustering-and-t-sne/chapter-details.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saranshbansal/data-science-with-python/651e67d93f8aa65ce386978684523f7e14cd3b2a/src/ml-unsupervised/02-visualization-with-hierarchical-clustering-and-t-sne/chapter-details.png -------------------------------------------------------------------------------- /src/ml-unsupervised/02-visualization-with-hierarchical-clustering-and-t-sne/chapter-details.rtf: -------------------------------------------------------------------------------- 1 | {\rtf1\ansi\ansicpg1252\cocoartf1561\cocoasubrtf200 2 | {\fonttbl\f0\fswiss\fcharset0 Helvetica;} 3 | {\colortbl;\red255\green255\blue255;\red85\green92\blue98;\red255\green255\blue255;} 4 | {\*\expandedcolortbl;;\cssrgb\c40784\c43529\c45882;\cssrgb\c100000\c100000\c100000;} 5 | \margl1440\margr1440\vieww10800\viewh8400\viewkind0 6 | \deftab720 7 | \pard\pardeftab720\partightenfactor0 8 | 9 | \f0\fs30 \cf2 \cb3 \expnd0\expndtw0\kerning0 10 | In this chapter, you'll learn about two unsupervised learning techniques for data visualization, hierarchical clustering and t-SNE. Hierarchical clustering merges the data samples into ever-coarser clusters, yielding a tree visualization of the resulting cluster hierarchy. t-SNE maps the data samples into 2d space so that the proximity of the samples to one another can be visualized.} -------------------------------------------------------------------------------- /src/ml-unsupervised/03-decorrelating-your-data-and-dimension-reduction/01-correlated-data-in-nature.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Correlated data in nature 3 | 4 | You are given an array grains giving the width and length of samples of grain. You suspect that width and length will be correlated. 
To confirm this, make a scatter plot of width vs length and measure their Pearson correlation. 5 | 6 | INSTRUCTIONS 7 | 100XP 8 | Import: 9 | matplotlib.pyplot as plt. 10 | pearsonr from scipy.stats. 11 | Assign column 0 of grains to width and column 1 of grains to length. 12 | Make a scatter plot with width on the x-axis and length on the y-axis. 13 | Use the pearsonr() function to calculate the Pearson correlation of width and length. 14 | ''' 15 | 16 | # Perform the necessary imports 17 | import matplotlib.pyplot as plt 18 | from grains_data_from_dataset import grains 19 | from scipy.stats import pearsonr 20 | 21 | # Assign the 0th column of grains: width 22 | width = grains[:, 0] 23 | 24 | # Assign the 1st column of grains: length 25 | length = grains[:, 1] 26 | # Scatter plot width vs length 27 | plt.scatter(width, length) 28 | plt.axis('equal') 29 | plt.show() 30 | 31 | # Calculate the Pearson correlation 32 | correlation, pvalue = pearsonr(width, length) 33 | 34 | # Display the correlation 35 | print(correlation) 36 | -------------------------------------------------------------------------------- /src/ml-unsupervised/03-decorrelating-your-data-and-dimension-reduction/02-decorrelating-the-grain-measurements-with-pca.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Decorrelating the grain measurements with PCA 3 | 4 | You observed in the previous exercise that the width and length measurements of the grain are correlated. Now, you'll use PCA to decorrelate these measurements, then plot the decorrelated points and measure their Pearson correlation. 5 | 6 | INSTRUCTIONS 7 | 100XP 8 | Import PCA from sklearn.decomposition. 9 | Create an instance of PCA called model. 10 | Use the .fit_transform() method of model to apply the PCA transformation to grains. Assign the result to pca_features. 11 | The subsequent code to extract, plot, and compute the Pearson correlation of the first two columns pca_features has been written for you, so hit 'Submit Answer' to see the result! 12 | ''' 13 | import matplotlib.pyplot as plt 14 | from scipy.stats import pearsonr 15 | # Import PCA 16 | from sklearn.decomposition import PCA 17 | 18 | from grains_data_from_dataset import grains 19 | 20 | # Create PCA instance: model 21 | model = PCA() 22 | 23 | # Apply the fit_transform method of model to grains: pca_features 24 | pca_features = model.fit_transform(grains) 25 | 26 | # Assign 0th column of pca_features: xs 27 | xs = pca_features[:, 0] 28 | 29 | # Assign 1st column of pca_features: ys 30 | ys = pca_features[:, 1] 31 | 32 | # Scatter plot xs vs ys 33 | plt.scatter(xs, ys) 34 | plt.axis('equal') 35 | plt.show() 36 | 37 | # Calculate the Pearson correlation of xs and ys 38 | correlation, pvalue = pearsonr(xs, ys) 39 | 40 | # Display the correlation 41 | print(correlation) 42 | -------------------------------------------------------------------------------- /src/ml-unsupervised/03-decorrelating-your-data-and-dimension-reduction/03-the-first-principal-component.py: -------------------------------------------------------------------------------- 1 | ''' 2 | The first principal component 3 | 4 | The first principal component of the data is the direction in which the data varies the most. In this exercise, 5 | your job is to use PCA to find the first principal component of the length and width measurements of the grain 6 | samples, and represent it as an arrow on the scatter plot. 7 | 8 | The array grains gives the length and width of the grain samples. 
PyPlot (plt) and PCA have already been imported for 9 | you. 10 | 11 | INSTRUCTIONS 12 | 100XP 13 | INSTRUCTIONS 14 | 100XP 15 | Make a scatter plot of the grain measurements. This has been done for you. 16 | Create a PCA instance called model. 17 | Fit the model to the grains data. 18 | Extract the coordinates of the mean of the data using the .mean_ attribute of model. 19 | Get the first principal component of model using the .components_[0,:] attribute. 20 | Plot the first principal component as an arrow on the scatter plot, using the plt.arrow() function. You have to specify the first two arguments - mean[0] and mean[1]. 21 | ''' 22 | import matplotlib.pyplot as plt 23 | from sklearn.decomposition import PCA 24 | 25 | from grains_data_from_dataset import grains 26 | 27 | # Make a scatter plot of the untransformed points 28 | plt.scatter(grains[:, 0], grains[:, 1]) 29 | 30 | # Create a PCA instance: model 31 | model = PCA() 32 | 33 | # Fit model to points 34 | model.fit(grains) 35 | 36 | # Get the mean of the grain samples: mean 37 | mean = model.mean_ 38 | 39 | # Get the first principal component: first_pc 40 | first_pc = model.components_[0, :] 41 | 42 | # Plot first_pc as an arrow, starting at mean 43 | plt.arrow(mean[0], mean[1], first_pc[0], first_pc[1], color='red', width=0.01) 44 | 45 | # Keep axes on same scale 46 | plt.axis('equal') 47 | plt.show() 48 | -------------------------------------------------------------------------------- /src/ml-unsupervised/03-decorrelating-your-data-and-dimension-reduction/04-variance-of-the-pca-features.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Variance of the PCA features 3 | 4 | The fish dataset is 6-dimensional. But what is its intrinsic dimension? Make a plot of the variances of the PCA 5 | features to find out. As before, samples is a 2D array, where each row represents a fish. You'll need to standardize 6 | the features first. 7 | 8 | INSTRUCTIONS 100XP Create an instance of StandardScaler called scaler. Create a PCA instance called pca. Use the 9 | make_pipeline() function to create a pipeline chaining scaler and pca. Use the .fit() method of pipeline to fit it to 10 | the fish samples samples. Extract the number of components used using the .n_components_ attribute of pca. Place this 11 | inside a range() function and store the result as features. Use the plt.bar() function to plot the explained 12 | variances, with features on the x-axis and pca.explained_variance_ on the y-axis. 
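A useful follow-up once the pipeline below has been fitted (not part of the exercise): the fitted PCA step also exposes explained_variance_ratio_, and its cumulative sum shows what share of the total variance the first k components retain, which makes the intrinsic dimension easier to read off than the raw bar heights:

import numpy as np

# pca.explained_variance_ratio_ sums to 1 after pipeline.fit(samples);
# the running total shows how much variance the first k components keep.
cumulative = np.cumsum(pca.explained_variance_ratio_)
print(cumulative)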
''' 13 | import matplotlib.pyplot as plt 14 | import numpy as np 15 | # Perform the necessary imports 16 | from sklearn.decomposition import PCA 17 | from sklearn.pipeline import make_pipeline 18 | from sklearn.preprocessing import StandardScaler 19 | 20 | from helper import points 21 | 22 | samples = np.array(points) 23 | 24 | # Create scaler: scaler 25 | scaler = StandardScaler() 26 | 27 | # Create a PCA instance: pca 28 | pca = PCA() 29 | 30 | # Create pipeline: pipeline 31 | pipeline = make_pipeline(scaler, pca) 32 | 33 | # Fit the pipeline to 'samples' 34 | pipeline.fit(samples) 35 | 36 | # Plot the explained variances 37 | features = range(pca.n_components_) 38 | plt.bar(features, pca.explained_variance_) 39 | plt.xlabel('PCA feature') 40 | plt.ylabel('variance') 41 | plt.xticks(features) 42 | plt.show() 43 | -------------------------------------------------------------------------------- /src/ml-unsupervised/03-decorrelating-your-data-and-dimension-reduction/05-dimension-reduction-of-the-fish-measuremenys.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Dimension reduction of the fish measurements 3 | 4 | In a previous exercise, you saw that 2 was a reasonable choice for the "intrinsic dimension" of the fish 5 | measurements. Now use PCA for dimensionality reduction of the fish measurements, retaining only the 2 most important 6 | components. 7 | 8 | The fish measurements have already been scaled for you, and are available as scaled_samples. 9 | 10 | INSTRUCTIONS 11 | 100XP 12 | Import PCA from sklearn.decomposition. 13 | Create a PCA instance called pca with n_components=2. 14 | Use the .fit() method of pca to fit it to the scaled fish measurements scaled_samples. 15 | Use the .transform() method of pca to transform the scaled_samples. Assign the result to pca_features. 16 | ''' 17 | # Import PCA 18 | from sklearn.decomposition import PCA 19 | 20 | from helper import scaled_samples 21 | 22 | # Create a PCA instance: pca 23 | pca = PCA() 24 | 25 | # Create a PCA model with 2 components: pca 26 | pca = PCA(n_components=2) 27 | 28 | # Fit the PCA instance to the scaled samples 29 | pca.fit(scaled_samples) 30 | 31 | # Transform the scaled samples: pca_features 32 | pca_features = pca.transform(scaled_samples) 33 | 34 | # Print the shape of pca_features 35 | print(pca_features.shape) 36 | -------------------------------------------------------------------------------- /src/ml-unsupervised/03-decorrelating-your-data-and-dimension-reduction/06-a-tfidf-word-frequency-array.py: -------------------------------------------------------------------------------- 1 | ''' 2 | A tf-idf word-frequency array 3 | 4 | In this exercise, you'll create a tf-idf word frequency array for a toy collection of documents. For this, 5 | use the TfidfVectorizer from sklearn. It transforms a list of documents into a word frequency array, which it outputs 6 | as a csr_matrix. It has fit() and transform() methods like other sklearn objects. 7 | 8 | You are given a list documents of toy documents about pets. Its contents have been printed in the IPython Shell. 9 | 10 | INSTRUCTIONS 11 | 100XP 12 | Import TfidfVectorizer from sklearn.feature_extraction.text. 13 | Create a TfidfVectorizer instance called tfidf. 14 | Apply .fit_transform() method of tfidf to documents and assign the result to csr_mat. This is a word-frequency array in csr_matrix format. 15 | Inspect csr_mat by calling its .toarray() method and printing the result. This has been done for you. 
16 | The columns of the array correspond to words. Get the list of words by calling the .get_feature_names() method of tfidf, and assign the result to words. 17 | ''' 18 | # Import TfidfVectorizer 19 | from sklearn.feature_extraction.text import TfidfVectorizer 20 | 21 | documents = ['cats say meow', 'dogs say woof', 'dogs chase cats'] 22 | 23 | # Create a TfidfVectorizer: tfidf 24 | tfidf = TfidfVectorizer() 25 | 26 | # Apply fit_transform to document: csr_mat 27 | csr_mat = tfidf.fit_transform(documents) 28 | 29 | # Print result of toarray() method 30 | print(csr_mat.toarray()) 31 | 32 | # Get the words: words 33 | words = tfidf.get_feature_names() 34 | 35 | # Print words 36 | print(words) 37 | -------------------------------------------------------------------------------- /src/ml-unsupervised/03-decorrelating-your-data-and-dimension-reduction/07-clustering-wikipedia-part-1.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Clustering Wikipedia part I 3 | 4 | You saw in the video that TruncatedSVD is able to perform PCA on sparse arrays in csr_matrix format, such as word-frequency arrays. Combine your knowledge of TruncatedSVD and k-means to cluster some popular pages from Wikipedia. In this exercise, build the pipeline. In the next exercise, you'll apply it to the word-frequency array of some Wikipedia articles. 5 | 6 | Create a Pipeline object consisting of a TruncatedSVD followed by KMeans. (This time, we've precomputed the word-frequency matrix for you, so there's no need for a TfidfVectorizer). 7 | 8 | The Wikipedia dataset you will be working with was obtained from here. 9 | 10 | INSTRUCTIONS 11 | 100XP 12 | Import: 13 | TruncatedSVD from sklearn.decomposition. 14 | KMeans from sklearn.cluster. 15 | make_pipeline from sklearn.pipeline. 16 | Create a TruncatedSVD instance called svd with n_components=50. 17 | Create a KMeans instance called kmeans with n_clusters=6. 18 | Create a pipeline called pipeline consisting of svd and kmeans. 19 | ''' 20 | from sklearn.cluster import KMeans 21 | # Perform the necessary imports 22 | from sklearn.decomposition import TruncatedSVD 23 | from sklearn.pipeline import make_pipeline 24 | 25 | # Create a TruncatedSVD instance: svd 26 | svd = TruncatedSVD(n_components=50) 27 | 28 | # Create a KMeans instance: kmeans 29 | kmeans = KMeans(n_clusters=6) 30 | 31 | # Create a pipeline: pipeline 32 | pipeline = make_pipeline(svd, kmeans) 33 | 34 | -------------- 35 | 36 | ''' 37 | Clustering Wikipedia part II 38 | 39 | It is now time to put your pipeline from the previous exercise to work! You are given an array articles of tf-idf 40 | word-frequencies of some popular Wikipedia articles, and a list titles of their titles. Use your pipeline to cluster 41 | the Wikipedia articles. 42 | 43 | A solution to the previous exercise has been pre-loaded for you, so a Pipeline pipeline chaining TruncatedSVD with 44 | KMeans is available. 45 | 46 | INSTRUCTIONS 47 | 100XP 48 | Import pandas as pd. 49 | Fit the pipeline to the word-frequency array articles. 50 | Predict the cluster labels. 51 | Align the cluster labels with the list titles of article titles by creating a DataFrame df with labels and titles as columns. This has been done for you. 52 | Use the .sort_values() method of df to sort the DataFrame by the 'label' column, and print the result. 53 | Hit 'Submit Answer' and take a moment to investigate your amazing clustering of Wikipedia pages! 
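Note: the tf-idf array articles and the list titles are pre-loaded on DataCamp, and the Wikipedia text itself is not shipped with this repository, so this script will not run as-is. If you have the raw article bodies, a hedged sketch of producing an equivalent sparse matrix with TfidfVectorizer (as in the earlier tf-idf exercise) would be:

from sklearn.feature_extraction.text import TfidfVectorizer

# raw_texts and titles are placeholders for data not included in this repo.
tfidf = TfidfVectorizer(stop_words='english')
articles = tfidf.fit_transform(raw_texts)   # csr_matrix, ready for the TruncatedSVD + KMeans pipeline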
54 | ''' 55 | # Import pandas 56 | import pandas as pd 57 | from helper import titles 58 | 59 | # Fit the pipeline to articles 60 | pipeline.fit(articles) 61 | 62 | # Calculate the cluster labels: labels 63 | labels = pipeline.predict(articles) 64 | 65 | # Create a DataFrame aligning labels and titles: df 66 | df = pd.DataFrame({'label': labels, 'article': titles}) 67 | 68 | # Display df sorted by cluster label 69 | print(df.sort_values('label')) 70 | -------------------------------------------------------------------------------- /src/ml-unsupervised/03-decorrelating-your-data-and-dimension-reduction/08-clustering-wikipedia-part-2.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Clustering Wikipedia part II 3 | 4 | It is now time to put your pipeline from the previous exercise to work! You are given an array articles of tf-idf 5 | word-frequencies of some popular Wikipedia articles, and a list titles of their titles. Use your pipeline to cluster 6 | the Wikipedia articles. 7 | 8 | A solution to the previous exercise has been pre-loaded for you, so a Pipeline pipeline chaining TruncatedSVD with 9 | KMeans is available. 10 | 11 | INSTRUCTIONS 12 | 100XP 13 | Import pandas as pd. 14 | Fit the pipeline to the word-frequency array articles. 15 | Predict the cluster labels. 16 | Align the cluster labels with the list titles of article titles by creating a DataFrame df with labels and titles as columns. This has been done for you. 17 | Use the .sort_values() method of df to sort the DataFrame by the 'label' column, and print the result. 18 | Hit 'Submit Answer' and take a moment to investigate your amazing clustering of Wikipedia pages! 19 | ''' 20 | # Import pandas 21 | import pandas as pd 22 | # Fit the pipeline to articles 23 | pipeline.fit(articles) 24 | 25 | # Calculate the cluster labels: labels 26 | labels = pipeline.predict(articles) 27 | 28 | # Create a DataFrame aligning labels and titles: df 29 | df = pd.DataFrame({'label': labels, 'article': titles}) 30 | 31 | # Display df sorted by cluster label 32 | print(df.sort_values('label')) 33 | -------------------------------------------------------------------------------- /src/ml-unsupervised/03-decorrelating-your-data-and-dimension-reduction/ch3_slides.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saranshbansal/data-science-with-python/651e67d93f8aa65ce386978684523f7e14cd3b2a/src/ml-unsupervised/03-decorrelating-your-data-and-dimension-reduction/ch3_slides.pdf -------------------------------------------------------------------------------- /src/ml-unsupervised/03-decorrelating-your-data-and-dimension-reduction/chapter-details.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saranshbansal/data-science-with-python/651e67d93f8aa65ce386978684523f7e14cd3b2a/src/ml-unsupervised/03-decorrelating-your-data-and-dimension-reduction/chapter-details.png -------------------------------------------------------------------------------- /src/ml-unsupervised/03-decorrelating-your-data-and-dimension-reduction/chapter-details.rtf: -------------------------------------------------------------------------------- 1 | {\rtf1\ansi\ansicpg1252\cocoartf1561\cocoasubrtf200 2 | {\fonttbl\f0\fswiss\fcharset0 Helvetica;} 3 | {\colortbl;\red255\green255\blue255;\red85\green92\blue98;\red255\green255\blue255;} 4 | 
{\*\expandedcolortbl;;\cssrgb\c40784\c43529\c45882;\cssrgb\c100000\c100000\c100000;} 5 | \margl1440\margr1440\vieww10800\viewh8400\viewkind0 6 | \deftab720 7 | \pard\pardeftab720\partightenfactor0 8 | 9 | \f0\fs30 \cf2 \cb3 \expnd0\expndtw0\kerning0 10 | Dimension reduction summarizes a dataset using its common occuring patterns. In this chapter, you'll learn about the most fundamental of dimension reduction techniques, "Principal Component Analysis" ("PCA"). PCA is often used before supervised learning to improve model performance and generalization. It can also be useful for unsupervised learning. For example, you'll employ a variant of PCA will allow you to cluster Wikipedia articles by their content!\cf2 \cb3 .} -------------------------------------------------------------------------------- /src/ml-unsupervised/04-discovering-interpretable-features/01-nmf-applied-to-wikipedia-articles.py: -------------------------------------------------------------------------------- 1 | ''' 2 | NMF applied to Wikipedia articles 3 | 4 | In the video, you saw NMF applied to transform a toy word-frequency array. Now it's your turn to apply NMF, this time using the tf-idf word-frequency array of Wikipedia articles, given as a csr matrix articles. Here, fit the model and transform the articles. In the next exercise, you'll explore the result. 5 | 6 | INSTRUCTIONS 7 | 100XP 8 | Import NMF from sklearn.decomposition. 9 | Create an NMF instance called model with 6 components. 10 | Fit the model to the word count data articles. 11 | Use the .transform() method of model to transform articles, and assign the result to nmf_features. 12 | Print nmf_features to get a first idea what it looks like. 13 | ''' 14 | # Import NMF 15 | from sklearn.decomposition import NMF 16 | 17 | # Create an NMF instance: model 18 | model = NMF(n_components=6) 19 | 20 | # Fit the model to articles 21 | model.fit(articles) 22 | 23 | # Transform the articles: nmf_features 24 | nmf_features = model.transform(articles) 25 | 26 | # Print the NMF features 27 | print(nmf_features) 28 | -------------------------------------------------------------------------------- /src/ml-unsupervised/04-discovering-interpretable-features/02-nmf-features-of-the-wikipedia-articles.py: -------------------------------------------------------------------------------- 1 | ''' 2 | NMF features of the Wikipedia articles 3 | 4 | Now you will explore the NMF features you created in the previous exercise. A solution to the previous exercise has been pre-loaded, so the array nmf_features is available. Also available is a list titles giving the title of each Wikipedia article. 5 | 6 | When investigating the features, notice that for both actors, the NMF feature 3 has by far the highest value. This means that both articles are reconstructed using mainly the 3rd NMF component. In the next video, you'll see why: NMF components represent topics (for instance, acting!). 7 | 8 | INSTRUCTIONS 9 | 100XP 10 | Import pandas as pd. 11 | Create a DataFrame df from nmf_features using pd.DataFrame(). Set the index to titles using index=titles. 12 | Use the .loc[] accessor of df to select the row with title 'Anne Hathaway', and print the result. These are the NMF features for the article about the actress Anne Hathaway. 13 | Repeat the last step for 'Denzel Washington' (another actor). 
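An optional sanity check, not part of the exercise: NMF factorises the tf-idf matrix into features times components, so multiplying an article's NMF feature row by model.components_ approximately reconstructs that article's tf-idf row. Assuming articles and the fitted model from the previous exercise are still in scope:

import numpy as np

i = 0                                          # any article index
approx = nmf_features[i] @ model.components_   # weighted sum of the topic vectors
print(np.round(approx, 2))                     # roughly matches articles[i].toarray()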
14 | ''' 15 | # Import pandas 16 | import pandas as pd 17 | 18 | # Create a pandas DataFrame: df 19 | df = pd.DataFrame(nmf_features, index=titles) 20 | 21 | # Print the row for 'Anne Hathaway' 22 | print(df.loc['Anne Hathaway']) 23 | 24 | # Print the row for 'Denzel Washington' 25 | print(df.loc['Denzel Washington']) 26 | -------------------------------------------------------------------------------- /src/ml-unsupervised/04-discovering-interpretable-features/03-nmf-learns-topics-of-documents.py: -------------------------------------------------------------------------------- 1 | ''' 2 | NMF learns topics of documents 3 | 4 | In the video, you learned when NMF is applied to documents, the components correspond to topics of documents, and the NMF features reconstruct the documents from the topics. Verify this for yourself for the NMF model that you built earlier using the Wikipedia articles. Previously, you saw that the 3rd NMF feature value was high for the articles about actors Anne Hathaway and Denzel Washington. In this exercise, identify the topic of the corresponding NMF component. 5 | 6 | The NMF model you built earlier is available as model, while words is a list of the words that label the columns of the word-frequency array. 7 | 8 | After you are done, take a moment to recognise the topic that the articles about Anne Hathaway and Denzel Washington have in common! 9 | 10 | INSTRUCTIONS 11 | 100XP 12 | Import pandas as pd. 13 | Create a DataFrame components_df from model.components_, setting columns=words so that columns are labeled by the words. 14 | Print components_df.shape to check the dimensions of the DataFrame. 15 | Use the .iloc[] accessor on the DataFrame components_df to select row 3. Assign the result to component. 16 | Call the .nlargest() method of component, and print the result. This gives the five words with the highest values for that component. 17 | ''' 18 | # Import pandas 19 | import pandas as pd 20 | 21 | # Create a DataFrame: components_df 22 | components_df = pd.DataFrame(model.components_, columns=words) 23 | 24 | # Print the shape of the DataFrame 25 | print(components_df.shape) 26 | 27 | # Select row 3: component 28 | component = components_df.iloc[3] 29 | 30 | # Print result of nlargest 31 | print(component.nlargest()) -------------------------------------------------------------------------------- /src/ml-unsupervised/04-discovering-interpretable-features/04-explore-the-led-digits-dataset.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Explore the LED digits dataset 3 | 4 | In the following exercises, you'll use NMF to decompose grayscale images into their commonly occurring patterns. Firstly, explore the image dataset and see how it is encoded as an array. You are given 100 images as a 2D array samples, where each row represents a single 13x8 image. The images in your dataset are pictures of a LED digital display. 5 | 6 | INSTRUCTIONS 7 | 100XP 8 | Import matplotlib.pyplot as plt. 9 | Select row 0 of samples and assign the result to digit. For example, to select column 2 of an array a, you could use a[:,2]. Remember that since samples is a NumPy array, you can't use the .loc[] or iloc[] accessors to select specific rows or columns. 10 | Print digit. This has been done for you. Notice that it is a 1D array of 0s and 1s. 11 | Use the .reshape() method of digit to get a 2D array with shape (13, 8). Assign the result to bitmap. 12 | Print bitmap, and notice that the 1s show the digit 7! 
13 | Use the plt.imshow() function to display bitmap as an image. 14 | ''' 15 | import csv 16 | 17 | import numpy as np 18 | # Import pyplot 19 | from matplotlib import pyplot as plt 20 | 21 | from helper import path 22 | 23 | with open('../' + path + 'lcd-digits.csv', 'r') as f: 24 | samples = list(csv.reader(f, delimiter=',')) 25 | samples = np.array(samples).astype(np.float) 26 | 27 | # Select the 0th row: digit 28 | digit = samples[0, :] 29 | 30 | # Print digit 31 | print(digit) 32 | 33 | # Reshape digit to a 13x8 array: bitmap 34 | bitmap = digit.reshape(13, 8) 35 | 36 | # Print bitmap 37 | print(bitmap) 38 | 39 | # Use plt.imshow to display bitmap 40 | plt.imshow(bitmap, cmap='gray', interpolation='nearest') 41 | plt.colorbar() 42 | plt.show() 43 | -------------------------------------------------------------------------------- /src/ml-unsupervised/04-discovering-interpretable-features/05-nmf-learns-the-parts-of-images.py: -------------------------------------------------------------------------------- 1 | ''' 2 | NMF learns the parts of images 3 | 4 | Now use what you've learned about NMF to decompose the digits dataset. You are again given the digit images as a 2D array samples. This time, you are also provided with a function show_as_image() that displays the image encoded by any 1D array: 5 | 6 | def show_as_image(sample): 7 | bitmap = sample.reshape((13, 8)) 8 | plt.figure() 9 | plt.imshow(bitmap, cmap='gray', interpolation='nearest') 10 | plt.colorbar() 11 | plt.show() 12 | After you are done, take a moment to look through the plots and notice how NMF has expressed the digit as a sum of the components! 13 | 14 | INSTRUCTIONS 15 | 100XP 16 | Import NMF from sklearn.decomposition. 17 | Create an NMF instance called model with 7 components. (7 is the number of cells in an LED display). 18 | Apply the .fit_transform() method of model to samples. Assign the result to features. 19 | To each component of the model (accessed via model.components_), apply the show_as_image() function to that component inside the loop. 20 | Assign the row 0 of features to digit_features. 21 | Print digit_features. 22 | ''' 23 | import csv 24 | 25 | import numpy as np 26 | # Import pyplot 27 | from matplotlib import pyplot as plt 28 | # Import NMF 29 | from sklearn.decomposition import NMF 30 | 31 | from helper import path 32 | 33 | with open('../' + path + 'lcd-digits.csv', 'r') as f: 34 | samples = list(csv.reader(f, delimiter=',')) 35 | samples = np.array(samples).astype(np.float) 36 | 37 | 38 | def show_as_image(sample): 39 | bitmap = sample.reshape((13, 8)) 40 | plt.figure() 41 | plt.imshow(bitmap, cmap='gray', interpolation='nearest') 42 | plt.colorbar() 43 | plt.show() 44 | 45 | 46 | # Create an NMF model: model 47 | model = NMF(n_components=7) 48 | 49 | # Apply fit_transform to samples: features 50 | features = model.fit_transform(samples) 51 | 52 | # Call show_as_image on each component 53 | for component in model.components_: 54 | show_as_image(component) 55 | 56 | # Assign the 0th row of features: digit_features 57 | digit_features = features[0, :] 58 | 59 | # Print digit_features 60 | print(digit_features) 61 | -------------------------------------------------------------------------------- /src/ml-unsupervised/04-discovering-interpretable-features/06-pca-doesnt-learn-parts.py: -------------------------------------------------------------------------------- 1 | ''' 2 | PCA doesn't learn parts 3 | 4 | Unlike NMF, PCA doesn't learn the parts of things. 
Its components do not correspond to topics (in the case of documents) or to parts of images, when trained on images. Verify this for yourself by inspecting the components of a PCA model fit to the dataset of LED digit images from the previous exercise. The images are available as a 2D array samples. Also available is a modified version of the show_as_image() function which colors a pixel red if the value is negative. 5 | 6 | After submitting the answer, notice that the components of PCA do not represent meaningful parts of images of LED digits! 7 | 8 | INSTRUCTIONS 9 | 100XP 10 | Import PCA from sklearn.decomposition. 11 | Create a PCA instance called model with 7 components. 12 | Apply the .fit_transform() method of model to samples. Assign the result to features. 13 | To each component of the model (accessed via model.components_), apply the show_as_image() function to that component inside the loop. 14 | ''' 15 | import csv 16 | 17 | import numpy as np 18 | # Import pyplot 19 | from matplotlib import pyplot as plt 20 | # Import PCA 21 | from sklearn.decomposition import PCA 22 | 23 | from helper import path 24 | 25 | with open('../' + path + 'lcd-digits.csv', 'r') as f: 26 | samples = list(csv.reader(f, delimiter=',')) 27 | samples = np.array(samples).astype(np.float) 28 | 29 | 30 | def show_as_image(sample): 31 | bitmap = sample.reshape((13, 8)) 32 | bitmap[bitmap >= 0] = 1 33 | bitmap[bitmap < 0] = 0 34 | plt.figure() 35 | plt.imshow(bitmap, cmap='gist_yarg', interpolation='nearest', vmin=-.1, vmax=1.1) 36 | plt.colorbar() 37 | plt.show() 38 | 39 | 40 | # Create a PCA instance: model 41 | model = PCA(n_components=7) 42 | 43 | # Apply fit_transform to samples: features 44 | features = model.fit_transform(samples) 45 | 46 | # Call show_as_image on each component 47 | for component in model.components_: 48 | show_as_image(component) 49 | -------------------------------------------------------------------------------- /src/ml-unsupervised/04-discovering-interpretable-features/07-which-articles-are-similar-to-cristiano-ronaldo.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Which articles are similar to 'Cristiano Ronaldo'? 3 | 4 | In the video, you learned how to use NMF features and the cosine similarity to find similar articles. Apply this to your NMF model for popular Wikipedia articles, by finding the articles most similar to the article about the footballer Cristiano Ronaldo. The NMF features you obtained earlier are available as nmf_features, while titles is a list of the article titles. 5 | 6 | INSTRUCTIONS 7 | 100XP 8 | Import normalize from sklearn.preprocessing. 9 | Apply the normalize() function to nmf_features. Store the result as norm_features. 10 | Create a DataFrame df from norm_features, using titles as an index. 11 | Use the .loc[] accessor of df to select the row of 'Cristiano Ronaldo'. Assign the result to article. 12 | Apply the .dot() method of df to article to calculate the cosine similarity of every row with article. 13 | Print the result of the .nlargest() method of similarities to display the most similiar articles. This has been done for you, so hit 'Submit Answer' to see the result! 
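Why this works: after normalize(), every row of norm_features has unit length, so the dot products computed below are exactly the cosine similarities between articles. As an optional cross-check (not part of the exercise), scikit-learn's cosine_similarity applied to the raw NMF features should reproduce the same numbers:

from sklearn.metrics.pairwise import cosine_similarity

# sims[i, j] equals the dot product of the normalized rows i and j,
# i.e. the values in the 'similarities' Series computed below.
sims = cosine_similarity(nmf_features)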
14 | ''' 15 | # Perform the necessary imports 16 | import pandas as pd 17 | from sklearn.preprocessing import normalize 18 | 19 | # Normalize the NMF features: norm_features 20 | norm_features = normalize(nmf_features) 21 | 22 | # Create a DataFrame: df 23 | df = pd.DataFrame(norm_features, index=titles) 24 | 25 | # Select the row corresponding to 'Cristiano Ronaldo': article 26 | article = df.loc['Cristiano Ronaldo'] 27 | 28 | # Compute the dot products: similarities 29 | similarities = df.dot(article) 30 | 31 | # Display those with the largest cosine similarity 32 | print(similarities.nlargest()) -------------------------------------------------------------------------------- /src/ml-unsupervised/04-discovering-interpretable-features/08-recommend-musical-artists-part-1.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Recommend musical artists part I 3 | 4 | In this exercise and the next, you'll use what you've learned about NMF to recommend popular music artists! You are given a sparse array artists whose rows correspond to artists and whose column correspond to users. The entries give the number of times each artist was listened to by each user. 5 | 6 | In this exercise, build a pipeline and transform the array into normalized NMF features. The first step in the pipeline, MaxAbsScaler, transforms the data so that all users have the same influence on the model, regardless of how many different artists they've listened to. In the next exercise, you'll use the resulting normalized NMF features for recommendation! 7 | 8 | This data is part of a larger dataset available here. 9 | 10 | INSTRUCTIONS 11 | 100XP 12 | Import: 13 | NMF from sklearn.decomposition. 14 | Normalizer and MaxAbsScaler from sklearn.preprocessing. 15 | make_pipeline from sklearn.pipeline. 16 | Create an instance of MaxAbsScaler called scaler. 17 | Create an NMF instance with 20 components called nmf. 18 | Create an instance of Normalizer called normalizer. 19 | Create a pipeline called pipeline that chains together scaler, nmf, and normalizer. 20 | Apply the .fit_transform() method of pipeline to artists. Assign the result to norm_features. 21 | ''' 22 | # Perform the necessary imports 23 | from sklearn.decomposition import NMF 24 | from sklearn.preprocessing import Normalizer, MaxAbsScaler 25 | from sklearn.pipeline import make_pipeline 26 | 27 | # Create a MaxAbsScaler: scaler 28 | scaler = MaxAbsScaler() 29 | 30 | # Create an NMF model: nmf 31 | nmf = NMF(n_components=20) 32 | 33 | # Create a Normalizer: normalizer 34 | normalizer = Normalizer() 35 | 36 | # Create a pipeline: pipeline 37 | pipeline = make_pipeline(scaler, nmf, normalizer) 38 | 39 | # Apply fit_transform to artists: norm_features 40 | norm_features = pipeline.fit_transform(artists) 41 | -------------------------------------------------------------------------------- /src/ml-unsupervised/04-discovering-interpretable-features/09-recommend-musical-artists-part-2.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Recommend musical artists part II 3 | 4 | Suppose you were a big fan of Bruce Springsteen - which other musicial artists might you like? Use your NMF features from the previous exercise and the cosine similarity to find similar musical artists. A solution to the previous exercise has been run, so norm_features is an array containing the normalized NMF features as rows. The names of the musical artists are available as the list artist_names. 
5 | 6 | INSTRUCTIONS 7 | 100XP 8 | Import pandas as pd. 9 | Create a DataFrame df from norm_features, using artist_names as an index. 10 | Use the .loc[] accessor of df to select the row of 'Bruce Springsteen'. Assign the result to artist. 11 | Apply the .dot() method of df to artist to calculate the dot product of every row with artist. Save the result as similarities. 12 | Print the result of the .nlargest() method of similarities to display the artists most similar to 'Bruce Springsteen'. 13 | ''' 14 | # Import pandas 15 | import pandas as pd 16 | 17 | # Create a DataFrame: df 18 | df = pd.DataFrame(norm_features, index=artist_names) 19 | 20 | # Select row of 'Bruce Springsteen': artist 21 | artist = df.loc['Bruce Springsteen'] 22 | 23 | # Compute cosine similarities: similarities 24 | similarities = df.dot(artist) 25 | 26 | # Display those with highest cosine similarity 27 | print(similarities.nlargest()) 28 | -------------------------------------------------------------------------------- /src/ml-unsupervised/04-discovering-interpretable-features/ch4_slides.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saranshbansal/data-science-with-python/651e67d93f8aa65ce386978684523f7e14cd3b2a/src/ml-unsupervised/04-discovering-interpretable-features/ch4_slides.pdf -------------------------------------------------------------------------------- /src/ml-unsupervised/04-discovering-interpretable-features/chapter-details.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saranshbansal/data-science-with-python/651e67d93f8aa65ce386978684523f7e14cd3b2a/src/ml-unsupervised/04-discovering-interpretable-features/chapter-details.png -------------------------------------------------------------------------------- /src/ml-unsupervised/04-discovering-interpretable-features/chapter-details.rtf: -------------------------------------------------------------------------------- 1 | {\rtf1\ansi\ansicpg1252\cocoartf1561\cocoasubrtf200 2 | {\fonttbl\f0\fswiss\fcharset0 Helvetica;} 3 | {\colortbl;\red255\green255\blue255;\red44\green44\blue44;\red255\green255\blue255;} 4 | {\*\expandedcolortbl;;\cssrgb\c22745\c22745\c22745;\cssrgb\c100000\c100000\c100000;} 5 | \margl1440\margr1440\vieww10800\viewh8400\viewkind0 6 | \deftab720 7 | \pard\pardeftab720\partightenfactor0 8 | 9 | \f0\fs30 \cf2 \cb3 \expnd0\expndtw0\kerning0 10 | In this chapter, you'll learn about a dimension reduction technique called "Non-negative matrix factorization" ("NMF") that expresses samples as combinations of interpretable parts. For example, it expresses documents as combinations of topics, and images in terms of commonly occurring visual patterns. 
You'll also learn to use NMF to build recommender systems that can find you similar articles to read, or musical artists that match your listening history!} -------------------------------------------------------------------------------- /src/ml-unsupervised/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saranshbansal/data-science-with-python/651e67d93f8aa65ce386978684523f7e14cd3b2a/src/ml-unsupervised/__init__.py -------------------------------------------------------------------------------- /src/ml-unsupervised/course-description.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saranshbansal/data-science-with-python/651e67d93f8aa65ce386978684523f7e14cd3b2a/src/ml-unsupervised/course-description.png -------------------------------------------------------------------------------- /src/ml-unsupervised/k-means_clustering.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from sklearn.cluster import KMeans 4 | 5 | from helper import path 6 | 7 | # Read the CSV file into a DataFrame: df 8 | df = pd.read_csv(path + 'data_1024.csv', sep='\t') 9 | 10 | f1 = df['Distance_Feature'].values 11 | f2 = df['Speeding_Feature'].values 12 | 13 | # Stack the two features into an (n_samples, 2) array; np.matrix(zip(...)) fails on Python 3 because zip returns a lazy iterator 14 | X = np.column_stack((f1, f2)) 15 | kmeans = KMeans(n_clusters=2).fit(X) 16 | -------------------------------------------------------------------------------- /src/python_core/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saranshbansal/data-science-with-python/651e67d93f8aa65ce386978684523f7e14cd3b2a/src/python_core/__init__.py -------------------------------------------------------------------------------- /src/python_core/output_questions/1.py: -------------------------------------------------------------------------------- 1 | # what will be the output? 2 | 3 | n = [1, 2, 3, 4, 5, 10, 3, 100, 9, 24] 4 | 5 | n1 = [i for i in n if i > 5] 6 | 7 | for e in n: 8 | print('inter-> ') 9 | if e < 5: 10 | print('removing: {}'.format(e)) 11 | n.remove(e) 12 | print('list after removal: {}'.format(n)) 13 | 14 | print(n) 15 | print(n1) 16 | -------------------------------------------------------------------------------- /src/python_core/output_questions/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saranshbansal/data-science-with-python/651e67d93f8aa65ce386978684523f7e14cd3b2a/src/python_core/output_questions/__init__.py --------------------------------------------------------------------------------
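A note on src/python_core/output_questions/1.py above: the flattened listing drops indentation, so assume the conventional layout (the 'removing' print and n.remove(e) inside the if block, the final two prints outside the loop). Because the list is mutated while it is being iterated, each removal shifts the remaining items one slot to the left and the iterator skips the element that slides into the freed position, so 2 and 4 are never examined even though they are below 5. Under that assumption the script ends with n == [2, 4, 5, 10, 100, 9, 24] and n1 == [10, 100, 9, 24]. A safer idiom is to build a new filtered list instead of removing in place:

n = [1, 2, 3, 4, 5, 10, 3, 100, 9, 24]
n = [i for i in n if i >= 5]   # keep everything the loop intended to keep
print(n)                       # [5, 10, 100, 9, 24]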