├── .gitignore ├── README.md ├── _datasets ├── Action_words.csv ├── Cab Charges May.xlsx ├── Chinook.sqlite ├── Information_gain_job_advertisements.csv ├── Interactions.csv ├── LIGO_data.hdf5 ├── NEONDS.hdf5 ├── Shift Roster.xlsx ├── Trump Tweets(2017).xlsx ├── WDIData_min.csv ├── airline.sas7bdat ├── albeck_gene_expression.mat ├── amis.csv ├── auto.csv ├── battledeath.xlsx ├── boston.csv ├── cars.csv ├── company-stock-movements-2010-2015-incl.csv ├── data.pk1 ├── data_1024.csv ├── diabetes.csv ├── digits.csv ├── disarea.dta ├── eurovision-2016.csv ├── fish.csv ├── gm_2008_region.csv ├── house-votes-84.csv ├── ind_pop_data.csv ├── industries.json ├── lcd-digits.csv ├── mnist_kaggle_some_rows.csv ├── moby_dick.txt ├── sales.sas7bdat ├── seaslug.txt ├── seeds-width-vs-length.csv ├── seeds.csv ├── titanic.csv ├── titanic_corrupt.txt ├── tweets.csv ├── white-wine.csv ├── wine.csv └── winequality-red.csv ├── certifications ├── Datacamp_course1.pdf ├── Datacamp_course17.pdf ├── Datacamp_course18.pdf ├── Datacamp_course2.pdf ├── Datacamp_course3.pdf ├── Datacamp_course4.pdf └── Datacamp_course5.pdf ├── notes ├── Datacamp │ ├── Data Science With Python Course 5.docx │ ├── Iterators and Generators-Python.docx │ ├── K-Means Clustering.rtf │ └── Regression Models.txt ├── Google ML │ ├── Google's ML Crash Course Notes.docx │ └── Tensorflow Estimators - documentation.docx └── R │ ├── Exploratory Data Analysis Dimension Reduction.txt │ ├── Probability.txt │ ├── Regression Models.txt │ ├── Rplot.png │ ├── Rplot01.png │ ├── Rplot02.png │ ├── Rplot03.png │ ├── Rplot04.png │ ├── Rplot05.png │ ├── Statistial Interference.txt │ ├── dendo1.png │ ├── prob-1.png │ └── ways to cluster.txt └── src ├── __pycache__ └── helper.cpython-36.pyc ├── case_studies ├── case_study_1.1.py ├── case_study_1.2.py ├── case_study_1.3.py ├── case_study_pipelining_and_scaling.py ├── case_study_trumps_twitter_RTs.py ├── case_study_urban_population_trends.py ├── case_study_webscraping_imdb.py └── top250names.txt ├── core ├── py_comprehensions.py ├── py_enumeration_example.py ├── py_filter_example.py ├── py_generators.py ├── py_iterable_and_iterator.py └── py_regex.py ├── db ├── __init__.py ├── py_mongo_integration.py ├── py_sql.py └── py_sql_with_pandas.py ├── file_operations ├── py_corrupt_file_read.py ├── py_default_file_read_1.py ├── py_numpy_file_read_1.py ├── py_numpy_file_read_2.py ├── py_pandas_excel_read.py ├── py_pandas_file_read_1.py ├── py_pandas_read_csv.py ├── py_pickle_read_test.py ├── py_read_hdf5_file.py ├── py_read_matlab_file.py ├── py_read_sas_file.py ├── py_read_stata_file.py ├── py_test_loops_algo.py ├── read_in_chunks.py └── read_tweets.py ├── grains_data_from_dataset.py ├── helper.py ├── misc ├── __init__.py ├── py_test_loops_algo.py ├── py_zip_example.py ├── random.py └── tensorflow_starter.py ├── ml-supervised ├── __init__.py ├── course-description.png ├── course-description.rtf ├── k-fold_cross_validation.py ├── ml_centering_and_scaling.py ├── ml_manually_remove_missing_data.py ├── ml_pipeline_with_hyperparameters.py ├── ml_pipelines.py ├── py_hyperparamter_tuning_hold-out_set_with_GridSearchCV-1.py ├── py_hyperparamter_tuning_hold-out_set_with_GridSearchCV-2.py ├── py_hyperparamter_tuning_with_GridSearchCV.py ├── py_hyperparamter_tuning_with_RandomizedSearchCV.py ├── py_knn_classifier_modal.py ├── py_knn_classifier_modal_train_test.py ├── py_knn_classifiers_performance_metrics.py ├── py_lasso_regularized_linear_regression.py ├── py_linear_regression_modal.py ├── py_linear_regression_modal_train_test.py ├── 
py_logistic_regression_modal.py ├── py_ridge_regularized_linear_regression.py └── py_sklearn_digits_dataset.py ├── ml-unsupervised ├── 01-clustering-for-dataset-exploration │ ├── 01-how-many-clusters.py │ ├── 02-clustering-2d-points.py │ ├── 03-inspect-your-clustering.py │ ├── 04-how-many-clusters-of-grain.py │ ├── 05-evaluating-the-grain-clustering.py │ ├── 06-07-scaling-&-clustering-the-fish-data.py │ ├── 08-09-scaling-&-clustering-which-stocks-move-together.py │ ├── ch1_slides.pdf │ ├── chapter-details.png │ └── chapter-details.rtf ├── 02-visualization-with-hierarchical-clustering-and-t-sne │ ├── 01-hierarchical-clustering-of-the-grain-data.py │ ├── 02-hierarchies-of-stocks.py │ ├── 03-different-linkage-different-hierarchical-clustering.py │ ├── 04-extracting-the-cluster-labels.py │ ├── 05-tsne-visualization-of-grain-dataset.py │ ├── 06-a-tsne-map-of-the-stock-market.py │ ├── ch2_slides.pdf │ ├── chapter-details.png │ └── chapter-details.rtf ├── 03-decorrelating-your-data-and-dimension-reduction │ ├── 01-correlated-data-in-nature.py │ ├── 02-decorrelating-the-grain-measurements-with-pca.py │ ├── 03-the-first-principal-component.py │ ├── 04-variance-of-the-pca-features.py │ ├── 05-dimension-reduction-of-the-fish-measuremenys.py │ ├── 06-a-tfidf-word-frequency-array.py │ ├── 07-clustering-wikipedia-part-1.py │ ├── 08-clustering-wikipedia-part-2.py │ ├── ch3_slides.pdf │ ├── chapter-details.png │ └── chapter-details.rtf ├── 04-discovering-interpretable-features │ ├── 01-nmf-applied-to-wikipedia-articles.py │ ├── 02-nmf-features-of-the-wikipedia-articles.py │ ├── 03-nmf-learns-topics-of-documents.py │ ├── 04-explore-the-led-digits-dataset.py │ ├── 05-nmf-learns-the-parts-of-images.py │ ├── 06-pca-doesnt-learn-parts.py │ ├── 07-which-articles-are-similar-to-cristiano-ronaldo.py │ ├── 08-recommend-musical-artists-part-1.py │ ├── 09-recommend-musical-artists-part-2.py │ ├── ch4_slides.pdf │ ├── chapter-details.png │ └── chapter-details.rtf ├── __init__.py ├── course-description.png └── k-means_clustering.py └── python_core ├── __init__.py └── output_questions ├── 1.py └── __init__.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.idea 2 | **/venv 3 | **/.idea -------------------------------------------------------------------------------- /_datasets/Action_words.csv: -------------------------------------------------------------------------------- 1 | Accomplished 2 | Achieved 3 | Adapted 4 | Arranged 5 | Attained 6 | Built 7 | Captured 8 | Commandeered 9 | Completed 10 | Converted 11 | Crafted 12 | Created 13 | Cut 14 | Delivered 15 | Demonstrated 16 | Designed 17 | Developed 18 | Devised 19 | Directed 20 | Distributed 21 | Doubled 22 | Drove 23 | Earned 24 | Eliminated 25 | Encouraged 26 | Enforced 27 | Engineered 28 | Ensured 29 | Established 30 | Expanded 31 | Expedited 32 | Founded 33 | Generated 34 | Guided 35 | Identified 36 | Implemented 37 | Improved 38 | Improvised 39 | Increased 40 | Initiated 41 | Inspired 42 | Installed 43 | Instigated 44 | Instructed 45 | Interpreted 46 | Introduced 47 | Launched 48 | Led 49 | Liaised 50 | Modernized 51 | Motivated 52 | Negotiated 53 | Organized 54 | Promoted 55 | Redesigned 56 | Revitalized 57 | Started 58 | Streamlined 59 | Strengthened 60 | Structured 61 | Supervised 62 | Transformed 63 | Uncovered 64 | Widened 65 | Won -------------------------------------------------------------------------------- /_datasets/Cab Charges May.xlsx: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/saranshbansal/data-science-with-python/651e67d93f8aa65ce386978684523f7e14cd3b2a/_datasets/Cab Charges May.xlsx -------------------------------------------------------------------------------- /_datasets/Chinook.sqlite: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saranshbansal/data-science-with-python/651e67d93f8aa65ce386978684523f7e14cd3b2a/_datasets/Chinook.sqlite -------------------------------------------------------------------------------- /_datasets/LIGO_data.hdf5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saranshbansal/data-science-with-python/651e67d93f8aa65ce386978684523f7e14cd3b2a/_datasets/LIGO_data.hdf5 -------------------------------------------------------------------------------- /_datasets/NEONDS.hdf5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saranshbansal/data-science-with-python/651e67d93f8aa65ce386978684523f7e14cd3b2a/_datasets/NEONDS.hdf5 -------------------------------------------------------------------------------- /_datasets/Shift Roster.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saranshbansal/data-science-with-python/651e67d93f8aa65ce386978684523f7e14cd3b2a/_datasets/Shift Roster.xlsx -------------------------------------------------------------------------------- /_datasets/Trump Tweets(2017).xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saranshbansal/data-science-with-python/651e67d93f8aa65ce386978684523f7e14cd3b2a/_datasets/Trump Tweets(2017).xlsx -------------------------------------------------------------------------------- /_datasets/airline.sas7bdat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saranshbansal/data-science-with-python/651e67d93f8aa65ce386978684523f7e14cd3b2a/_datasets/airline.sas7bdat -------------------------------------------------------------------------------- /_datasets/albeck_gene_expression.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saranshbansal/data-science-with-python/651e67d93f8aa65ce386978684523f7e14cd3b2a/_datasets/albeck_gene_expression.mat -------------------------------------------------------------------------------- /_datasets/battledeath.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saranshbansal/data-science-with-python/651e67d93f8aa65ce386978684523f7e14cd3b2a/_datasets/battledeath.xlsx -------------------------------------------------------------------------------- /_datasets/cars.csv: -------------------------------------------------------------------------------- 1 | Unnamed: 0 cars_per_cap country drives_right 2 | 0 US 809 United States True 3 | 1 AUS 731 Australia False 4 | 2 JAP 588 Japan False 5 | 3 IN 18 India False 6 | 4 RU 200 Russia True 7 | 5 MOR 70 Morocco True 8 | 6 EG 45 Egypt True -------------------------------------------------------------------------------- /_datasets/data.pk1: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/saranshbansal/data-science-with-python/651e67d93f8aa65ce386978684523f7e14cd3b2a/_datasets/data.pk1 -------------------------------------------------------------------------------- /_datasets/disarea.dta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saranshbansal/data-science-with-python/651e67d93f8aa65ce386978684523f7e14cd3b2a/_datasets/disarea.dta -------------------------------------------------------------------------------- /_datasets/fish.csv: -------------------------------------------------------------------------------- 1 | Bream,242.0,23.2,25.4,30.0,38.4,13.4 2 | Bream,290.0,24.0,26.3,31.2,40.0,13.8 3 | Bream,340.0,23.9,26.5,31.1,39.8,15.1 4 | Bream,363.0,26.3,29.0,33.5,38.0,13.3 5 | Bream,430.0,26.5,29.0,34.0,36.6,15.1 6 | Bream,450.0,26.8,29.7,34.7,39.2,14.2 7 | Bream,500.0,26.8,29.7,34.5,41.1,15.3 8 | Bream,390.0,27.6,30.0,35.0,36.2,13.4 9 | Bream,450.0,27.6,30.0,35.1,39.9,13.8 10 | Bream,500.0,28.5,30.7,36.2,39.3,13.7 11 | Bream,475.0,28.4,31.0,36.2,39.4,14.1 12 | Bream,500.0,28.7,31.0,36.2,39.7,13.3 13 | Bream,500.0,29.1,31.5,36.4,37.8,12.0 14 | Bream,600.0,29.4,32.0,37.2,40.2,13.9 15 | Bream,600.0,29.4,32.0,37.2,41.5,15.0 16 | Bream,700.0,30.4,33.0,38.3,38.8,13.8 17 | Bream,700.0,30.4,33.0,38.5,38.8,13.5 18 | Bream,610.0,30.9,33.5,38.6,40.5,13.3 19 | Bream,650.0,31.0,33.5,38.7,37.4,14.8 20 | Bream,575.0,31.3,34.0,39.5,38.3,14.1 21 | Bream,685.0,31.4,34.0,39.2,40.8,13.7 22 | Bream,620.0,31.5,34.5,39.7,39.1,13.3 23 | Bream,680.0,31.8,35.0,40.6,38.1,15.1 24 | Bream,700.0,31.9,35.0,40.5,40.1,13.8 25 | Bream,725.0,31.8,35.0,40.9,40.0,14.8 26 | Bream,720.0,32.0,35.0,40.6,40.3,15.0 27 | Bream,714.0,32.7,36.0,41.5,39.8,14.1 28 | Bream,850.0,32.8,36.0,41.6,40.6,14.9 29 | Bream,1000.0,33.5,37.0,42.6,44.5,15.5 30 | Bream,920.0,35.0,38.5,44.1,40.9,14.3 31 | Bream,955.0,35.0,38.5,44.0,41.1,14.3 32 | Bream,925.0,36.2,39.5,45.3,41.4,14.9 33 | Bream,975.0,37.4,41.0,45.9,40.6,14.7 34 | Bream,950.0,38.0,41.0,46.5,37.9,13.7 35 | Roach,40.0,12.9,14.1,16.2,25.6,14.0 36 | Roach,69.0,16.5,18.2,20.3,26.1,13.9 37 | Roach,78.0,17.5,18.8,21.2,26.3,13.7 38 | Roach,87.0,18.2,19.8,22.2,25.3,14.3 39 | Roach,120.0,18.6,20.0,22.2,28.0,16.1 40 | Roach,0.0,19.0,20.5,22.8,28.4,14.7 41 | Roach,110.0,19.1,20.8,23.1,26.7,14.7 42 | Roach,120.0,19.4,21.0,23.7,25.8,13.9 43 | Roach,150.0,20.4,22.0,24.7,23.5,15.2 44 | Roach,145.0,20.5,22.0,24.3,27.3,14.6 45 | Roach,160.0,20.5,22.5,25.3,27.8,15.1 46 | Roach,140.0,21.0,22.5,25.0,26.2,13.3 47 | Roach,160.0,21.1,22.5,25.0,25.6,15.2 48 | Roach,169.0,22.0,24.0,27.2,27.7,14.1 49 | Roach,161.0,22.0,23.4,26.7,25.9,13.6 50 | Roach,200.0,22.1,23.5,26.8,27.6,15.4 51 | Roach,180.0,23.6,25.2,27.9,25.4,14.0 52 | Roach,290.0,24.0,26.0,29.2,30.4,15.4 53 | Roach,272.0,25.0,27.0,30.6,28.0,15.6 54 | Roach,390.0,29.5,31.7,35.0,27.1,15.3 55 | Smelt,6.7,9.3,9.8,10.8,16.1,9.7 56 | Smelt,7.5,10.0,10.5,11.6,17.0,10.0 57 | Smelt,7.0,10.1,10.6,11.6,14.9,9.9 58 | Smelt,9.7,10.4,11.0,12.0,18.3,11.5 59 | Smelt,9.8,10.7,11.2,12.4,16.8,10.3 60 | Smelt,8.7,10.8,11.3,12.6,15.7,10.2 61 | Smelt,10.0,11.3,11.8,13.1,16.9,9.8 62 | Smelt,9.9,11.3,11.8,13.1,16.9,8.9 63 | Smelt,9.8,11.4,12.0,13.2,16.7,8.7 64 | Smelt,12.2,11.5,12.2,13.4,15.6,10.4 65 | Smelt,13.4,11.7,12.4,13.5,18.0,9.4 66 | Smelt,12.2,12.1,13.0,13.8,16.5,9.1 67 | Smelt,19.7,13.2,14.3,15.2,18.9,13.6 68 | Smelt,19.9,13.8,15.0,16.2,18.1,11.6 69 | Pike,200.0,30.0,32.3,34.8,16.0,9.7 70 | Pike,300.0,31.7,34.0,37.8,15.1,11.0 71 | 
Pike,300.0,32.7,35.0,38.8,15.3,11.3 72 | Pike,300.0,34.8,37.3,39.8,15.8,10.1 73 | Pike,430.0,35.5,38.0,40.5,18.0,11.3 74 | Pike,345.0,36.0,38.5,41.0,15.6,9.7 75 | Pike,456.0,40.0,42.5,45.5,16.0,9.5 76 | Pike,510.0,40.0,42.5,45.5,15.0,9.8 77 | Pike,540.0,40.1,43.0,45.8,17.0,11.2 78 | Pike,500.0,42.0,45.0,48.0,14.5,10.2 79 | Pike,567.0,43.2,46.0,48.7,16.0,10.0 80 | Pike,770.0,44.8,48.0,51.2,15.0,10.5 81 | Pike,950.0,48.3,51.7,55.1,16.2,11.2 82 | Pike,1250.0,52.0,56.0,59.7,17.9,11.7 83 | Pike,1600.0,56.0,60.0,64.0,15.0,9.6 84 | Pike,1550.0,56.0,60.0,64.0,15.0,9.6 85 | Pike,1650.0,59.0,63.4,68.0,15.9,11.0 86 | -------------------------------------------------------------------------------- /_datasets/moby_dick.txt: -------------------------------------------------------------------------------- 1 | CHAPTER 1. Loomings. 2 | 3 | Call me Ishmael. Some years ago--never mind how long precisely--having 4 | little or no money in my purse, and nothing particular to interest me on 5 | shore, I thought I would sail about a little and see the watery part of 6 | the world. It is a way I have of driving off the spleen and regulating 7 | the circulation. Whenever I find myself growing grim about the mouth; 8 | whenever it is a damp, drizzly November in my soul; whenever I find 9 | myself involuntarily pausing before coffin warehouses, and bringing up 10 | the rear of every funeral I meet; and especially whenever my hypos get 11 | such an upper hand of me, that it requires a strong moral principle to 12 | prevent me from deliberately stepping into the street, and methodically 13 | knocking people's hats off--then, I account it high time to get to sea 14 | as soon as I can. This is my substitute for pistol and ball. With a 15 | philosophical flourish Cato throws himself upon his sword; I quietly 16 | take to the ship. There is nothing surprising in this. If they but knew 17 | it, almost all men in their degree, some time or other, cherish very 18 | nearly the same feelings towards the ocean with me. 
-------------------------------------------------------------------------------- /_datasets/sales.sas7bdat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saranshbansal/data-science-with-python/651e67d93f8aa65ce386978684523f7e14cd3b2a/_datasets/sales.sas7bdat -------------------------------------------------------------------------------- /_datasets/seaslug.txt: -------------------------------------------------------------------------------- 1 | Time Percent 2 | 99 0.067 3 | 99 0.133 4 | 99 0.067 5 | 99 0 6 | 99 0 7 | 0 0.5 8 | 0 0.467 9 | 0 0.857 10 | 0 0.5 11 | 0 0.357 12 | 0 0.533 13 | 5 0.467 14 | 5 0.467 15 | 5 0.125 16 | 5 0.4 17 | 5 0.214 18 | 5 0.4 19 | 10 0.067 20 | 10 0.067 21 | 10 0.333 22 | 10 0.333 23 | 10 0.133 24 | 10 0.133 25 | 15 0.267 26 | 15 0.286 27 | 15 0.333 28 | 15 0.214 29 | 15 0 30 | 15 0 31 | 20 0.267 32 | 20 0.2 33 | 20 0.267 34 | 20 0.437 35 | 20 0.077 36 | 20 0.067 37 | 25 0.133 38 | 25 0.267 39 | 25 0.412 40 | 25 0 41 | 25 0.067 42 | 25 0.133 43 | 30 0 44 | 30 0.071 45 | 30 0 46 | 30 0.067 47 | 30 0.067 48 | 30 0.133 -------------------------------------------------------------------------------- /_datasets/seeds-width-vs-length.csv: -------------------------------------------------------------------------------- 1 | 3.312,5.763 2 | 3.333,5.554 3 | 3.337,5.291 4 | 3.379,5.324 5 | 3.562,5.658 6 | 3.312,5.386 7 | 3.259,5.563 8 | 3.302,5.42 9 | 3.465,6.053 10 | 3.505,5.884 11 | 3.242,5.714 12 | 3.201,5.438 13 | 3.199,5.439 14 | 3.156,5.479 15 | 3.114,5.482 16 | 3.333,5.351 17 | 3.383,5.119 18 | 3.514,5.527 19 | 3.466,5.205 20 | 3.049,5.226 21 | 3.129,5.658 22 | 3.168,5.52 23 | 3.507,5.618 24 | 2.936,5.099 25 | 3.245,5.789 26 | 3.421,5.833 27 | 3.026,5.395 28 | 2.956,5.395 29 | 3.221,5.541 30 | 3.065,5.516 31 | 2.975,5.454 32 | 3.371,5.757 33 | 3.186,5.717 34 | 3.15,5.585 35 | 3.328,5.712 36 | 3.485,5.709 37 | 3.464,5.826 38 | 3.683,5.832 39 | 3.288,5.656 40 | 3.298,5.397 41 | 3.156,5.348 42 | 3.158,5.351 43 | 3.201,5.138 44 | 3.396,5.877 45 | 3.462,5.579 46 | 3.155,5.376 47 | 3.393,5.701 48 | 3.377,5.57 49 | 3.291,5.545 50 | 3.258,5.678 51 | 3.272,5.585 52 | 3.434,5.674 53 | 3.113,5.715 54 | 3.199,5.504 55 | 3.113,5.741 56 | 3.212,5.702 57 | 3.377,5.388 58 | 3.412,5.384 59 | 3.419,5.662 60 | 3.032,5.159 61 | 2.85,5.008 62 | 2.879,4.902 63 | 3.042,5.076 64 | 3.07,5.395 65 | 3.026,5.262 66 | 3.119,5.139 67 | 3.19,5.63 68 | 3.158,5.609 69 | 3.153,5.569 70 | 2.882,5.412 71 | 3.561,6.191 72 | 3.484,5.998 73 | 3.594,5.978 74 | 3.93,6.154 75 | 3.486,6.017 76 | 3.438,5.927 77 | 3.403,6.064 78 | 3.814,6.579 79 | 3.639,6.445 80 | 3.566,5.85 81 | 3.467,5.875 82 | 3.857,6.006 83 | 3.864,6.285 84 | 3.772,6.384 85 | 3.801,6.366 86 | 3.651,6.173 87 | 3.764,6.084 88 | 3.67,6.549 89 | 4.033,6.573 90 | 4.032,6.45 91 | 3.785,6.581 92 | 3.796,6.172 93 | 3.693,6.272 94 | 3.86,6.037 95 | 3.485,6.666 96 | 3.463,6.139 97 | 3.81,6.341 98 | 3.552,6.449 99 | 3.512,6.271 100 | 3.684,6.219 101 | 3.525,5.718 102 | 3.694,5.89 103 | 3.892,6.113 104 | 3.681,6.369 105 | 3.755,6.248 106 | 3.786,6.037 107 | 3.806,6.152 108 | 3.573,6.033 109 | 3.763,6.675 110 | 3.674,6.153 111 | 3.769,6.107 112 | 3.791,6.303 113 | 3.902,6.183 114 | 3.737,6.259 115 | 3.991,6.563 116 | 3.719,6.416 117 | 3.897,6.051 118 | 3.815,6.245 119 | 3.769,6.227 120 | 3.857,6.493 121 | 3.962,6.315 122 | 3.563,6.059 123 | 3.387,5.762 124 | 3.771,5.98 125 | 3.582,5.363 126 | 3.869,6.111 127 | 3.594,6.285 128 | 3.687,5.979 129 | 3.773,6.513 130 | 3.69,5.791 131 | 
3.755,5.979 132 | 3.825,6.144 133 | 3.268,5.884 134 | 3.395,5.845 135 | 3.408,5.776 136 | 3.465,5.477 137 | 3.574,6.145 138 | 3.231,5.92 139 | 3.286,5.832 140 | 3.472,5.872 141 | 2.994,5.472 142 | 3.073,5.541 143 | 3.074,5.389 144 | 2.967,5.224 145 | 2.777,5.314 146 | 2.687,5.279 147 | 2.719,5.176 148 | 2.967,5.267 149 | 2.911,5.386 150 | 2.648,5.317 151 | 2.84,5.263 152 | 2.776,5.405 153 | 2.833,5.408 154 | 2.693,5.22 155 | 2.755,5.175 156 | 2.675,5.25 157 | 2.849,5.053 158 | 2.745,5.394 159 | 2.678,5.444 160 | 2.695,5.304 161 | 2.879,5.451 162 | 2.81,5.35 163 | 2.847,5.267 164 | 2.968,5.333 165 | 2.794,5.011 166 | 2.941,5.105 167 | 2.897,5.319 168 | 2.837,5.417 169 | 2.668,5.176 170 | 2.715,5.09 171 | 2.701,5.325 172 | 2.845,5.167 173 | 2.763,5.088 174 | 2.763,5.136 175 | 2.641,5.278 176 | 2.821,4.981 177 | 2.71,5.186 178 | 2.642,5.145 179 | 2.758,5.18 180 | 2.893,5.357 181 | 2.775,5.09 182 | 3.017,5.236 183 | 2.909,5.24 184 | 2.85,5.108 185 | 3.026,5.495 186 | 2.683,5.363 187 | 2.716,5.413 188 | 2.675,5.088 189 | 2.821,5.089 190 | 2.787,4.899 191 | 2.717,5.046 192 | 2.804,5.091 193 | 2.953,5.132 194 | 2.63,5.18 195 | 2.975,5.236 196 | 3.126,5.16 197 | 3.054,5.224 198 | 3.128,5.32 199 | 2.911,5.41 200 | 3.155,5.073 201 | 2.989,5.219 202 | 3.135,4.984 203 | 2.81,5.009 204 | 3.091,5.183 205 | 2.96,5.204 206 | 2.981,5.137 207 | 2.795,5.14 208 | 3.232,5.236 209 | 2.836,5.175 210 | 2.974,5.243 211 | -------------------------------------------------------------------------------- /_datasets/seeds.csv: -------------------------------------------------------------------------------- 1 | 15.26,14.84,0.871,5.763,3.312,2.221,5.22,1 2 | 14.88,14.57,0.8811,5.554,3.333,1.018,4.956,1 3 | 14.29,14.09,0.905,5.291,3.337,2.699,4.825,1 4 | 13.84,13.94,0.8955,5.324,3.379,2.259,4.805,1 5 | 16.14,14.99,0.9034,5.658,3.562,1.355,5.175,1 6 | 14.38,14.21,0.8951,5.386,3.312,2.462,4.956,1 7 | 14.69,14.49,0.8799,5.563,3.259,3.586,5.219,1 8 | 14.11,14.1,0.8911,5.42,3.302,2.7,5,1 9 | 16.63,15.46,0.8747,6.053,3.465,2.04,5.877,1 10 | 16.44,15.25,0.888,5.884,3.505,1.969,5.533,1 11 | 15.26,14.85,0.8696,5.714,3.242,4.543,5.314,1 12 | 14.03,14.16,0.8796,5.438,3.201,1.717,5.001,1 13 | 13.89,14.02,0.888,5.439,3.199,3.986,4.738,1 14 | 13.78,14.06,0.8759,5.479,3.156,3.136,4.872,1 15 | 13.74,14.05,0.8744,5.482,3.114,2.932,4.825,1 16 | 14.59,14.28,0.8993,5.351,3.333,4.185,4.781,1 17 | 13.99,13.83,0.9183,5.119,3.383,5.234,4.781,1 18 | 15.69,14.75,0.9058,5.527,3.514,1.599,5.046,1 19 | 14.7,14.21,0.9153,5.205,3.466,1.767,4.649,1 20 | 12.72,13.57,0.8686,5.226,3.049,4.102,4.914,1 21 | 14.16,14.4,0.8584,5.658,3.129,3.072,5.176,1 22 | 14.11,14.26,0.8722,5.52,3.168,2.688,5.219,1 23 | 15.88,14.9,0.8988,5.618,3.507,0.7651,5.091,1 24 | 12.08,13.23,0.8664,5.099,2.936,1.415,4.961,1 25 | 15.01,14.76,0.8657,5.789,3.245,1.791,5.001,1 26 | 16.19,15.16,0.8849,5.833,3.421,0.903,5.307,1 27 | 13.02,13.76,0.8641,5.395,3.026,3.373,4.825,1 28 | 12.74,13.67,0.8564,5.395,2.956,2.504,4.869,1 29 | 14.11,14.18,0.882,5.541,3.221,2.754,5.038,1 30 | 13.45,14.02,0.8604,5.516,3.065,3.531,5.097,1 31 | 13.16,13.82,0.8662,5.454,2.975,0.8551,5.056,1 32 | 15.49,14.94,0.8724,5.757,3.371,3.412,5.228,1 33 | 14.09,14.41,0.8529,5.717,3.186,3.92,5.299,1 34 | 13.94,14.17,0.8728,5.585,3.15,2.124,5.012,1 35 | 15.05,14.68,0.8779,5.712,3.328,2.129,5.36,1 36 | 16.12,15,0.9,5.709,3.485,2.27,5.443,1 37 | 16.2,15.27,0.8734,5.826,3.464,2.823,5.527,1 38 | 17.08,15.38,0.9079,5.832,3.683,2.956,5.484,1 39 | 14.8,14.52,0.8823,5.656,3.288,3.112,5.309,1 40 | 
14.28,14.17,0.8944,5.397,3.298,6.685,5.001,1 41 | 13.54,13.85,0.8871,5.348,3.156,2.587,5.178,1 42 | 13.5,13.85,0.8852,5.351,3.158,2.249,5.176,1 43 | 13.16,13.55,0.9009,5.138,3.201,2.461,4.783,1 44 | 15.5,14.86,0.882,5.877,3.396,4.711,5.528,1 45 | 15.11,14.54,0.8986,5.579,3.462,3.128,5.18,1 46 | 13.8,14.04,0.8794,5.376,3.155,1.56,4.961,1 47 | 15.36,14.76,0.8861,5.701,3.393,1.367,5.132,1 48 | 14.99,14.56,0.8883,5.57,3.377,2.958,5.175,1 49 | 14.79,14.52,0.8819,5.545,3.291,2.704,5.111,1 50 | 14.86,14.67,0.8676,5.678,3.258,2.129,5.351,1 51 | 14.43,14.4,0.8751,5.585,3.272,3.975,5.144,1 52 | 15.78,14.91,0.8923,5.674,3.434,5.593,5.136,1 53 | 14.49,14.61,0.8538,5.715,3.113,4.116,5.396,1 54 | 14.33,14.28,0.8831,5.504,3.199,3.328,5.224,1 55 | 14.52,14.6,0.8557,5.741,3.113,1.481,5.487,1 56 | 15.03,14.77,0.8658,5.702,3.212,1.933,5.439,1 57 | 14.46,14.35,0.8818,5.388,3.377,2.802,5.044,1 58 | 14.92,14.43,0.9006,5.384,3.412,1.142,5.088,1 59 | 15.38,14.77,0.8857,5.662,3.419,1.999,5.222,1 60 | 12.11,13.47,0.8392,5.159,3.032,1.502,4.519,1 61 | 11.42,12.86,0.8683,5.008,2.85,2.7,4.607,1 62 | 11.23,12.63,0.884,4.902,2.879,2.269,4.703,1 63 | 12.36,13.19,0.8923,5.076,3.042,3.22,4.605,1 64 | 13.22,13.84,0.868,5.395,3.07,4.157,5.088,1 65 | 12.78,13.57,0.8716,5.262,3.026,1.176,4.782,1 66 | 12.88,13.5,0.8879,5.139,3.119,2.352,4.607,1 67 | 14.34,14.37,0.8726,5.63,3.19,1.313,5.15,1 68 | 14.01,14.29,0.8625,5.609,3.158,2.217,5.132,1 69 | 14.37,14.39,0.8726,5.569,3.153,1.464,5.3,1 70 | 12.73,13.75,0.8458,5.412,2.882,3.533,5.067,1 71 | 17.63,15.98,0.8673,6.191,3.561,4.076,6.06,2 72 | 16.84,15.67,0.8623,5.998,3.484,4.675,5.877,2 73 | 17.26,15.73,0.8763,5.978,3.594,4.539,5.791,2 74 | 19.11,16.26,0.9081,6.154,3.93,2.936,6.079,2 75 | 16.82,15.51,0.8786,6.017,3.486,4.004,5.841,2 76 | 16.77,15.62,0.8638,5.927,3.438,4.92,5.795,2 77 | 17.32,15.91,0.8599,6.064,3.403,3.824,5.922,2 78 | 20.71,17.23,0.8763,6.579,3.814,4.451,6.451,2 79 | 18.94,16.49,0.875,6.445,3.639,5.064,6.362,2 80 | 17.12,15.55,0.8892,5.85,3.566,2.858,5.746,2 81 | 16.53,15.34,0.8823,5.875,3.467,5.532,5.88,2 82 | 18.72,16.19,0.8977,6.006,3.857,5.324,5.879,2 83 | 20.2,16.89,0.8894,6.285,3.864,5.173,6.187,2 84 | 19.57,16.74,0.8779,6.384,3.772,1.472,6.273,2 85 | 19.51,16.71,0.878,6.366,3.801,2.962,6.185,2 86 | 18.27,16.09,0.887,6.173,3.651,2.443,6.197,2 87 | 18.88,16.26,0.8969,6.084,3.764,1.649,6.109,2 88 | 18.98,16.66,0.859,6.549,3.67,3.691,6.498,2 89 | 21.18,17.21,0.8989,6.573,4.033,5.78,6.231,2 90 | 20.88,17.05,0.9031,6.45,4.032,5.016,6.321,2 91 | 20.1,16.99,0.8746,6.581,3.785,1.955,6.449,2 92 | 18.76,16.2,0.8984,6.172,3.796,3.12,6.053,2 93 | 18.81,16.29,0.8906,6.272,3.693,3.237,6.053,2 94 | 18.59,16.05,0.9066,6.037,3.86,6.001,5.877,2 95 | 18.36,16.52,0.8452,6.666,3.485,4.933,6.448,2 96 | 16.87,15.65,0.8648,6.139,3.463,3.696,5.967,2 97 | 19.31,16.59,0.8815,6.341,3.81,3.477,6.238,2 98 | 18.98,16.57,0.8687,6.449,3.552,2.144,6.453,2 99 | 18.17,16.26,0.8637,6.271,3.512,2.853,6.273,2 100 | 18.72,16.34,0.881,6.219,3.684,2.188,6.097,2 101 | 16.41,15.25,0.8866,5.718,3.525,4.217,5.618,2 102 | 17.99,15.86,0.8992,5.89,3.694,2.068,5.837,2 103 | 19.46,16.5,0.8985,6.113,3.892,4.308,6.009,2 104 | 19.18,16.63,0.8717,6.369,3.681,3.357,6.229,2 105 | 18.95,16.42,0.8829,6.248,3.755,3.368,6.148,2 106 | 18.83,16.29,0.8917,6.037,3.786,2.553,5.879,2 107 | 18.85,16.17,0.9056,6.152,3.806,2.843,6.2,2 108 | 17.63,15.86,0.88,6.033,3.573,3.747,5.929,2 109 | 19.94,16.92,0.8752,6.675,3.763,3.252,6.55,2 110 | 18.55,16.22,0.8865,6.153,3.674,1.738,5.894,2 111 | 
18.45,16.12,0.8921,6.107,3.769,2.235,5.794,2 112 | 19.38,16.72,0.8716,6.303,3.791,3.678,5.965,2 113 | 19.13,16.31,0.9035,6.183,3.902,2.109,5.924,2 114 | 19.14,16.61,0.8722,6.259,3.737,6.682,6.053,2 115 | 20.97,17.25,0.8859,6.563,3.991,4.677,6.316,2 116 | 19.06,16.45,0.8854,6.416,3.719,2.248,6.163,2 117 | 18.96,16.2,0.9077,6.051,3.897,4.334,5.75,2 118 | 19.15,16.45,0.889,6.245,3.815,3.084,6.185,2 119 | 18.89,16.23,0.9008,6.227,3.769,3.639,5.966,2 120 | 20.03,16.9,0.8811,6.493,3.857,3.063,6.32,2 121 | 20.24,16.91,0.8897,6.315,3.962,5.901,6.188,2 122 | 18.14,16.12,0.8772,6.059,3.563,3.619,6.011,2 123 | 16.17,15.38,0.8588,5.762,3.387,4.286,5.703,2 124 | 18.43,15.97,0.9077,5.98,3.771,2.984,5.905,2 125 | 15.99,14.89,0.9064,5.363,3.582,3.336,5.144,2 126 | 18.75,16.18,0.8999,6.111,3.869,4.188,5.992,2 127 | 18.65,16.41,0.8698,6.285,3.594,4.391,6.102,2 128 | 17.98,15.85,0.8993,5.979,3.687,2.257,5.919,2 129 | 20.16,17.03,0.8735,6.513,3.773,1.91,6.185,2 130 | 17.55,15.66,0.8991,5.791,3.69,5.366,5.661,2 131 | 18.3,15.89,0.9108,5.979,3.755,2.837,5.962,2 132 | 18.94,16.32,0.8942,6.144,3.825,2.908,5.949,2 133 | 15.38,14.9,0.8706,5.884,3.268,4.462,5.795,2 134 | 16.16,15.33,0.8644,5.845,3.395,4.266,5.795,2 135 | 15.56,14.89,0.8823,5.776,3.408,4.972,5.847,2 136 | 15.38,14.66,0.899,5.477,3.465,3.6,5.439,2 137 | 17.36,15.76,0.8785,6.145,3.574,3.526,5.971,2 138 | 15.57,15.15,0.8527,5.92,3.231,2.64,5.879,2 139 | 15.6,15.11,0.858,5.832,3.286,2.725,5.752,2 140 | 16.23,15.18,0.885,5.872,3.472,3.769,5.922,2 141 | 13.07,13.92,0.848,5.472,2.994,5.304,5.395,3 142 | 13.32,13.94,0.8613,5.541,3.073,7.035,5.44,3 143 | 13.34,13.95,0.862,5.389,3.074,5.995,5.307,3 144 | 12.22,13.32,0.8652,5.224,2.967,5.469,5.221,3 145 | 11.82,13.4,0.8274,5.314,2.777,4.471,5.178,3 146 | 11.21,13.13,0.8167,5.279,2.687,6.169,5.275,3 147 | 11.43,13.13,0.8335,5.176,2.719,2.221,5.132,3 148 | 12.49,13.46,0.8658,5.267,2.967,4.421,5.002,3 149 | 12.7,13.71,0.8491,5.386,2.911,3.26,5.316,3 150 | 10.79,12.93,0.8107,5.317,2.648,5.462,5.194,3 151 | 11.83,13.23,0.8496,5.263,2.84,5.195,5.307,3 152 | 12.01,13.52,0.8249,5.405,2.776,6.992,5.27,3 153 | 12.26,13.6,0.8333,5.408,2.833,4.756,5.36,3 154 | 11.18,13.04,0.8266,5.22,2.693,3.332,5.001,3 155 | 11.36,13.05,0.8382,5.175,2.755,4.048,5.263,3 156 | 11.19,13.05,0.8253,5.25,2.675,5.813,5.219,3 157 | 11.34,12.87,0.8596,5.053,2.849,3.347,5.003,3 158 | 12.13,13.73,0.8081,5.394,2.745,4.825,5.22,3 159 | 11.75,13.52,0.8082,5.444,2.678,4.378,5.31,3 160 | 11.49,13.22,0.8263,5.304,2.695,5.388,5.31,3 161 | 12.54,13.67,0.8425,5.451,2.879,3.082,5.491,3 162 | 12.02,13.33,0.8503,5.35,2.81,4.271,5.308,3 163 | 12.05,13.41,0.8416,5.267,2.847,4.988,5.046,3 164 | 12.55,13.57,0.8558,5.333,2.968,4.419,5.176,3 165 | 11.14,12.79,0.8558,5.011,2.794,6.388,5.049,3 166 | 12.1,13.15,0.8793,5.105,2.941,2.201,5.056,3 167 | 12.44,13.59,0.8462,5.319,2.897,4.924,5.27,3 168 | 12.15,13.45,0.8443,5.417,2.837,3.638,5.338,3 169 | 11.35,13.12,0.8291,5.176,2.668,4.337,5.132,3 170 | 11.24,13,0.8359,5.09,2.715,3.521,5.088,3 171 | 11.02,13,0.8189,5.325,2.701,6.735,5.163,3 172 | 11.55,13.1,0.8455,5.167,2.845,6.715,4.956,3 173 | 11.27,12.97,0.8419,5.088,2.763,4.309,5,3 174 | 11.4,13.08,0.8375,5.136,2.763,5.588,5.089,3 175 | 10.83,12.96,0.8099,5.278,2.641,5.182,5.185,3 176 | 10.8,12.57,0.859,4.981,2.821,4.773,5.063,3 177 | 11.26,13.01,0.8355,5.186,2.71,5.335,5.092,3 178 | 10.74,12.73,0.8329,5.145,2.642,4.702,4.963,3 179 | 11.48,13.05,0.8473,5.18,2.758,5.876,5.002,3 180 | 12.21,13.47,0.8453,5.357,2.893,1.661,5.178,3 181 | 
11.41,12.95,0.856,5.09,2.775,4.957,4.825,3 182 | 12.46,13.41,0.8706,5.236,3.017,4.987,5.147,3 183 | 12.19,13.36,0.8579,5.24,2.909,4.857,5.158,3 184 | 11.65,13.07,0.8575,5.108,2.85,5.209,5.135,3 185 | 12.89,13.77,0.8541,5.495,3.026,6.185,5.316,3 186 | 11.56,13.31,0.8198,5.363,2.683,4.062,5.182,3 187 | 11.81,13.45,0.8198,5.413,2.716,4.898,5.352,3 188 | 10.91,12.8,0.8372,5.088,2.675,4.179,4.956,3 189 | 11.23,12.82,0.8594,5.089,2.821,7.524,4.957,3 190 | 10.59,12.41,0.8648,4.899,2.787,4.975,4.794,3 191 | 10.93,12.8,0.839,5.046,2.717,5.398,5.045,3 192 | 11.27,12.86,0.8563,5.091,2.804,3.985,5.001,3 193 | 11.87,13.02,0.8795,5.132,2.953,3.597,5.132,3 194 | 10.82,12.83,0.8256,5.18,2.63,4.853,5.089,3 195 | 12.11,13.27,0.8639,5.236,2.975,4.132,5.012,3 196 | 12.8,13.47,0.886,5.16,3.126,4.873,4.914,3 197 | 12.79,13.53,0.8786,5.224,3.054,5.483,4.958,3 198 | 13.37,13.78,0.8849,5.32,3.128,4.67,5.091,3 199 | 12.62,13.67,0.8481,5.41,2.911,3.306,5.231,3 200 | 12.76,13.38,0.8964,5.073,3.155,2.828,4.83,3 201 | 12.38,13.44,0.8609,5.219,2.989,5.472,5.045,3 202 | 12.67,13.32,0.8977,4.984,3.135,2.3,4.745,3 203 | 11.18,12.72,0.868,5.009,2.81,4.051,4.828,3 204 | 12.7,13.41,0.8874,5.183,3.091,8.456,5,3 205 | 12.37,13.47,0.8567,5.204,2.96,3.919,5.001,3 206 | 12.19,13.2,0.8783,5.137,2.981,3.631,4.87,3 207 | 11.23,12.88,0.8511,5.14,2.795,4.325,5.003,3 208 | 13.2,13.66,0.8883,5.236,3.232,8.315,5.056,3 209 | 11.84,13.21,0.8521,5.175,2.836,3.598,5.044,3 210 | 12.3,13.34,0.8684,5.243,2.974,5.637,5.063,3 211 | -------------------------------------------------------------------------------- /certifications/Datacamp_course1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saranshbansal/data-science-with-python/651e67d93f8aa65ce386978684523f7e14cd3b2a/certifications/Datacamp_course1.pdf -------------------------------------------------------------------------------- /certifications/Datacamp_course17.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saranshbansal/data-science-with-python/651e67d93f8aa65ce386978684523f7e14cd3b2a/certifications/Datacamp_course17.pdf -------------------------------------------------------------------------------- /certifications/Datacamp_course18.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saranshbansal/data-science-with-python/651e67d93f8aa65ce386978684523f7e14cd3b2a/certifications/Datacamp_course18.pdf -------------------------------------------------------------------------------- /certifications/Datacamp_course2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saranshbansal/data-science-with-python/651e67d93f8aa65ce386978684523f7e14cd3b2a/certifications/Datacamp_course2.pdf -------------------------------------------------------------------------------- /certifications/Datacamp_course3.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saranshbansal/data-science-with-python/651e67d93f8aa65ce386978684523f7e14cd3b2a/certifications/Datacamp_course3.pdf -------------------------------------------------------------------------------- /certifications/Datacamp_course4.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/saranshbansal/data-science-with-python/651e67d93f8aa65ce386978684523f7e14cd3b2a/certifications/Datacamp_course4.pdf -------------------------------------------------------------------------------- /certifications/Datacamp_course5.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saranshbansal/data-science-with-python/651e67d93f8aa65ce386978684523f7e14cd3b2a/certifications/Datacamp_course5.pdf -------------------------------------------------------------------------------- /notes/Datacamp/Data Science With Python Course 5.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saranshbansal/data-science-with-python/651e67d93f8aa65ce386978684523f7e14cd3b2a/notes/Datacamp/Data Science With Python Course 5.docx -------------------------------------------------------------------------------- /notes/Datacamp/Iterators and Generators-Python.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saranshbansal/data-science-with-python/651e67d93f8aa65ce386978684523f7e14cd3b2a/notes/Datacamp/Iterators and Generators-Python.docx -------------------------------------------------------------------------------- /notes/Datacamp/Regression Models.txt: -------------------------------------------------------------------------------- 1 | Regression: Applied over continuous stream of value to predict next value. 2 | Classification: Applied over discrete set of values to identify which 'class' or category the target value is going to fall under. 3 | 4 | -------------------------------------------------------------------------------- /notes/Google ML/Google's ML Crash Course Notes.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saranshbansal/data-science-with-python/651e67d93f8aa65ce386978684523f7e14cd3b2a/notes/Google ML/Google's ML Crash Course Notes.docx -------------------------------------------------------------------------------- /notes/Google ML/Tensorflow Estimators - documentation.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saranshbansal/data-science-with-python/651e67d93f8aa65ce386978684523f7e14cd3b2a/notes/Google ML/Tensorflow Estimators - documentation.docx -------------------------------------------------------------------------------- /notes/R/Exploratory Data Analysis Dimension Reduction.txt: -------------------------------------------------------------------------------- 1 | In other words, we'd like to find the best matrix created with fewer variables (that is, a lower rank 2 | | matrix) that explains the original data. This is related to data compression. 3 | 4 | Two related solutions to these problems are PCA which stands for Principal Component Analysis and SVD, 5 | | Singular Value Decomposition. This latter simply means that we express a matrix X of observations 6 | | (rows) and variables (columns) as the product of 3 other matrices, i.e., X=UDV^t. This last term (V^t) 7 | | represents the transpose of the matrix V. 8 | 9 | 1. principal component analysis (PCA) 10 | 11 | 2. 
singular value decomposition (SVD) -------------------------------------------------------------------------------- /notes/R/Probability.txt: -------------------------------------------------------------------------------- 1 | | If you had a ruler of infinite precision, would measuring the height of adults around the world be 2 | | continuous or discrete? 3 | 4 | 1: continuous 5 | 2: discrete 6 | 7 | Selection: 2 8 | 9 | | Not quite! Try again. 10 | 11 | | The ruler of infinite precision is the hint. Can you list all possible heights? 12 | 13 | 1: discrete 14 | 2: continuous 15 | 16 | Selection: 2 17 | 18 | | You are quite good my friend! 19 | 20 | |=========== | 11% 21 | 22 | | Is the drawing of a hand of cards continuous or discrete? 23 | 24 | 1: continuous 25 | 2: discrete 26 | 27 | Selection: 2 28 | 29 | | All that hard work is paying off! 30 | 31 | |============== | 14% 32 | 33 | | Continuous random variables are usually associated with measurements of time, distance, or some 34 | | biological process since they can take on any value, often within some specified range. Limitations of 35 | | precision in taking the measurements may imply that the values are discrete; we in fact consider them 36 | | continuous. 37 | 38 | A probability mass function (PMF) gives the probability that a discrete random variable is exactly 39 | | equal to some value. 40 | 41 | | For instance, suppose we have a coin which may or may not be fair. Let x=0 represent a 'heads' outcome 42 | | and x=1 represent a 'tails' outcome of a coin toss. If p is the probability of 'heads' which of the 43 | | following represents the PMF of the coin toss? The variable x is either 0 (heads) or 1 (tails). 44 | 45 | 1: (p^x)*(1-p)^(1-x) 46 | 2: (p^(1-x))*(1-p)^x 47 | 48 | A probability density function [PDF] is associated with a continuous random variable. To quote from 49 | | Wikipedia, it "is a function that describes the relative likelihood for this random variable to take 50 | | on a given value. The probability of the random variable falling within a particular range of values 51 | | is given by ... the area under the density function but above the horizontal axis and between the 52 | | lowest and greatest values of the range. 53 | 54 | | We'll repeat two requirements of a probability density function [PDF]. It must be non-negative everywhere, 55 | | and the area under it must equal one." 56 | 57 | The cumulative distribution function (CDF) of a random variable X, either discrete or continuous, is 58 | | the function F(x) equal to the probability that X is less than or equal to x. In the example above, 59 | | the area of the blue triangle represents the probability that the random variable was less than or 60 | | equal to the value 1.6. 61 | 62 | | When the random variable is continuous, as in the example, the PDF is the derivative of the CDF. So integrating the PDF (the line represented by the diagonal) 63 | | yields the CDF. When you evaluate the CDF at the limits of integration the result is an area. 64 | 65 | Now use the R function integrate to integrate mypdf with the parameters lower equal to 0 and upper equal to 1.6. See if you get the same area (probability) you 66 | | got before. 67 | 68 | > integrate(mypdf, 0, 1.6) 69 | 0.64 with absolute error < 7.1e-15 70 | 71 | | You are really on a roll! 
72 | 73 | |========================================================================================================== | 69% 74 | 75 | | The survivor function S(x) of a random variable X is defined as the function of x equal to the probability that the random variable X is greater than the value x. 76 | | This is the complement of the CDF F(x), in our example, the portion of the lower triangle that is not shaded. 77 | 78 | 79 | | We'll close by repeating some important points. 80 | 81 | ... 82 | 83 | |========================================================================================================================================= | 89% 84 | 85 | | A probability model connects data to a population using assumptions. 86 | 87 | ... 88 | 89 | |============================================================================================================================================== | 91% 90 | 91 | | Be careful to distinguish between population medians and sample medians. 92 | 93 | ... 94 | 95 | |================================================================================================================================================== | 94% 96 | 97 | | A sample median is an estimator of a population median (the estimand). 98 | 99 | 100 | | We represent the conditional probability of an event A given that B has occurred with the notation 101 | | P(A|B). More specifically, we define the conditional probability of event A, given that B has occurred 102 | | with the following. 103 | 104 | ... 105 | 106 | |============== | 14% 107 | 108 | | P(A|B) = P(A & B)/ P(B) . P(A|B) is the probability that BOTH A and B occur divided by the probability 109 | | that B occurs. 110 | 111 | | From the definition of P(A|B), we can write P(A&B) = P(A|B) * P(B), right? Let's use this to express 112 | | P(B|A). 113 | | P(B|A) = P(B&A)/P(A) = P(A|B) * P(B)/P(A). This is a simple form of Bayes' Rule which relates the two 114 | | conditional probabilities. 115 | Suppose we don't know P(A) itself, but only know its conditional probabilities, that is, the 116 | | probability that it occurs if B occurs and the probability that it occurs if B doesn't occur. These 117 | | are P(A|B) and P(A|~B), respectively. We use ~B to represent 'not B' or 'B complement'. 118 | 119 | ... 120 | 121 | |=========================== | 29% 122 | 123 | | We can then express P(A) = P(A|B) * P(B) + P(A|~B) * P(~B) and substitute this is into the denominator 124 | | of Bayes' Formula. 
125 | 126 | | P(B|A) = P(A|B) * P(B) / ( P(A|B) * P(B) + P(A|~B) * P(~B) ) -------------------------------------------------------------------------------- /notes/R/Regression Models.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saranshbansal/data-science-with-python/651e67d93f8aa65ce386978684523f7e14cd3b2a/notes/R/Regression Models.txt -------------------------------------------------------------------------------- /notes/R/Rplot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saranshbansal/data-science-with-python/651e67d93f8aa65ce386978684523f7e14cd3b2a/notes/R/Rplot.png -------------------------------------------------------------------------------- /notes/R/Rplot01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saranshbansal/data-science-with-python/651e67d93f8aa65ce386978684523f7e14cd3b2a/notes/R/Rplot01.png -------------------------------------------------------------------------------- /notes/R/Rplot02.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saranshbansal/data-science-with-python/651e67d93f8aa65ce386978684523f7e14cd3b2a/notes/R/Rplot02.png -------------------------------------------------------------------------------- /notes/R/Rplot03.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saranshbansal/data-science-with-python/651e67d93f8aa65ce386978684523f7e14cd3b2a/notes/R/Rplot03.png -------------------------------------------------------------------------------- /notes/R/Rplot04.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saranshbansal/data-science-with-python/651e67d93f8aa65ce386978684523f7e14cd3b2a/notes/R/Rplot04.png -------------------------------------------------------------------------------- /notes/R/Rplot05.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saranshbansal/data-science-with-python/651e67d93f8aa65ce386978684523f7e14cd3b2a/notes/R/Rplot05.png -------------------------------------------------------------------------------- /notes/R/Statistial Interference.txt: -------------------------------------------------------------------------------- 1 | We want to emphasize a couple of important points here. First, a statistic (singular) is a number 2 | | computed from a sample of data. We use statistics to infer information about a population. Second, a 3 | | random variable is an outcome from an experiment. Deterministic processes, such as computing means or 4 | | variances, applied to random variables, produce additional random variables which have their own 5 | | distributions. It's important to keep straight which distributions you're talking about. 6 | 7 | Finally, there are two broad flavors of inference. The first is frequency, which uses "long run 8 | | proportion of times an event occurs in independent, identically distributed repetitions." The second 9 | | is Bayesian in which the probability estimate for a hypothesis is updated as additional evidence is 10 | | acquired. Both flavors require an understanding of probability so that's what the next lessons will 11 | | cover. 
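A quick Python sketch of the two results quoted in the probability notes above (Bayes' rule, and obtaining a probability by integrating a density). The screening-test numbers below are purely hypothetical, and mypdf is assumed here to be the triangular density f(x) = x/2 on [0, 2], an assumption that is at least consistent with the 0.64 the notes report for integrate(mypdf, 0, 1.6), since 1.6^2 / 4 = 0.64.

from scipy.integrate import quad

def bayes_posterior(p_a_given_b, p_b, p_a_given_not_b):
    # P(B|A) = P(A|B)P(B) / (P(A|B)P(B) + P(A|~B)P(~B))
    numerator = p_a_given_b * p_b
    return numerator / (numerator + p_a_given_not_b * (1 - p_b))

# Hypothetical screening test: sensitivity 0.99, prevalence 0.001, false-positive rate 0.05
print(bayes_posterior(0.99, 0.001, 0.05))  # about 0.019: one positive result is still weak evidence

# Assumed density f(x) = x/2 on [0, 2]; the area from 0 to 1.6 reproduces the 0.64 shown in the notes
area, _ = quad(lambda x: x / 2, 0, 1.6)
print(round(area, 2))  # 0.64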
-------------------------------------------------------------------------------- /notes/R/dendo1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saranshbansal/data-science-with-python/651e67d93f8aa65ce386978684523f7e14cd3b2a/notes/R/dendo1.png -------------------------------------------------------------------------------- /notes/R/prob-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saranshbansal/data-science-with-python/651e67d93f8aa65ce386978684523f7e14cd3b2a/notes/R/prob-1.png -------------------------------------------------------------------------------- /notes/R/ways to cluster.txt: -------------------------------------------------------------------------------- 1 | What is Clustering? 2 | Clustering organizes data points that are close together into groups in order to find a relation or pattern. 3 | 4 | Types of Distances: 5 | 1. Euclidean - straight-line distance, computed with the Pythagorean theorem. 6 | 2. Continuous - correlation similarity 7 | 3. Manhattan - the sum of the absolute differences along each coordinate, like a taxi driving through the blocks of Manhattan from point A to B. 8 | 9 | Ways of examining and organizing multi-dimensional data: 10 | 1. Hierarchical Clustering 11 | 2. K-Means Clustering - R documentation tells us that the k-means method "aims to partition the points into k groups such that 12 | the sum of squares from points to the assigned cluster centres is minimized." 13 | 14 | Hierarchical Clustering techniques: 15 | 1. Complete linkage 16 | 2. Average linkage 17 | 3. Heat maps 18 | 19 | K-Means clustering techniques: 20 | 21 | -------------------------------------------------------------------------------- /src/__pycache__/helper.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saranshbansal/data-science-with-python/651e67d93f8aa65ce386978684523f7e14cd3b2a/src/__pycache__/helper.cpython-36.pyc -------------------------------------------------------------------------------- /src/case_studies/case_study_1.1.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | # If you need to use real values 4 | df = pd.DataFrame(pd.read_csv('../_datasets/WDIData_min.csv')) 5 | cols = df.iloc[0, 0:5] 6 | rows = df.iloc[1:10, 0:5].values 7 | 8 | # Dummy subset 9 | feature_names = ['CountryName', 10 | 'CountryCode', 11 | 'IndicatorName', 12 | 'IndicatorCode', 13 | 'Year', 14 | 'Value'] 15 | 16 | row_val = ['Arab World', 17 | 'ARB', 18 | 'Adolescent fertility rate (births per 1,000 women ages 15-19)', 19 | 'SP.ADO.TFRT', 20 | '1960', 21 | '133.56090740552298'] 22 | 23 | row_vals = [['Arab World', 24 | 'ARB', 25 | 'Adolescent fertility rate (births per 1,000 women ages 15-19)', 26 | 'SP.ADO.TFRT', 27 | '1960', 28 | '133.56090740552298'], 29 | ['Arab World', 30 | 'ARB', 31 | 'Age dependency ratio (% of working-age population)', 32 | 'SP.POP.DPND', 33 | '1960', 34 | '87.7976011532547'], 35 | ['Arab World', 36 | 'ARB', 37 | 'Age dependency ratio, old (% of working-age population)', 38 | 'SP.POP.DPND.OL', 39 | '1960', 40 | '6.634579191565161'], 41 | ['Arab World', 42 | 'ARB', 43 | 'Age dependency ratio, young (% of working-age population)', 44 | 'SP.POP.DPND.YG', 45 | '1960', 46 | '81.02332950839141'], 47 | ['Arab World', 48 | 'ARB', 49 | 'Arms exports (SIPRI trend indicator values)', 50 | 'MS.MIL.XPRT.KD', 51 | '1960', 52 | '3000000.0'], 53 | ['Arab World', 54 | 'ARB', 55 | 
'Arms imports (SIPRI trend indicator values)', 56 | 'MS.MIL.MPRT.KD', 57 | '1960', 58 | '538000000.0'], 59 | ['Arab World', 60 | 'ARB', 61 | 'Birth rate, crude (per 1,000 people)', 62 | 'SP.DYN.CBRT.IN', 63 | '1960', 64 | '47.697888095096395'], 65 | ['Arab World', 66 | 'ARB', 67 | 'CO2 emissions (kt)', 68 | 'EN.ATM.CO2E.KT', 69 | '1960', 70 | '59563.9892169935'], 71 | ['Arab World', 72 | 'ARB', 73 | 'CO2 emissions (metric tons per capita)', 74 | 'EN.ATM.CO2E.PC', 75 | '1960', 76 | '0.6439635478877049'], 77 | ['Arab World', 78 | 'ARB', 79 | 'CO2 emissions from gaseous fuel consumption (% of total)', 80 | 'EN.ATM.CO2E.GF.ZS', 81 | '1960', 82 | '5.041291753975099'], 83 | ['Arab World', 84 | 'ARB', 85 | 'CO2 emissions from liquid fuel consumption (% of total)', 86 | 'EN.ATM.CO2E.LF.ZS', 87 | '1960', 88 | '84.8514729446567'], 89 | ['Arab World', 90 | 'ARB', 91 | 'CO2 emissions from liquid fuel consumption (kt)', 92 | 'EN.ATM.CO2E.LF.KT', 93 | '1960', 94 | '49541.707291032304'], 95 | ['Arab World', 96 | 'ARB', 97 | 'CO2 emissions from solid fuel consumption (% of total)', 98 | 'EN.ATM.CO2E.SF.ZS', 99 | '1960', 100 | '4.72698138789597'], 101 | ['Arab World', 102 | 'ARB', 103 | 'Death rate, crude (per 1,000 people)', 104 | 'SP.DYN.CDRT.IN', 105 | '1960', 106 | '19.7544519237187'], 107 | ['Arab World', 108 | 'ARB', 109 | 'Fertility rate, total (births per woman)', 110 | 'SP.DYN.TFRT.IN', 111 | '1960', 112 | '6.92402738655897'], 113 | ['Arab World', 114 | 'ARB', 115 | 'Fixed telephone subscriptions', 116 | 'IT.MLT.MAIN', 117 | '1960', 118 | '406833.0'], 119 | ['Arab World', 120 | 'ARB', 121 | 'Fixed telephone subscriptions (per 100 people)', 122 | 'IT.MLT.MAIN.P2', 123 | '1960', 124 | '0.6167005703199'], 125 | ['Arab World', 126 | 'ARB', 127 | 'Hospital beds (per 1,000 people)', 128 | 'SH.MED.BEDS.ZS', 129 | '1960', 130 | '1.9296220724398703'], 131 | ['Arab World', 132 | 'ARB', 133 | 'International migrant stock (% of population)', 134 | 'SM.POP.TOTL.ZS', 135 | '1960', 136 | '2.9906371279862403'], 137 | ['Arab World', 138 | 'ARB', 139 | 'International migrant stock, total', 140 | 'SM.POP.TOTL', 141 | '1960', 142 | '3324685.0']] 143 | 144 | # Zip lists: zipped_lists 145 | zipped_lists = zip(feature_names, row_val) 146 | 147 | # Create a dictionary: rs_dict 148 | rs_dict = dict(zipped_lists) 149 | 150 | # Print the dictionary 151 | print('--------- 1 --------') 152 | print(rs_dict) 153 | 154 | 155 | # Suppose you needed to repeat the same process done in the previous exercise to many, 156 | # many rows of data. Rewriting your code again and again could become very tedious, repetitive, 157 | # and unmaintainable. 158 | # 159 | # In this exercise, you will create a function to house the code you wrote earlier to make things 160 | # easier and much more concise. Why? This way, you only need to call the function and supply the appropriate 161 | # lists to create your dictionaries! 
Again, the lists feature_names and row_vals are preloaded and 162 | # these contain the header names of the dataset and rows of actual values from the dataset, respectively. 163 | 164 | # Define lists2dict() 165 | def lists2dict(list1, list2): 166 | """Return a dictionary where list1 provides 167 | the keys and list2 provides the values.""" 168 | 169 | # Zip lists: zipped_lists 170 | zipped_lists = zip(list1, list2) 171 | 172 | # Create a dictionary: rs_dict 173 | rs_dict = dict(zipped_lists) 174 | 175 | # Return the dictionary 176 | return rs_dict 177 | 178 | 179 | # Call lists2dict on a single row: rs_fxn 180 | rs_fxn = lists2dict(feature_names, row_val) 181 | 182 | # Print rs_fxn 183 | print('--------- 2 --------') 184 | print(rs_fxn) 185 | 186 | # Using a list comprehension 187 | # 188 | # This time, you're going to use the lists2dict() function you defined in the last exercise to turn a bunch of 189 | # lists into a list of dictionaries with the help of a list comprehension. 190 | # 191 | # The lists2dict() function has already been preloaded, together with a couple of lists, feature_names and row_vals. 192 | # feature_names contains the header names of the World Bank dataset and row_vals is a list of lists, where each 193 | # sublist is a list of actual values of a row from the dataset. 194 | # 195 | # Your goal is to use a list comprehension to generate a list of dicts, where the keys are the header names and the 196 | # values are the row entries. 197 | 198 | # Print the first two lists in row_vals 199 | print('--------- 3 --------') 200 | print(row_vals[0]) 201 | print(row_vals[1]) 202 | 203 | # Turn list of lists into list of dicts: list_of_dicts 204 | list_of_dicts = [lists2dict(feature_names, sublist) for sublist in row_vals] 205 | 206 | # Print the first dictionary in list_of_dicts 207 | print('--------- 4 --------') 208 | print(list_of_dicts[0]) 209 | 210 | # Turning this all into a DataFrame 211 | # 212 | # You've zipped lists together, created a function to house your code, and even used the function in a list 213 | # comprehension to generate a list of dictionaries. That was a lot of work and you did a great job! 214 | # 215 | # You will now use all of these to convert the list of dictionaries into a pandas DataFrame. You will see how 216 | # convenient it is to generate a DataFrame from dictionaries with the DataFrame() function from the pandas package. 217 | # 218 | # The lists2dict() function, feature_names list, and row_vals list have been preloaded for this exercise. 219 | # 220 | # Go for it! 221 | 222 | # Turn list of dicts into a DataFrame: df 223 | df = pd.DataFrame(list_of_dicts) 224 | 225 | # Print the head of the DataFrame 226 | print('--------- 5 --------') 227 | print(df.head()) 228 | -------------------------------------------------------------------------------- /src/case_studies/case_study_1.2.py: -------------------------------------------------------------------------------- 1 | # Processing data in chunks (1) 2 | # 3 | # Sometimes, data sources can be so large in size that storing the entire dataset in memory becomes too 4 | # resource-intensive. In this exercise, you will process the first 1000 rows of a file line by line, to create a 5 | # dictionary of the counts of how many times each country appears in a column in the dataset. 6 | # 7 | # The csv file 'world_dev_ind.csv' is in your current directory for your use. To begin, you need to open a 8 | # connection to this file using what is known as a context manager.
For example, the command with open('datacamp.csv') 9 | # as datacamp binds the csv file 'datacamp.csv' as datacamp in the context manager. Here, the with statement is 10 | # the context manager, and its purpose is to ensure that resources are efficiently allocated when opening a 11 | # connection to a file. 12 | # 13 | # If you'd like to learn more about context managers, refer to the DataCamp course on Importing Data in Python 14 | # (https://www.datacamp.com/courses/importing-data-in-python-part-1). 15 | 16 | # Open a connection to the file 17 | with open('../_datasets/WDIData_min.csv') as file: 18 | # Skip the column names 19 | file.readline() 20 | 21 | # Initialize an empty dictionary: counts_dict 22 | counts_dict = {} 23 | 24 | # Process only the first 1000 rows 25 | for j in range(0, 1000): 26 | 27 | # Split the current line into a list: line 28 | line = file.readline().split(',') 29 | 30 | # Get the value for the first column: first_col 31 | first_col = line[0] 32 | 33 | # If the column value is in the dict, increment its value 34 | if first_col in counts_dict.keys(): 35 | counts_dict[first_col] += 1 36 | 37 | # Else, add to the dict and set value to 1 38 | else: 39 | counts_dict[first_col] = 1 40 | 41 | # Print the resulting dictionary 42 | print(counts_dict) 43 | 44 | 45 | # 46 | # Writing a generator to load data in chunks (2) 47 | # 48 | # In the previous exercise, you processed a file line by line for a given number of lines. What if, however, you want 49 | # to do this for the entire file? 50 | # 51 | # In this case, it would be useful to use generators. Generators allow users to lazily evaluate data. This concept of 52 | # lazy evaluation is useful when you have to deal with very large datasets because it lets you generate values in an 53 | # efficient manner by yielding only chunks of data at a time instead of the whole thing at once. 54 | # 55 | # In this exercise, you will define a generator function read_large_file() that produces a generator object which 56 | # yields a single line from a file each time next() is called on it. The csv file 'world_dev_ind.csv' is in your 57 | # current directory for your use. 58 | # 59 | # Note that when you open a connection to a file, the resulting file object is already a generator! So out in the 60 | # wild, you won't have to explicitly create generator objects in cases such as this. However, for pedagogical reasons, 61 | # we are having you practice how to do this here with the read_large_file() function. Go for it! 62 | # Define read_large_file() 63 | 64 | def read_large_file(file_object): 65 | """A generator function to read a large file lazily.""" 66 | 67 | # Cap the read at 100 lines (instead of looping until the end of the file) so this demo does not grind through the whole, very large file 68 | for i in range(0, 100): 69 | 70 | # Read a line from the file: data 71 | data = file_object.readline() 72 | 73 | # Break if this is the end of the file 74 | if not data: 75 | break 76 | 77 | # Yield the line of data 78 | yield data 79 | 80 | # Create a generator object for the file: gen_file (reopen the file, since the earlier with block has already closed it) 81 | with open('../_datasets/WDIData_min.csv') as file: 82 | gen_file = read_large_file(file) 83 | # Print the first three lines of the file 84 | print(next(gen_file)) 85 | print(next(gen_file)) 86 | print(next(gen_file)) 87 | 88 | 89 | # Writing a generator to load data in chunks (3) 90 | # 91 | # Great! You've just created a generator function that you can use to help you process large files. 92 | # 93 | # Now let's use your generator function to process the World Bank dataset like you did previously.
94 | # You will process the file line by line, to create a dictionary of the counts of how many times each country 95 | # appears in a column in the dataset. For this exercise, however, you won't process just 1000 rows of data, you'll 96 | # process the entire dataset! 97 | # 98 | # The generator function read_large_file() and the csv file 'world_dev_ind.csv' are preloaded and ready for your use. 99 | # Go for it! 100 | # Initialize an empty dictionary: counts_dict 101 | counts_dict = {} 102 | with open('../_datasets/WDIData_min.csv') as file: 103 | # Iterate over the generator from read_large_file() 104 | for line in read_large_file(file): 105 | 106 | row = line.split(',') 107 | first_col = row[0] 108 | 109 | if first_col in counts_dict.keys(): 110 | counts_dict[first_col] += 1 111 | else: 112 | counts_dict[first_col] = 1 113 | 114 | # Print 115 | print(counts_dict) 116 | -------------------------------------------------------------------------------- /src/case_studies/case_study_1.3.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import matplotlib.pyplot as plt 3 | 4 | # Writing an iterator to load data in chunks (1) 5 | # 6 | # Another way to read data too large to store in memory in chunks is to read the file in as DataFrames of a certain 7 | # length, say, 100. For example, with the pandas package (imported as pd), you can do 8 | # pd.read_csv(filename, chunksize=100). This creates an iterable reader object, which means that you can use next() 9 | # on it. 10 | # 11 | # In this exercise, you will read a file in small DataFrame chunks with read_csv(). You're going to use the World 12 | # Bank Indicators data 'ind_pop_data.csv', available in your current directory, to look at the urban population indicator 13 | # for numerous countries and years. 14 | # Import the pandas package 15 | 16 | # Initialize reader object: df_reader 17 | df_reader = pd.read_csv('../_datasets/WDIData_min.csv', chunksize=10) 18 | 19 | # Print two chunks 20 | print(next(df_reader)) 21 | print(next(df_reader)) 22 | 23 | # Writing an iterator to load data in chunks (2) 24 | # 25 | # In the previous exercise, you used read_csv() to read in DataFrame chunks from a large dataset. In this exercise, 26 | # you will read in a file using a bigger DataFrame chunk size and then process the data from the first chunk. 27 | # 28 | # To process the data, you will create another DataFrame composed of only the rows from a specific country. You will 29 | # then zip together two of the columns from the new DataFrame, 'Total Population' and 'Urban population (% of 30 | # total)'. Finally, you will create a list of tuples from the zip object, where each tuple is composed of a value 31 | # from each of the two columns mentioned. 32 | # 33 | # You're going to use the data from 'ind_pop_data.csv', available in your current directory. Pandas has been imported 34 | # as pd. 
35 | 36 | # Initialize reader object: urb_pop_reader 37 | urb_pop_reader = pd.read_csv('../_datasets/ind_pop_data.csv', chunksize=1000) 38 | 39 | # Get the first DataFrame chunk: df_urb_pop 40 | df_urb_pop = next(urb_pop_reader) 41 | 42 | # Check out the head of the DataFrame 43 | print(df_urb_pop.head()) 44 | 45 | # Check out specific country: df_pop_ceb 46 | df_pop_ceb = df_urb_pop[df_urb_pop['CountryCode'] == 'CEB'] 47 | 48 | # Zip DataFrame columns of interest: pops 49 | pops = zip(df_pop_ceb['Total Population'], df_pop_ceb['Urban population (% of total)']) 50 | 51 | # Turn zip object into list: pops_list 52 | pops_list = list(pops) 53 | 54 | # Print pops_list 55 | print(pops_list) 56 | 57 | # Writing an iterator to load data in chunks (3) 58 | # You're getting used to reading and processing data in chunks by now. 59 | # Let's push your skills a little further by adding a column to a DataFrame. 60 | # 61 | # In this exercise, you will be using a list comprehension to create the values for a new column 'Total Urban 62 | # Population' from the list of tuples that you generated earlier. Recall from the previous exercise that the first 63 | # and second elements of each tuple consist of, respectively, values from the columns 'Total Population' and 'Urban 64 | # population (% of total)'. The values in this new column 'Total Urban Population', therefore, are the product of the 65 | # first and second element in each tuple. Furthermore, because the 2nd element is a percentage, you need to divide 66 | # the entire result by 100, or alternatively, multiply it by 0.01. 67 | # 68 | # You will also plot the data from this new column to create a visualization of the urban population data. 69 | # 70 | # You're going to use the data from 'ind_pop_data.csv', available in your current directory. The packages pandas and 71 | # matplotlib.pyplot have been imported as pd and plt respectively for your use. 72 | 73 | # Initialize reader object: urb_pop_reader(see above) 74 | 75 | # Get the first DataFrame chunk: df_urb_pop(see above) 76 | 77 | # Check out specific country: df_pop_ceb(see above) 78 | 79 | # Zip DataFrame columns of interest: pops(see above) 80 | 81 | # Turn zip object into list: pops_list(see above) 82 | 83 | # Use list comprehension to create new DataFrame column 'Total Urban Population' 84 | df_pop_ceb['Total Urban Population'] = [int(tup[0] * tup[1] * 0.01) for tup in pops_list] 85 | print(df_pop_ceb['Total Urban Population']) 86 | 87 | # Plot urban population data 88 | df_pop_ceb.plot(kind="scatter", x='Year', y='Total Urban Population') 89 | plt.show() 90 | 91 | # Writing an iterator to load data in chunks (4) 92 | # 93 | # In the previous exercises, you've only processed the data from the 94 | # first DataFrame chunk. This time, you will aggregate the results over all the DataFrame chunks in the dataset. This 95 | # basically means you will be processing the entire dataset now. This is neat because you're going to be able to 96 | # process the entire large dataset by just working on smaller pieces of it! 97 | # 98 | # You're going to use the data from 'ind_pop_data.csv', available in your current directory. The packages pandas and 99 | # matplotlib.pyplot have been imported as pd and plt respectively for your use. 
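# Illustrative sketch (a hypothetical helper, not part of the exercise): the per-chunk
# steps that the "(see above)" placeholders below stand for, assuming the same
# 'CountryCode', 'Total Population' and 'Urban population (% of total)' columns.
# Defined here for reference only; it is not called anywhere in this script.
def process_chunk_sketch(df_chunk, country_code='CEB'):
    df_country = df_chunk[df_chunk['CountryCode'] == country_code]
    pops_list = list(zip(df_country['Total Population'],
                         df_country['Urban population (% of total)']))
    df_country['Total Urban Population'] = [int(tup[0] * tup[1] * 0.01) for tup in pops_list]
    return df_country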
100 | 101 | # Initialize reader object: urb_pop_reader(see above) 102 | 103 | # Initialize empty DataFrame: data 104 | data = pd.DataFrame() 105 | 106 | # Iterate over each DataFrame chunk 107 | for df_urb_pop in urb_pop_reader: 108 | # Check out specific country: df_pop_ceb(see above) 109 | 110 | # Zip DataFrame columns of interest: pops(see above) 111 | 112 | # Turn zip object into list: pops_list(see above) 113 | 114 | # Use list comprehension to create new DataFrame column 'Total Urban Population1'(similar to above) 115 | df_pop_ceb['Total Urban Population1'] = [int(tup[0] * tup[1]) for tup in pops_list] 116 | 117 | # Append DataFrame chunk to data: data 118 | data = data.append(df_pop_ceb) 119 | 120 | # Plot urban population data 121 | data.plot(kind='scatter', x='Year', y='Total Urban Population1') 122 | plt.show() -------------------------------------------------------------------------------- /src/case_studies/case_study_pipelining_and_scaling.py: -------------------------------------------------------------------------------- 1 | # Bringing it all together II: 2 | # 3 | # Pipeline for regression For this final exercise, you will return to the Gapminder 4 | # dataset. Guess what? Even this dataset has missing values that we dealt with for you in earlier chapters! Now, 5 | # you have all the tools to take care of them yourself! 6 | # 7 | # Your job is to build a pipeline that imputes the missing data, scales the features, and fits an ElasticNet to the 8 | # Gapminder data. You will then tune the l1_ratio of your ElasticNet using GridSearchCV. 9 | # 10 | # All the necessary modules have been imported, and the feature and target variable arrays have been pre-loaded as X 11 | # and y. 12 | 13 | import numpy as np 14 | import pandas as pd 15 | from sklearn.model_selection import GridSearchCV 16 | from sklearn.model_selection import train_test_split 17 | from sklearn.pipeline import Pipeline 18 | from sklearn.preprocessing import Imputer 19 | from sklearn.preprocessing import StandardScaler 20 | from sklearn.linear_model import ElasticNet 21 | 22 | from helper import path 23 | 24 | # Read 'gm_2008_region.csv' into a DataFrame: df 25 | df = pd.read_csv(path + 'gm_2008_region.csv') 26 | 27 | X = df.drop('life', axis=1) 28 | y = df['life'] 29 | 30 | # Setup the pipeline steps: steps 31 | steps = [('imputation', Imputer(missing_values='NaN', strategy='mean', axis=0)), 32 | ('scaler', StandardScaler()), 33 | ('elasticnet', ElasticNet())] 34 | 35 | # Create the pipeline: pipeline 36 | pipeline = Pipeline(steps) 37 | 38 | # Specify the hyperparameter space 39 | parameters = {'elasticnet__l1_ratio': np.linspace(0, 1, 30)} 40 | 41 | # Create train and test sets 42 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42) 43 | 44 | # Create the GridSearchCV object: gm_cv 45 | gm_cv = GridSearchCV(pipeline, param_grid=parameters) 46 | 47 | # Fit to the training set 48 | gm_cv.fit(X_train, y_train) 49 | 50 | # Compute and print the metrics 51 | r2 = gm_cv.score(X_test, y_test) 52 | print("Tuned ElasticNet Alpha: {}".format(gm_cv.best_params_)) 53 | print("Tuned ElasticNet R squared: {}".format(r2)) 54 | -------------------------------------------------------------------------------- /src/case_studies/case_study_trumps_twitter_RTs.py: -------------------------------------------------------------------------------- 1 | # Import pandas as pd 2 | import pandas as pd 3 | 4 | # Import twitter data 5 | tweets_df = pd.DataFrame(pd.read_excel("../_datasets/Trump 
Tweets(2017).xlsx")) 6 | 7 | 8 | # Define count_entries() 9 | def count_entries(df, *args): 10 | """Return a dictionary with counts of 11 | occurrences as value for each key.""" 12 | 13 | # Initialize an empty dictionary: cols_count 14 | cols_count = {} 15 | 16 | # Iterate over column names in args 17 | for col_name in args: 18 | 19 | # Extract column from DataFrame: col 20 | col = df[col_name] 21 | 22 | # Iterate over the column in DataFrame 23 | for entry in col: 24 | 25 | # If entry is in cols_count, add 1 26 | if entry in cols_count.keys(): 27 | cols_count[entry] += 1 28 | 29 | # Else add the entry to cols_count, set the value to 1 30 | else: 31 | cols_count[entry] = 1 32 | 33 | # Return the cols_count dictionary 34 | return cols_count 35 | 36 | 37 | # Call count_entries(): result2 38 | result = count_entries(tweets_df, 'Tweet') 39 | 40 | # Filter our Retweets 41 | retweets = (lambda x: x[0:2] == 'RT', tweets_df['Tweet']) 42 | 43 | # Print result 44 | 45 | # print(list(result)) 46 | for tweet in retweets: 47 | print(tweet) 48 | -------------------------------------------------------------------------------- /src/case_studies/case_study_urban_population_trends.py: -------------------------------------------------------------------------------- 1 | # Case Study: Plot Urban population trends in various countries over the years based on publically available data set. 2 | # 3 | # In this case study, I have to define the function plot_pop() which takes two arguments: the filename of the file to 4 | # be processed, and the country code of the rows we want to process in the dataset. 5 | # 6 | # calling the function already does the following: 7 | # 8 | # Loading of the file chunk by chunk, 9 | # Creating the new column of urban population values, 10 | # and Plotting the urban population data. 11 | # 12 | # The function makes it convenient to repeat the same process for whatever file and country code we want to process 13 | # and visualize! 14 | # 15 | # We are using the data from 'ind_pop_data.csv', available in /_datasets/ directory. 16 | # The packages pandas and matplotlib.pyplot has been imported as pd and plt respectively. 17 | # 18 | # If you have enjoyed working with this data, you can continue exploring it using the pre-processed version available 19 | # on Kaggle. 
20 | 21 | import pandas as pd 22 | import matplotlib.pyplot as plt 23 | 24 | def_file_path = '../../_datasets/' 25 | 26 | # Define plot_pop() 27 | def plot_pop(filename, country_code): 28 | # Initialize reader object: urb_pop_reader 29 | urb_pop_reader = pd.read_csv(filename, chunksize=1000) 30 | 31 | # Initialize empty DataFrame: data 32 | data = pd.DataFrame() 33 | 34 | # Iterate over each DataFrame chunk 35 | for df_urb_pop in urb_pop_reader: 36 | # Check out specific country: df_pop_ceb 37 | df_pop_ceb = df_urb_pop[df_urb_pop['CountryCode'] == country_code] 38 | 39 | # Zip DataFrame columns of interest: pops 40 | pops = zip(df_pop_ceb['Total Population'], 41 | df_pop_ceb['Urban population (% of total)']) 42 | 43 | # Turn zip object into list: pops_list 44 | pops_list = list(pops) 45 | 46 | # Use list comprehension to create new DataFrame column 'Total Urban Population' 47 | df_pop_ceb['Total Urban Population'] = [int(tup[0] * tup[1]) for tup in pops_list] 48 | 49 | # Append DataFrame chunk to data: data 50 | data = data.append(df_pop_ceb) 51 | 52 | # Plot urban population data 53 | data.plot(kind='scatter', x='Year', y='Total Urban Population') 54 | plt.show() 55 | 56 | 57 | # Set the filename: fn 58 | fn = 'ind_pop_data.csv' 59 | 60 | # Call plot_pop for country code 'CEB' 61 | plot_pop(def_file_path + fn, 'CEB') 62 | 63 | # Call plot_pop for country code 'ARB' 64 | plot_pop(def_file_path + fn, 'ARB') 65 | -------------------------------------------------------------------------------- /src/case_studies/case_study_webscraping_imdb.py: -------------------------------------------------------------------------------- 1 | """ 2 | @author: Saransh Bansal 3 | Purpose: Scrape top 250 movies in IMDB and visualize the frequency of these top films released in specific years 4 | """ 5 | import os 6 | import re 7 | import sys 8 | 9 | import numpy as np 10 | import requests 11 | 12 | import matplotlib.pyplot as plt 13 | import pandas as pd 14 | from bs4 import BeautifulSoup 15 | 16 | os.getcwd() # current working directory 17 | 18 | # get the current encoding 19 | type = sys.getfilesystemencoding() 20 | 21 | # request the webpage 22 | req = requests.get("https://www.imdb.com/chart/top") 23 | page = req.text 24 | 25 | soup = BeautifulSoup(page, 'html.parser') 26 | print(soup.prettify()) 27 | 28 | # get top 250 movie names and years, may take ~30 seconds 29 | movie_names = [] 30 | movie_year = [0] * 250 31 | 32 | j = 0 33 | for i in range(250): 34 | title = str(soup.findAll('td', {'class': 'titleColumn'})[i]) 35 | movie_names.append(re.findall('>(.*?)', title)[0]) 36 | 37 | year = str(soup.findAll('span', {'class': 'secondaryInfo'})[i]) 38 | movie_year[i] = int(re.findall(r"\(([0-9_]+)\)", year)[0]) 39 | 40 | # keep track of the progress 41 | print('Extracted movie :: ' + movie_names[i] + ' (' + str(movie_year[i]) + ') ') 42 | j = j + 1 43 | 44 | print(movie_names) 45 | print(movie_year) 46 | 47 | 48 | def encode_title(item): 49 | return str(item.encode('utf-8')) 50 | 51 | 52 | # export to the text file 53 | open("top250names.txt", "w").write("\n".join(encode_title(item) for item in movie_names)) 54 | 55 | # compute the frequency table 56 | y = np.bincount(movie_year) 57 | ii = np.nonzero(y)[0] 58 | out = list(zip(ii, y[ii])) 59 | # create a dataframe 60 | df = pd.DataFrame(out, columns=['Year', 'Freq'], index=ii) 61 | # drop the first Year column since I already assign valid index 62 | df.drop(df.columns[0], axis=1) 63 | # plot 64 | plt.plot(ii, df['Freq']) 65 | plt.show() 66 | 
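# Quick illustration (added for clarity, not part of the scrape above): how the
# bincount/nonzero pair turns a list of years into (year, count) pairs. np.bincount
# indexes by value, so position 1994 holds the number of occurrences of 1994, etc.
_demo_years = [1994, 1994, 1999, 2008]
_demo_counts = np.bincount(_demo_years)
_demo_years_seen = np.nonzero(_demo_counts)[0]
print(list(zip(_demo_years_seen, _demo_counts[_demo_years_seen])))  # [(1994, 2), (1999, 1), (2008, 1)]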
-------------------------------------------------------------------------------- /src/case_studies/top250names.txt: -------------------------------------------------------------------------------- 1 | b'The Shawshank Redemption' 2 | b'The Godfather' 3 | b'The Godfather: Part II' 4 | b'The Dark Knight' 5 | b'12 Angry Men' -------------------------------------------------------------------------------- /src/core/py_comprehensions.py: -------------------------------------------------------------------------------- 1 | # Create a list of strings: fellowship 2 | fellowship = ['frodo', 'samwise', 'merry', 'aragorn', 'legolas', 'boromir', 'gimli'] 3 | 4 | # Create list comprehension: new_fellowship with condition in predicate expression 5 | new_fellowship = [member for member in fellowship if (len(member) >= 7)] 6 | 7 | # Create list comprehension: new_fellowship with condition in predicate expression 8 | new_fellowship_1 = [member if (len(member) >= 7) else '' for member in fellowship] 9 | 10 | # Print the new list 11 | print(new_fellowship) 12 | 13 | # Print the new list 14 | print(new_fellowship_1) 15 | 16 | # -----------------------------# 17 | 18 | # Create dict comprehension: new_fellowship_dict 19 | new_fellowship_dict = {member: len(member) for member in fellowship} 20 | 21 | # Print the new list 22 | print(new_fellowship_dict) 23 | -------------------------------------------------------------------------------- /src/core/py_enumeration_example.py: -------------------------------------------------------------------------------- 1 | # Create a list of strings: mutants 2 | mutants = ['charles xavier', 3 | 'bobby drake', 4 | 'kurt wagner', 5 | 'max eisenhardt', 6 | 'kitty pride'] 7 | 8 | # Create a list of tuples: mutant_list 9 | mutant_list = enumerate(mutants) 10 | 11 | # Print the list of tuples 12 | print(list(mutant_list)) 13 | 14 | print('----------\n') 15 | 16 | # Unpack and print the tuple pairs 17 | for index1, value1 in enumerate(mutants): 18 | print(index1, value1) 19 | 20 | print('----------\n') 21 | 22 | # Change the start index 23 | for index2, value2 in enumerate(mutants, start=1): 24 | print(index2, value2) 25 | -------------------------------------------------------------------------------- /src/core/py_filter_example.py: -------------------------------------------------------------------------------- 1 | # Import reduce from functools 2 | from functools import reduce 3 | 4 | # Create a list of strings: stark 5 | stark = ['robb', 'sansa', 'arya', 'eddard', 'jon'] 6 | 7 | # Use reduce() to apply a lambda function over stark: result 8 | result = reduce(lambda item1, item2: item1 + item2, stark) 9 | 10 | # Print the result 11 | print(result) 12 | -------------------------------------------------------------------------------- /src/core/py_generators.py: -------------------------------------------------------------------------------- 1 | # Create a list of strings 2 | lannister = ['cersei', 'jaime', 'tywin', 'tyrion', 'joffrey'] 3 | 4 | 5 | # Define generator function get_lengths 6 | def get_lengths(input_list): 7 | """Generator function that yields the 8 | length of the strings in input_list.""" 9 | 10 | # Yield the length of a string 11 | for person in input_list: 12 | yield len(person) 13 | 14 | 15 | # Print the values generated by get_lengths() 16 | for value in get_lengths(lannister): 17 | print(value) 18 | -------------------------------------------------------------------------------- /src/core/py_iterable_and_iterator.py: 
-------------------------------------------------------------------------------- 1 | # An ITERABLE is: 2 | # 3 | # anything that can be looped over (i.e. you can loop over a string or file) or 4 | # anything that can appear on the right-side of a for-loop: for x in iterable: ... or 5 | # anything you can call with iter() that will return an ITERATOR: iter(obj) or 6 | # an object that defines __iter__ that returns a fresh ITERATOR, 7 | # or it may have a __getitem__ method suitable for indexed lookup. 8 | # 9 | # An ITERATOR is an object: 10 | # 11 | # with state that remembers where it is during iteration, 12 | # with a __next__ method that: 13 | # returns the next value in the iteration 14 | # updates the state to point at the next value 15 | # signals when it is done by raising StopIteration 16 | # and that is self-iterable (meaning that it has an __iter__ method that returns self). 17 | # 18 | # Notes: 19 | # 20 | # The __next__ method in Python 3 is spelt next in Python 2, and 21 | # The builtin function next() calls that method on the object passed to it. 22 | # 23 | # EXAMPLES :: 24 | 25 | # s is a str object that is immutable 26 | # s has no state 27 | # s has a __getitem__() method 28 | s = 'cat' # s is an ITERABLE 29 | print(next(s)) # TypeError: 'str' object is not an iterator 30 | 31 | # t has state (it starts by pointing at the "c" 32 | # t has a next() method and an __iter__() method 33 | t = iter(s) # t is an ITERATOR 34 | 35 | next(t) # the next() function returns the next value and advances the state 36 | next(t) # the next() function returns the next value and advances 37 | next(t) # the next() function returns the next value and advances 38 | next(t) # next() raises StopIteration to signal that iteration is complete 39 | 40 | # >>> iter(t) is t # the iterator is self-iterable 41 | -------------------------------------------------------------------------------- /src/core/py_regex.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | mytext = str([ 4 | '20080620033027/http://www.mrvc.indianrail.gov.in/overview.htm). _Official webpage of Mumbai Railway Vikas Corporation_. Archived from [the original](http://www.mrvc.indianrail.gov.in/overview.htm) on 2008-06-20. 
Retrieved 2008-12-11.']) 5 | 6 | myregex = r'(?:[a-zA-Z0-9](?:[a-zA-Z0-9\-]{,61}[a-zA-Z0-9])?\.)+[a-zA-Z]{2,6}' 7 | x = re.findall(myregex, mytext) 8 | 9 | result = [] 10 | for res in x: 11 | result.append(res.replace("www.", "").split('//')[-1].split('/')[0]) 12 | 13 | print(';'.join(result)) 14 | -------------------------------------------------------------------------------- /src/db/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saranshbansal/data-science-with-python/651e67d93f8aa65ce386978684523f7e14cd3b2a/src/db/__init__.py -------------------------------------------------------------------------------- /src/db/py_mongo_integration.py: -------------------------------------------------------------------------------- 1 | import pprint 2 | 3 | import pymongo 4 | from bson import ObjectId 5 | from pymongo import MongoClient 6 | 7 | MONGODB_HOST = 'localhost' 8 | MONGODB_PORT = 27017 9 | DB_NAME = 'testdb' 10 | COLLECTION_TEST = 'collection_1' 11 | COLLECTION_PROFILES = 'profiles' 12 | 13 | test_data = { 14 | 'title': 'My First MongoDB document', 15 | 'author': 'Saransh Bansal', 16 | 'likes': 100, 17 | } 18 | 19 | user_profiles = [{'user_id': 211, 'name': 'Luke'}, {'user_id': 212, 'name': 'Ziltoid'}] 20 | 21 | 22 | class MongoUtil(): 23 | db = None 24 | 25 | def __init__(self): 26 | client = MongoClient(MONGODB_HOST, MONGODB_PORT) 27 | self.db = client.DB_NAME 28 | 29 | def connect_to_mongo(self): 30 | return self.db 31 | 32 | def print_collection(self, coll_name): 33 | mycol = self.db[coll_name] 34 | print(mycol) 35 | 36 | def insert_document(self, coll_name, document=None): 37 | mycol = self.db[coll_name] 38 | mycol.insert_one(document) 39 | 40 | def update_document(self, coll_name, obj_id): 41 | mycol = self.db[coll_name] 42 | mycol.update_one({'_id': ObjectId(obj_id)}, {"$set": {"title": "abc"}}) 43 | 44 | def print_all(self, coll_name): 45 | results = self.db[coll_name].find() 46 | for node in results: 47 | pprint.pprint(node) 48 | 49 | def count_documents(self, coll_name): 50 | mycol = self.db[coll_name] 51 | count = mycol.count_documents({}) 52 | print(count) 53 | 54 | def create_profiles(self, coll_name): 55 | self.db[coll_name].insert_many(user_profiles) 56 | 57 | def create_index(self, coll_name, index_col): 58 | self.db[coll_name].create_index([(index_col, pymongo.ASCENDING)], 59 | unique=True) 60 | print(sorted(list(self.db[coll_name].index_information()))) 61 | 62 | 63 | instance = MongoUtil() 64 | 65 | instance.print_collection(COLLECTION_TEST) 66 | 67 | # instance.insert_document(COLLECTION_TEST, test_data) 68 | 69 | # instance.create_profiles(COLLECTION_PROFILES) 70 | 71 | instance.print_all(COLLECTION_TEST) 72 | 73 | print('\n') 74 | 75 | instance.print_all(COLLECTION_PROFILES) 76 | 77 | print('\n') 78 | 79 | instance.count_documents(COLLECTION_TEST) 80 | 81 | # instance.update_document(COLLECTION_TEST, '5d2c87e03c30f6680050c521') 82 | 83 | instance.create_index(COLLECTION_PROFILES, 'user_id') 84 | -------------------------------------------------------------------------------- /src/db/py_sql.py: -------------------------------------------------------------------------------- 1 | # Import necessary module 2 | import pandas as pd 3 | from sqlalchemy import create_engine 4 | 5 | from helper import path 6 | 7 | # Create engine: engine 8 | engine = create_engine('sqlite:///' + path + 'Chinook.sqlite') 9 | 10 | # Save the table names to a list: table_names 11 | table_names = engine.table_names() 12 
| 13 | # Print the table names to the shell 14 | print(table_names) 15 | 16 | # Open engine connection: con 17 | con = engine.connect() 18 | 19 | # Perform query: rs 20 | rs = con.execute('select * from Album') 21 | 22 | # Save results of the query to DataFrame: df 23 | df1 = pd.DataFrame(rs.fetchall()) 24 | 25 | # Close connection 26 | con.close() 27 | 28 | # Print head of DataFrame df 29 | print(df1.head()) 30 | 31 | # --------------------------------------- 32 | 33 | # Perform query and save results to DataFrame: df 34 | with engine.connect() as con: 35 | rs = con.execute('select LastName, Title from Employee') 36 | df2 = pd.DataFrame(rs.fetchmany(size=3)) 37 | df2.columns = rs.keys() 38 | 39 | # Print the length of the DataFrame df 40 | print(len(df2)) 41 | 42 | # Print the head of the DataFrame df 43 | print(df2.head()) 44 | 45 | # --------------------------------------- 46 | # Open engine in context manager 47 | # Perform query and save results to DataFrame: df 48 | with engine.connect() as con: 49 | rs = con.execute('select * from Employee where EmployeeId >= 6') 50 | df3 = pd.DataFrame(rs.fetchall()) 51 | df3.columns = rs.keys() 52 | 53 | # Print the head of the DataFrame df 54 | print(df3.head()) 55 | -------------------------------------------------------------------------------- /src/db/py_sql_with_pandas.py: -------------------------------------------------------------------------------- 1 | # Import necessary module 2 | import random 3 | from datetime import datetime 4 | 5 | import pandas as pd 6 | from sqlalchemy import create_engine 7 | 8 | from helper import path 9 | 10 | # Create engine: engine 11 | engine = create_engine('sqlite:///' + path + 'Chinook.sqlite'); 12 | 13 | # Execute query and store records in DataFrame: df 14 | df1 = pd.read_sql_query('select * from Album', engine) 15 | 16 | # Execute query and store records in DataFrame: df 17 | df2 = pd.read_sql_query('select * from Employee where EmployeeId >= 6 order by BirthDate', engine) 18 | 19 | df3 = pd.read_sql_query('select Title, Name from Album al inner join Artist ar on al.ArtistID=ar.ArtistID', engine) 20 | 21 | df4 = pd.read_sql_query( 22 | 'select * from PlaylistTrack INNER JOIN Track on PlaylistTrack.TrackId = Track.TrackId where Milliseconds < 250000', 23 | engine) 24 | 25 | rand_dates = [datetime(random.randrange(2000, 2001), random.randrange(1, 6), random.randrange(1, 3)) for d in 26 | range(0, len(df4))] 27 | df4['dates'] = rand_dates 28 | 29 | df4 = df4.loc[:, ~df4.columns.duplicated()] 30 | # Print head of DataFrame 31 | # print(df1.head()) 32 | # print(df2.head()) 33 | # print(df3.head()) 34 | # print(df4.head()) 35 | df4 = df4.groupby([df4.dates.dt.year.rename('year'), df4.dates.dt.month.rename('month')]).size() 36 | print(df4) 37 | # df4.groupby(df4.dates).agg({'count'}) 38 | # print(df4.columns) 39 | # print(df4['dates'].value_counts()) 40 | 41 | -------------------------------------------------------------------------------- /src/file_operations/py_corrupt_file_read.py: -------------------------------------------------------------------------------- 1 | # Import matplotlib.pyplot as plt 2 | import matplotlib.pyplot as plt 3 | import pandas as pd 4 | 5 | from helper import path 6 | 7 | # Assign filename: file 8 | file = 'titanic_corrupt.txt' 9 | 10 | # Import file: data 11 | data = pd.read_csv(path + file, sep='\t', comment='#', na_values=['NA', 'NaN', "Nothing"]) 12 | 13 | # Print the head of the DataFrame 14 | print(data.head()) 15 | 16 | # Plot 'Age' variable in a histogram 17 | 
pd.DataFrame.hist(data[['Age']]) 18 | plt.xlabel('Age (years)') 19 | plt.ylabel('count') 20 | plt.show() 21 | -------------------------------------------------------------------------------- /src/file_operations/py_default_file_read_1.py: -------------------------------------------------------------------------------- 1 | from helper import path 2 | 3 | # Open a file: file 4 | file = open(path + 'moby_dick.txt', mode='r') 5 | 6 | # Print it 7 | print(file.read()) 8 | 9 | # Check whether file is closed 10 | print(file.closed) 11 | 12 | # Close file 13 | file.close() 14 | 15 | # Check whether file is closed 16 | print(file.closed) 17 | 18 | # Importing text file_operations line by line 19 | 20 | # Read & print the first 3 lines 21 | with open(path + 'moby_dick.txt') as file: 22 | print(file.readline()) 23 | print(file.readline()) 24 | print(file.readline()) 25 | 26 | 27 | -------------------------------------------------------------------------------- /src/file_operations/py_numpy_file_read_1.py: -------------------------------------------------------------------------------- 1 | # Import package 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | from helper import path 6 | 7 | # Assign filename to variable: file 8 | file = 'digits.csv' 9 | 10 | # Load file as array: digits 11 | digits = np.loadtxt(path + file, delimiter=',', dtype=str) 12 | 13 | # Print datatype of digits 14 | print(type(digits)) 15 | 16 | # Select and reshape a row 17 | im = digits[21, 1:] 18 | im_sq = np.reshape(im, (8, 98)) 19 | 20 | # Plot reshaped data (matplotlib.pyplot already loaded as plt) 21 | plt.imshow(im_sq, cmap='Greys', interpolation='nearest') 22 | plt.show() 23 | -------------------------------------------------------------------------------- /src/file_operations/py_numpy_file_read_2.py: -------------------------------------------------------------------------------- 1 | # Import package 2 | import numpy as np 3 | import pandas as pd 4 | 5 | from helper import path 6 | 7 | # Assign the filename: file 8 | file = 'amis.csv' 9 | 10 | # Load the data: data 11 | data = np.loadtxt(path + file, delimiter=',', skiprows=1, usecols=[1, 3]) 12 | 13 | # Print data 14 | print(data) 15 | 16 | # Read flat file to data frame 17 | 18 | # Read the first 5 rows of the file into a DataFrame: data 19 | data = pd.read_csv(path + file, nrows=5, header=None) 20 | 21 | # Build a numpy array from the DataFrame: data_array 22 | data_array = np.array(data) 23 | 24 | # Print the datatype of data_array to the shell 25 | print(type(data_array)) 26 | -------------------------------------------------------------------------------- /src/file_operations/py_pandas_excel_read.py: -------------------------------------------------------------------------------- 1 | # Listing sheets in Excel file_operations 2 | # 3 | # Whether you like it or not, any working data scientist will need to deal with Excel spreadsheets at some point in 4 | # time. You won't always want to do so in Excel, however! 5 | # 6 | # Here, you'll learn how to use pandas to import Excel spreadsheets and how to list the names of the sheets in any 7 | # loaded .xlsx file. 8 | # 9 | # Recall from the video that, given an Excel file imported into a variable spreadsheet, you can retrieve a list of 10 | # the sheet names using the attribute spreadsheet.sheet_names. 11 | # 12 | # Specifically, you'll be loading and checking out the spreadsheet 'battledeath.xlsx', modified from the Peace 13 | # Research Institute Oslo's (PRIO) dataset. 
This data contains age-adjusted mortality rates due to war in various 14 | # countries over several years. 15 | 16 | # Import pandas 17 | import pandas as pd 18 | 19 | from helper import path 20 | 21 | # Assign spreadsheet filename: file 22 | file = 'battledeath.xlsx' 23 | 24 | # Load spreadsheet: xl 25 | xl = pd.ExcelFile(path + file) 26 | 27 | # Print sheet names 28 | print(xl.sheet_names) 29 | 30 | # ------------------------------------------- 31 | # Load a sheet into a DataFrame by name: df1 32 | df1 = xl.parse('2004') 33 | 34 | # Print the head of the DataFrame df1 35 | print(df1.head()) 36 | 37 | # Load a sheet into a DataFrame by index: df2 38 | # parse_args :: sheet index/name | skiprows | custom column names | parse_cols columns to show 39 | df2 = xl.parse(0, skiprows=[0]) 40 | 41 | # Print the head of the DataFrame df2 42 | print(df2.head()) 43 | 44 | # PS: both are ~ same! 45 | -------------------------------------------------------------------------------- /src/file_operations/py_pandas_file_read_1.py: -------------------------------------------------------------------------------- 1 | # Import pandas as pd 2 | import pandas as pd 3 | from helper import path 4 | # Assign the filename: file 5 | file = 'digits.csv' 6 | 7 | # Read the file into a DataFrame: df 8 | df = pd.read_csv(file) 9 | 10 | # View the head of the DataFrame 11 | print(df.head()) 12 | -------------------------------------------------------------------------------- /src/file_operations/py_pandas_read_csv.py: -------------------------------------------------------------------------------- 1 | # Import pandas as pd 2 | import pandas as pd 3 | 4 | # Import the cars.csv data: cars 5 | cars = pd.DataFrame(pd.read_csv("../_datasets/cars.csv")) 6 | tweets = pd.DataFrame(pd.read_csv("../_datasets/tweets.csv")) 7 | jobs = pd.DataFrame(pd.read_csv("../_datasets/Information_gain_job_advertisements.csv")) 8 | industries = pd.DataFrame(pd.read_json("../_datasets/industries.json")) 9 | 10 | # Print out cars 11 | print(cars.describe()) 12 | 13 | # Print out tweets 14 | print(tweets.keys()) 15 | 16 | # Print all columns of industries 17 | print(list(industries.resultList)) 18 | -------------------------------------------------------------------------------- /src/file_operations/py_pickle_read_test.py: -------------------------------------------------------------------------------- 1 | # Save a dictionary into a pickle file. 2 | import pickle 3 | 4 | from helper import path 5 | 6 | d = {'Aug': '85', 'Airline': '8', 'June': '69.4', 'Mar': '84.4'} 7 | pickle.dump(d, open(path + "data.pk1", "wb")) 8 | 9 | # Load the dictionary back from the pickle file. 
10 | d = pickle.load(open(path + 'data.pk1', "rb")) 11 | 12 | print(d) 13 | 14 | print(type(d)) 15 | -------------------------------------------------------------------------------- /src/file_operations/py_read_hdf5_file.py: -------------------------------------------------------------------------------- 1 | import h5py 2 | 3 | from helper import path 4 | 5 | # Assign filename: file 6 | file = 'NEONDS.hdf5' 7 | 8 | # Load file: data 9 | data = h5py.File(path + file, 'r') 10 | 11 | # Print the datatype of the loaded file 12 | print(type(data)) 13 | 14 | # Print the keys of the file 15 | for key in data.keys(): 16 | print(data[key]) 17 | -------------------------------------------------------------------------------- /src/file_operations/py_read_matlab_file.py: -------------------------------------------------------------------------------- 1 | # Import package 2 | import scipy.io 3 | import matplotlib.pyplot as plt 4 | import numpy as np 5 | 6 | from helper import path 7 | 8 | # Load MATLAB file: mat 9 | mat = scipy.io.loadmat(path + 'albeck_gene_expression.mat') 10 | 11 | # Print the datatype type of mat 12 | print(type(mat)) 13 | 14 | # Print the keys of the MATLAB dictionary 15 | print(mat.keys()) 16 | 17 | # Print the type of the value corresponding to the key 'CYratioCyt' 18 | print(type(mat['fret'])) 19 | 20 | # Print the shape of the value corresponding to the key 'CYratioCyt' 21 | print(np.shape(mat['fret'])) 22 | 23 | # Subset the array and plot it 24 | data = mat['fret'][25, 5:] 25 | fig = plt.figure() 26 | plt.plot(data) 27 | plt.xlabel('time (min.)') 28 | plt.ylabel('normalized fluorescence (measure of expression)') 29 | plt.show() 30 | 31 | -------------------------------------------------------------------------------- /src/file_operations/py_read_sas_file.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import matplotlib.pyplot as plt 3 | 4 | # Import sas7bdat package 5 | from sas7bdat import SAS7BDAT 6 | 7 | from helper import path 8 | 9 | # Save file to a DataFrame: df_sas 10 | with SAS7BDAT(path+'sales.sas7bdat') as file: 11 | df_sas = file.to_data_frame() 12 | 13 | # Print head of DataFrame 14 | print((df_sas.head())) 15 | 16 | # Plot histogram of DataFrame features (pandas and pyplot already imported) 17 | pd.DataFrame.hist(df_sas[['P']]) 18 | plt.ylabel('count') 19 | plt.show() 20 | -------------------------------------------------------------------------------- /src/file_operations/py_read_stata_file.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import pandas as pd 3 | 4 | # Import sas7bdat package 5 | 6 | from helper import path 7 | 8 | df = pd.read_stata(path + 'disarea.dta', 'rb') 9 | 10 | # Print head of DataFrame 11 | print(df.head()) 12 | 13 | 14 | # Plot histogram of DataFrame features (pandas and pyplot already imported) 15 | def plot(key): 16 | if key not in ['wbcode', 'country']: 17 | pd.DataFrame.hist(df[[key]]) 18 | plt.ylabel('count') 19 | plt.show() 20 | 21 | for key in df.keys(): 22 | plot(key) 23 | 24 | 25 | -------------------------------------------------------------------------------- /src/file_operations/py_test_loops_algo.py: -------------------------------------------------------------------------------- 1 | # Here are some comparisons of the performances for in, set and bisect. Note the time (in second) is in log scale. 
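# Added note (not from the original benchmark): the expected asymptotics behind this
# comparison -- membership in a plain list is O(n) per lookup, membership in a set is
# O(1) on average (hash lookup), and bisect_left on a pre-sorted list is O(log n) per
# lookup after a one-off O(n log n) sort, which is why the three curves separate as the
# list size grows.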
2 | 3 | import bisect 4 | import math 5 | import random 6 | import time 7 | 8 | import matplotlib.pyplot as plt 9 | 10 | 11 | def method_in(a, b, c): 12 | start_time = time.time() 13 | for i, x in enumerate(a): 14 | if x in b: 15 | c[i] = 1 16 | return time.time() - start_time 17 | 18 | 19 | def method_set_in(a, b, c): 20 | start_time = time.time() 21 | s = set(b) 22 | for i, x in enumerate(a): 23 | if x in s: 24 | c[i] = 1 25 | return time.time() - start_time 26 | 27 | 28 | def method_bisect(a, b, c): 29 | start_time = time.time() 30 | b.sort() 31 | for i, x in enumerate(a): 32 | index = bisect.bisect_left(b, x) 33 | if index < len(a): 34 | if x == b[index]: 35 | c[i] = 1 36 | return time.time() - start_time 37 | 38 | 39 | def profile(): 40 | time_method_in = [] 41 | time_method_set_in = [] 42 | time_method_bisect = [] 43 | 44 | Nls = [x for x in range(1000, 20000, 1000)] 45 | for N in Nls: 46 | a = [x for x in range(0, N)] 47 | random.shuffle(a) 48 | b = [x for x in range(0, N)] 49 | random.shuffle(b) 50 | c = [0 for x in range(0, N)] 51 | 52 | time_method_in.append(math.log(method_in(a, b, c))) 53 | time_method_set_in.append(math.log(method_set_in(a, b, c))) 54 | time_method_bisect.append(math.log(method_bisect(a, b, c))) 55 | 56 | plt.plot(Nls, time_method_in, marker='o', color='r', linestyle='-', label='in') 57 | plt.plot(Nls, time_method_set_in, marker='o', color='b', linestyle='-', label='set') 58 | plt.plot(Nls, time_method_bisect, marker='o', color='g', linestyle='-', label='bisect') 59 | plt.xlabel('list size', fontsize=18) 60 | plt.ylabel('log(time)', fontsize=18) 61 | plt.legend(loc='upper left') 62 | plt.show() 63 | 64 | 65 | profile() 66 | -------------------------------------------------------------------------------- /src/file_operations/read_in_chunks.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from helper import path 3 | 4 | 5 | # Define count_entries() 6 | def count_entries(csv_file, c_size, colname): 7 | """Return a dictionary with counts of 8 | occurrences as value for each key.""" 9 | 10 | # Initialize an empty dictionary: counts_dict 11 | counts_dict = {} 12 | 13 | # Iterate over the file chunk by chunk 14 | for chunk in pd.read_csv(csv_file, chunksize=c_size): 15 | 16 | # Iterate over the column in DataFrame 17 | for entry in chunk[colname]: 18 | if entry in counts_dict.keys(): 19 | counts_dict[entry] += 1 20 | else: 21 | counts_dict[entry] = 1 22 | 23 | # Return counts_dict 24 | return counts_dict 25 | 26 | 27 | # Call count_entries(): result_counts 28 | result_counts = count_entries(path + 'Information_gain_job_advertisements.csv', 10, 'Term') 29 | 30 | # Print result_counts 31 | print(result_counts) 32 | -------------------------------------------------------------------------------- /src/file_operations/read_tweets.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | import csv 5 | 6 | import tweepy # https://github.com/tweepy/tweepy 7 | 8 | # Twitter API credentials 9 | consumer_key = "" 10 | consumer_secret = "" 11 | access_key = "" 12 | access_secret = "" 13 | 14 | 15 | def get_all_tweets(screen_name): 16 | # Twitter only allows access to a users most recent 3240 tweets with this method 17 | 18 | # authorize twitter, initialize tweepy 19 | auth = tweepy.OAuthHandler(consumer_key, consumer_secret) 20 | auth.set_access_token(access_key, access_secret) 21 | api = tweepy.API(auth) 22 | 23 | # initialize a 
list to hold all the tweepy Tweets 24 | alltweets = [] 25 | 26 | # make initial request for most recent tweets (200 is the maximum allowed count) 27 | new_tweets = api.user_timeline(screen_name=screen_name, count=200) 28 | 29 | # save most recent tweets 30 | alltweets.extend(new_tweets) 31 | 32 | # save the id of the oldest tweet less one 33 | oldest = alltweets[-1].id - 1 34 | 35 | # keep grabbing tweets until there are no tweets left to grab 36 | while len(new_tweets) > 0: 37 | print 38 | "getting tweets before %s" % oldest 39 | 40 | # all subsiquent requests use the max_id param to prevent duplicates 41 | new_tweets = api.user_timeline(screen_name=screen_name, count=200, max_id=oldest) 42 | 43 | # save most recent tweets 44 | alltweets.extend(new_tweets) 45 | 46 | # update the id of the oldest tweet less one 47 | oldest = alltweets[-1].id - 1 48 | 49 | print 50 | "...%s tweets downloaded so far" % (len(alltweets)) 51 | 52 | # transform the tweepy tweets into a 2D array that will populate the csv 53 | outtweets = [[tweet.id_str, tweet.created_at, tweet.text.encode("utf-8")] for tweet in alltweets] 54 | 55 | # write the csv 56 | with open('%s_tweets.csv' % screen_name, 'wb') as f: 57 | writer = csv.writer(f) 58 | writer.writerow(["id", "created_at", "text"]) 59 | writer.writerows(outtweets) 60 | 61 | pass 62 | 63 | 64 | if __name__ == '__main__': 65 | # pass in the username of the account you want to download 66 | get_all_tweets("J_tsar") 67 | -------------------------------------------------------------------------------- /src/grains_data_from_dataset.py: -------------------------------------------------------------------------------- 1 | import csv 2 | 3 | import numpy as np 4 | 5 | from helper import path 6 | 7 | with open('../' + path + 'seeds-width-vs-length.csv', 'r') as f: 8 | grains = list(csv.reader(f, delimiter=',')) 9 | grains = np.array(grains).astype(np.float) -------------------------------------------------------------------------------- /src/misc/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saranshbansal/data-science-with-python/651e67d93f8aa65ce386978684523f7e14cd3b2a/src/misc/__init__.py -------------------------------------------------------------------------------- /src/misc/py_test_loops_algo.py: -------------------------------------------------------------------------------- 1 | # Here are some comparisons of the performances for in, set and bisect. Note the time (in second) is in log scale. 
2 | 3 | import bisect 4 | import math 5 | import random 6 | import time 7 | 8 | import matplotlib.pyplot as plt 9 | 10 | 11 | def method_in(a, b, c): 12 | start_time = time.time() 13 | for i, x in enumerate(a): 14 | if x in b: 15 | c[i] = 1 16 | return time.time() - start_time 17 | 18 | 19 | def method_set_in(a, b, c): 20 | start_time = time.time() 21 | s = set(b) 22 | for i, x in enumerate(a): 23 | if x in s: 24 | c[i] = 1 25 | return time.time() - start_time 26 | 27 | 28 | def method_bisect(a, b, c): 29 | start_time = time.time() 30 | b.sort() 31 | for i, x in enumerate(a): 32 | index = bisect.bisect_left(b, x) 33 | if index < len(a): 34 | if x == b[index]: 35 | c[i] = 1 36 | return time.time() - start_time 37 | 38 | 39 | def profile(): 40 | time_method_in = [] 41 | time_method_set_in = [] 42 | time_method_bisect = [] 43 | 44 | Nls = [x for x in range(1000, 20000, 1000)] 45 | for N in Nls: 46 | a = [x for x in range(0, N)] 47 | random.shuffle(a) 48 | b = [x for x in range(0, N)] 49 | random.shuffle(b) 50 | c = [0 for x in range(0, N)] 51 | 52 | time_method_in.append(math.log(method_in(a, b, c))) 53 | time_method_set_in.append(math.log(method_set_in(a, b, c))) 54 | time_method_bisect.append(math.log(method_bisect(a, b, c))) 55 | 56 | plt.plot(Nls, time_method_in, marker='o', color='r', linestyle='-', label='in') 57 | plt.plot(Nls, time_method_set_in, marker='o', color='b', linestyle='-', label='set') 58 | plt.plot(Nls, time_method_bisect, marker='o', color='g', linestyle='-', label='bisect') 59 | plt.xlabel('list size', fontsize=18) 60 | plt.ylabel('log(time)', fontsize=18) 61 | plt.legend(loc='upper left') 62 | plt.show() 63 | 64 | 65 | profile() 66 | -------------------------------------------------------------------------------- /src/misc/py_zip_example.py: -------------------------------------------------------------------------------- 1 | # Using zip 2 | # 3 | # Another interesting function that you've learned is zip(), which takes any number of iterables and 4 | # returns a zip object that is an iterator of tuples. If you wanted to print the values of a zip object, 5 | # you can convert it into a list and then print it. Printing just a zip object will not return the values unless you 6 | # unpack it first. In this exercise, you will explore this for yourself. 7 | # 8 | # Three lists of strings are pre-loaded: mutants, aliases, and powers. First, you will use list() and zip() on these 9 | # lists to generate a list of tuples. Then, you will create a zip object using zip(). Finally, you will unpack this 10 | # zip object in a for loop to print the values in each tuple. Observe the different output generated by printing the 11 | # list of tuples, then the zip object, and finally, the tuple values in the for loop. 
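# Added mini-example (not part of the exercise): a zip object is a one-shot iterator, so
# it is exhausted after a single pass, and zip(*pairs) "unzips" a list of tuples back
# into separate sequences.
pairs = list(zip(['a', 'b'], [1, 2]))  # [('a', 1), ('b', 2)]
letters, numbers = zip(*pairs)         # ('a', 'b') and (1, 2)
print(pairs, letters, numbers)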
12 | 13 | mutants = ['charles xavier', 14 | 'bobby drake', 15 | 'kurt wagner', 16 | 'max eisenhardt', 17 | 'kitty pride'] 18 | 19 | aliases = ['prof x', 'iceman', 'nightcrawler', 'magneto', 'shadowcat'] 20 | 21 | powers = ['telepathy', 22 | 'thermokinesis', 23 | 'teleportation', 24 | 'magnetokinesis', 25 | 'intangibility'] 26 | 27 | # Create a list of tuples: mutant_data 28 | mutant_data = list(zip(mutants, aliases, powers)) 29 | 30 | # Print the list of tuples 31 | print(mutant_data) 32 | 33 | # Create a zip object using the three lists: mutant_zip 34 | mutant_zip = zip(mutants, aliases, powers) 35 | 36 | # Print the zip object 37 | print(mutant_zip) 38 | 39 | # Unpack the zip object and print the tuple values 40 | for value1, value2, value3 in mutant_zip: 41 | print(value1, value2, value3) 42 | -------------------------------------------------------------------------------- /src/misc/random.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | 3 | a = [x for x in range(3, 13)] 4 | print(a) 5 | value = [] 6 | print(value[0] if value else 0) 7 | 8 | val = '2018,1' 9 | date_object = datetime.strptime(val, '%Y,%m') 10 | 11 | print(date_object.strftime("%b %y")) 12 | 13 | print('sdada' + '123') 14 | 15 | 16 | class B(object): 17 | def __init__(self): 18 | body = "aaa" 19 | self.context = { 20 | 'body': body, 21 | } 22 | 23 | 24 | class A(B): 25 | def __init__(self): 26 | super().__init__() 27 | self.context['body'] = self.context['body'] + "BBB" 28 | self.context = { 29 | **self.context, 30 | } 31 | print(self.context['body']) 32 | 33 | 34 | class C(B): 35 | def __init__(self): 36 | super().__init__() 37 | 38 | print(self.context['body']) 39 | 40 | 41 | b = B() 42 | a = A() 43 | c = C() 44 | 45 | data = { 46 | 'key': 100 47 | } 48 | 49 | print('{}\\xE2\\x80\\xAD\\xE2\\x80\\xAD'.format(data)) 50 | 51 | print('+381652522560') 52 | 53 | name = 'Larry Lam' 54 | if name: 55 | names = name.split(' ') 56 | given = names[0] 57 | family = names[len(names) - 1] 58 | print('{}, {}'.format(given, family)) 59 | 60 | 61 | print(' '.strip() or 'NA') 62 | 63 | -------------------------------------------------------------------------------- /src/misc/tensorflow_starter.py: -------------------------------------------------------------------------------- 1 | import math 2 | from builtins import print 3 | 4 | import numpy as np 5 | import pandas as pd 6 | import tensorflow as tf 7 | from sklearn import metrics 8 | from tensorflow.python.data import Dataset 9 | 10 | tf.logging.set_verbosity(tf.logging.ERROR) 11 | pd.options.display.max_rows = 10 12 | pd.options.display.float_format = '{:.1f}'.format 13 | 14 | california_housing_dataframe = pd.read_csv('https://storage.googleapis.com/mledu-datasets/california_housing_train.csv', 15 | sep=",") 16 | california_housing_dataframe = california_housing_dataframe.reindex( 17 | np.random.permutation(california_housing_dataframe.index)) 18 | california_housing_dataframe["median_house_value"] /= 1000.0 19 | california_housing_dataframe 20 | 21 | # Define the input feature: total_rooms. 22 | my_feature = california_housing_dataframe[["total_rooms"]] 23 | 24 | # Configure a numeric feature column for total_rooms. 25 | feature_columns = [tf.feature_column.numeric_column("total_rooms")] 26 | 27 | # Define the label. 28 | targets = california_housing_dataframe["median_house_value"] 29 | 30 | # Use gradient descent as the optimizer for training the model. 
31 | my_optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.0000001) 32 | my_optimizer = tf.contrib.estimator.clip_gradients_by_norm(my_optimizer, 5.0) 33 | 34 | # Configure the linear regression model with our feature columns and optimizer. 35 | # Set a learning rate of 0.0000001 for Gradient Descent. 36 | linear_regressor = tf.estimator.LinearRegressor( 37 | feature_columns=feature_columns, 38 | optimizer=my_optimizer 39 | ) 40 | 41 | 42 | def my_input_fn(features, targets, batch_size=1, shuffle=True, num_epochs=None): 43 | """Trains a linear regression model of one feature. 44 | 45 | Args: 46 | features: pandas DataFrame of features 47 | targets: pandas DataFrame of targets 48 | batch_size: Size of batches to be passed to the model 49 | shuffle: True or False. Whether to shuffle the data. 50 | num_epochs: Number of epochs for which data should be repeated. None = repeat indefinitely 51 | Returns: 52 | Tuple of (features, labels) for next data batch 53 | """ 54 | 55 | # Convert pandas data into a dict of np arrays. 56 | features = {key: np.array(value) for key, value in dict(features).items()} 57 | 58 | # Construct a dataset, and configure batching/repeating. 59 | ds = Dataset.from_tensor_slices((features, targets)) # warning: 2GB limit 60 | ds = ds.batch(batch_size).repeat(num_epochs) 61 | 62 | # Shuffle the data, if specified. 63 | if shuffle: 64 | ds = ds.shuffle(buffer_size=10000) 65 | 66 | # Return the next batch of data. 67 | features, labels = ds.make_one_shot_iterator().get_next() 68 | return features, labels 69 | 70 | 71 | _ = linear_regressor.train( 72 | input_fn=lambda: my_input_fn(my_feature, targets), 73 | steps=100 74 | ) 75 | 76 | # Create an input function for predictions. 77 | # Note: Since we're making just one prediction for each example, we don't 78 | # need to repeat or shuffle the data here. 79 | prediction_input_fn =lambda: my_input_fn(my_feature, targets, num_epochs=1, shuffle=False) 80 | 81 | # Call predict() on the linear_regressor to make predictions. 82 | predictions = linear_regressor.predict(input_fn=prediction_input_fn) 83 | 84 | # Format predictions as a NumPy array, so we can calculate error metrics. 85 | predictions = np.array([item['predictions'][0] for item in predictions]) 86 | 87 | # Print Mean Squared Error and Root Mean Squared Error. 
88 | mean_squared_error = metrics.mean_squared_error(predictions, targets) 89 | root_mean_squared_error = math.sqrt(mean_squared_error) 90 | print("Mean Squared Error (on training data): %0.3f" % mean_squared_error) 91 | print("Root Mean Squared Error (on training data): %0.3f" % root_mean_squared_error) -------------------------------------------------------------------------------- /src/ml-supervised/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saranshbansal/data-science-with-python/651e67d93f8aa65ce386978684523f7e14cd3b2a/src/ml-supervised/__init__.py -------------------------------------------------------------------------------- /src/ml-supervised/course-description.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saranshbansal/data-science-with-python/651e67d93f8aa65ce386978684523f7e14cd3b2a/src/ml-supervised/course-description.png -------------------------------------------------------------------------------- /src/ml-supervised/course-description.rtf: -------------------------------------------------------------------------------- 1 | {\rtf1\ansi\ansicpg1252\cocoartf1561\cocoasubrtf200 2 | {\fonttbl\f0\fswiss\fcharset0 Helvetica;} 3 | {\colortbl;\red255\green255\blue255;\red85\green92\blue98;\red255\green255\blue255;} 4 | {\*\expandedcolortbl;;\cssrgb\c40784\c43529\c45882;\cssrgb\c100000\c100000\c100000;} 5 | \margl1440\margr1440\vieww10800\viewh8400\viewkind0 6 | \deftab720 7 | \pard\pardeftab720\partightenfactor0 8 | 9 | \f0\fs30 \cf2 \cb3 \expnd0\expndtw0\kerning0 10 | At the end of day, the value of Data Scientists rests on their ability to describe the world and to make predictions. Machine Learning is the field of teaching machines and computers to learn from existing data to make predictions on new data - will a given tumor be benign or malignant? Which of your customers will take their business elsewhere? Is a particular email spam or not? In this course, you'll learn how to use Python to perform supervised learning, an essential component of Machine Learning. You'll learn how to build predictive models, how to tune their parameters and how to tell how well they will perform on unseen data, all the while using real world datasets. You'll do so using scikit-learn, one of the most popular and user-friendly machine learning libraries for Python.} -------------------------------------------------------------------------------- /src/ml-supervised/k-fold_cross_validation.py: -------------------------------------------------------------------------------- 1 | # 5-fold cross-validation 2 | # 3 | # Cross-validation is a vital step in evaluating a model. It maximizes the amount of data 4 | # that is used to train the model, as during the course of training, the model is not only trained, but also tested 5 | # on all of the available data. 6 | # 7 | # In this exercise, you will practice 5-fold cross validation on the Gapminder data. By default, scikit-learn's 8 | # cross_val_score() function uses R2 as the metric of choice for regression. Since you are performing 5-fold 9 | # cross-validation, the function will return 5 scores. Your job is to compute these 5 scores and then take their 10 | # average. 11 | # 12 | # The DataFrame has been loaded as df and split into the feature/target variable arrays X and y. The modules pandas 13 | # and numpy have been imported as pd and np, respectively. 
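# Illustrative sketch (not part of the exercise): roughly what cross_val_score() does
# behind the scenes with cv=5 -- split the data into 5 folds, fit on 4 of them, score on
# the held-out fold, and collect the 5 scores. Shown on tiny synthetic data so it runs
# on its own; the underscore-prefixed names are made up for this sketch.
import numpy as _np
from sklearn.linear_model import LinearRegression as _LinearRegression
from sklearn.model_selection import KFold as _KFold

_X = _np.arange(20, dtype=float).reshape(-1, 1)
_y = 3 * _X.ravel() + 1
_scores = []
for _train_idx, _test_idx in _KFold(n_splits=5).split(_X):
    _model = _LinearRegression().fit(_X[_train_idx], _y[_train_idx])
    _scores.append(_model.score(_X[_test_idx], _y[_test_idx]))
print("Manual 5-fold R^2 scores (synthetic data):", _scores)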
14 | 15 | # Import the necessary modules 16 | import numpy as np 17 | import pandas as pd 18 | from sklearn.linear_model import LinearRegression 19 | from sklearn.model_selection import cross_val_score 20 | 21 | from helper import path 22 | 23 | # Read the CSV file into a DataFrame: df 24 | df = pd.read_csv(path + 'gm_2008_region.csv') 25 | 26 | # Create arrays for features and target variable 27 | X = df['fertility'].values 28 | y = df['life'].values 29 | 30 | # Reshape X and y 31 | X = X.reshape(-1, 1) 32 | y = y.reshape(-1, 1) 33 | 34 | # Create a linear regression object: reg 35 | reg = LinearRegression() 36 | 37 | # Compute 5-fold cross-validation scores: cv_scores 38 | cv_scores = cross_val_score(reg, X, y, cv=5) 39 | 40 | # Print the 5-fold cross-validation scores 41 | print(cv_scores) 42 | 43 | print("Average 5-Fold CV Score: {}".format(np.mean(cv_scores))) 44 | 45 | # --------------------------------- 46 | 47 | # Test time for 3-fold & 10 fold operations :: %timeit cross_val_score(reg, X, y, cv = ____) 48 | # Perform 3-fold CV 49 | cvscores_3 = cross_val_score(reg, X, y, cv=3) 50 | print("Average 3-Fold CV Score: {}".format(np.mean(cvscores_3))) 51 | 52 | # Perform 10-fold CV 53 | cvscores_10 = cross_val_score(reg, X, y, cv=10) 54 | print("Average 10-Fold CV Score: {}".format(np.mean(cvscores_10))) 55 | -------------------------------------------------------------------------------- /src/ml-supervised/ml_centering_and_scaling.py: -------------------------------------------------------------------------------- 1 | # Centering and scaling your data 2 | # 3 | # In the video, Hugo demonstrated how significantly the performance of a model can 4 | # improve if the features are scaled. Note that this is not always the case: In the Congressional voting records 5 | # dataset, for example, all of the features are binary. In such a situation, scaling will have minimal impact. 6 | # 7 | # You will now explore scaling for yourself on a new dataset - White Wine Quality! Hugo used the Red Wine Quality 8 | # dataset in the video. We have used the 'quality' feature of the wine to create a binary target variable: If 9 | # 'quality' is less than 5, the target variable is 1, and otherwise, it is 0. 10 | # 11 | # The DataFrame has been pre-loaded as df, along with the feature and target variable arrays X and y. Explore it in 12 | # the IPython Shell. Notice how some features seem to have different units of measurement. 'density', for instance, 13 | # only takes values between 0 and 1, while 'total sulfur dioxide' has a maximum value of 289. As a result, 14 | # it may be worth scaling the features here. Your job in this exercise is to scale the features and compute the mean 15 | # and standard deviation of the unscaled features compared to the scaled features. 
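# Tiny added example (not part of the exercise): what scale() actually does -- subtract
# each column's mean and divide by its standard deviation, so every feature ends up with
# mean ~0 and standard deviation ~1, regardless of its original units.
import numpy as _np
from sklearn.preprocessing import scale as _scale

_toy = _np.array([[1.0, 100.0],
                  [2.0, 200.0],
                  [3.0, 300.0]])
print(_scale(_toy))               # columns standardized despite very different scales
print(_scale(_toy).mean(axis=0))  # ~[0. 0.]
print(_scale(_toy).std(axis=0))   # ~[1. 1.]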
16 | 17 | 18 | # Import scale 19 | import numpy as np 20 | import pandas as pd 21 | from sklearn.preprocessing import scale 22 | 23 | from helper import path 24 | 25 | # Read 'white-wine.csv' into a DataFrame: df 26 | df = pd.read_csv(path + 'white-wine.csv') 27 | 28 | X = df.drop('quality', axis=1) 29 | y = df['quality'] 30 | 31 | # Scale the features: X_scaled 32 | X_scaled = scale(X) 33 | 34 | # Print the mean and standard deviation of the unscaled features 35 | print("Mean of Unscaled Features: {}".format(np.mean(X))) 36 | print("Standard Deviation of Unscaled Features: {}".format(np.std(X))) 37 | 38 | # Print the mean and standard deviation of the scaled features 39 | print("Mean of Scaled Features: {}".format(np.mean(X_scaled))) 40 | print("Standard Deviation of Scaled Features: {}".format(np.std(X_scaled))) 41 | 42 | # ----------- 43 | 44 | 45 | # Import the necessary modules 46 | from sklearn.preprocessing import StandardScaler 47 | from sklearn.pipeline import Pipeline 48 | from sklearn.neighbors import KNeighborsClassifier 49 | from sklearn.model_selection import train_test_split 50 | 51 | # Setup the pipeline steps: steps 52 | steps = [('scaler', StandardScaler()), 53 | ('knn', KNeighborsClassifier())] 54 | 55 | # Create the pipeline: pipeline 56 | pipeline = Pipeline(steps) 57 | 58 | # Create train and test sets 59 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42) 60 | 61 | # Fit the pipeline to the training set: knn_scaled 62 | knn_scaled = pipeline.fit(X_train, y_train) 63 | 64 | # Instantiate and fit a k-NN classifier to the unscaled data 65 | knn_unscaled = KNeighborsClassifier().fit(X_train, y_train) 66 | 67 | # Compute and print metrics 68 | print('Accuracy with Scaling: {}'.format(knn_scaled.score(X_test, y_test))) 69 | print('Accuracy without Scaling: {}'.format(knn_unscaled.score(X_test, y_test))) 70 | -------------------------------------------------------------------------------- /src/ml-supervised/ml_manually_remove_missing_data.py: -------------------------------------------------------------------------------- 1 | # Regression with categorical features 2 | # 3 | # Having created the dummy variables from the 'Region' feature, you can build 4 | # regression models as you did before. Here, you'll use ridge regression to perform 5-fold cross-validation. 5 | # The feature array X and target variable array y have been pre-loaded. 6 | # 7 | # Dropping missing data 8 | # 9 | # The voting dataset from Chapter 1 contained a bunch of missing values that we dealt with for 10 | # you behind the scenes. Now, it's time for you to take care of these yourself! 11 | # 12 | # The unprocessed dataset has been loaded into a DataFrame df. Explore it in the IPython Shell with the .head() 13 | # method. You will see that there are certain data points labeled with a '?'. These denote missing values. As you saw 14 | # in the video, different datasets encode missing values in different ways. Sometimes it may be a '9999', 15 | # other times a 0 - real-world data can be very messy! If you're lucky, the missing values will already be encoded as 16 | # NaN. We use NaN because it is an efficient and simplified way of internally representing missing data, and it lets 17 | # us take advantage of pandas methods such as .dropna() and .fillna(), as well as scikit-learn's Imputation 18 | # transformer Imputer(). 19 | # In this exercise, your job is to convert the '?'s to NaNs, and then drop the rows that contain them from the 20 | # DataFrame. 
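# Illustrative aside: a toy example (hypothetical data, not the voting or Gapminder datasets) of
# marking '?' entries as NaN and then either dropping or imputing the affected rows.
import numpy as np
import pandas as pd

toy = pd.DataFrame({'vote1': ['y', '?', 'n'], 'vote2': ['?', 'n', 'y']})
toy = toy.replace('?', np.nan)        # same effect as toy[toy == '?'] = np.nan
print(toy.dropna().shape)             # (1, 2): every row containing a NaN is removed
print(toy.fillna('n'))                # alternative: impute a value instead of dropping rows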
21 | 22 | import matplotlib.pyplot as plt 23 | import numpy as np 24 | import pandas as pd 25 | 26 | from helper import path 27 | 28 | # Read 'gapminder.csv' into a DataFrame: df 29 | df = pd.read_csv(path + 'gm_2008_region.csv') 30 | 31 | # Create a boxplot of life expectancy per region 32 | df.boxplot('life', 'Region', rot=60) 33 | 34 | # Show the plot 35 | plt.show() 36 | 37 | # ---------------------- 38 | 39 | # Create arrays for features and target variable 40 | X = df['population'].values 41 | y = df['life'].values 42 | 43 | # Reshape X and y 44 | X = X.reshape(-1, 1) 45 | y = y.reshape(-1, 1) 46 | 47 | # Import necessary modules 48 | from sklearn.linear_model import Ridge 49 | from sklearn.model_selection import cross_val_score 50 | 51 | # Instantiate a ridge regressor: ridge 52 | ridge = Ridge(alpha=0.5, normalize=True) 53 | 54 | # Perform 5-fold cross-validation: ridge_cv 55 | ridge_cv = cross_val_score(ridge, X, y, cv=5) 56 | 57 | # Print the cross-validated scores 58 | print(ridge_cv) 59 | 60 | # ------------------------- 61 | 62 | # Convert '?' to NaN 63 | df[df == '?'] = np.nan 64 | 65 | # Print the number of NaNs 66 | print(df.isnull().sum()) 67 | 68 | # Print shape of original DataFrame 69 | print("Shape of Original DataFrame: {}".format(df.shape)) 70 | 71 | # Drop missing values and print shape of new DataFrame 72 | df = df.dropna() 73 | 74 | # Print shape of new DataFrame 75 | print("Shape of DataFrame After Dropping All Rows with Missing Values: {}".format(df.shape)) 76 | -------------------------------------------------------------------------------- /src/ml-supervised/ml_pipeline_with_hyperparameters.py: -------------------------------------------------------------------------------- 1 | # Bringing it all together I: 2 | # 3 | # Pipeline for classification It is time now to piece together everything you have 4 | # learned so far into a pipeline for classification! Your job in this exercise is to build a pipeline that includes 5 | # scaling and hyperparameter tuning to classify wine quality. 6 | # 7 | # You'll return to using the SVM classifier you were briefly introduced to earlier in this chapter. The 8 | # hyperparameters you will tune are C and gamma. C controls the regularization strength. It is analogous to the C you 9 | # tuned for logistic regression in Chapter 3, while gamma controls the kernel coefficient: Do not worry about this 10 | # now as it is beyond the scope of this course. 11 | # 12 | # The following modules have been pre-loaded: Pipeline, svm, train_test_split, GridSearchCV, classification_report, 13 | # accuracy_score. The feature and target variable arrays X and y have also been pre-loaded.e. 
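# Illustrative aside: inside a Pipeline, GridSearchCV reaches a step's hyperparameters through the
# '<step name>__<parameter>' convention, which is why the grid below uses 'SVM__C' and 'SVM__gamma'.
# Minimal sketch on a synthetic dataset; all names here are illustrative only.
from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

X_demo, y_demo = make_classification(n_samples=100, random_state=0)
demo_pipe = Pipeline([('scaler', StandardScaler()), ('SVM', SVC())])
demo_grid = GridSearchCV(demo_pipe, {'SVM__C': [1, 10], 'SVM__gamma': [0.1, 0.01]}, cv=3)
demo_grid.fit(X_demo, y_demo)
print(demo_grid.best_params_)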
14 | 15 | import pandas as pd 16 | from sklearn.metrics import classification_report 17 | from sklearn.model_selection import GridSearchCV 18 | from sklearn.model_selection import train_test_split 19 | from sklearn.pipeline import Pipeline 20 | from sklearn.preprocessing import StandardScaler 21 | from sklearn.svm import SVC 22 | 23 | from helper import path 24 | 25 | # Read 'white-wine.csv' into a DataFrame: df 26 | df = pd.read_csv(path + 'white-wine.csv') 27 | 28 | X = df.drop('quality', axis=1) 29 | y = df['quality'] 30 | 31 | # Setup the pipeline 32 | steps = [('scaler', StandardScaler()), 33 | ('SVM', SVC())] 34 | 35 | pipeline = Pipeline(steps) 36 | 37 | # Specify the hyperparameter space 38 | parameters = {'SVM__C': [1, 10, 100], 39 | 'SVM__gamma': [0.1, 0.01]} 40 | 41 | # Create train and test sets 42 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=21) 43 | 44 | # Instantiate the GridSearchCV object: cv 45 | cv = GridSearchCV(pipeline, param_grid=parameters) 46 | 47 | # Fit to the training set 48 | cv.fit(X_train, y_train) 49 | 50 | # Predict the labels of the test set: y_pred 51 | y_pred = cv.predict(X_test) 52 | 53 | # Compute and print metrics 54 | print("Accuracy: {}".format(cv.score(X_test, y_test))) 55 | print(classification_report(y_test, y_pred)) 56 | print("Tuned Model Parameters: {}".format(cv.best_params_)) 57 | -------------------------------------------------------------------------------- /src/ml-supervised/ml_pipelines.py: -------------------------------------------------------------------------------- 1 | # Imputing missing data in a ML Pipeline I 2 | # 3 | # As you've come to appreciate, there are many steps to building a model, 4 | # from creating training and test sets, to fitting a classifier or regressor, to tuning its parameters, to evaluating 5 | # its performance on new data. Imputation can be seen as the first step of this machine learning process, 6 | # the entirety of which can be viewed within the context of a pipeline. Scikit-learn provides a pipeline constructor 7 | # that allows you to piece together these steps into one process and thereby simplify your workflow. 8 | # 9 | # You'll now practice setting up a pipeline with two steps: the imputation step, followed by the instantiation of a 10 | # classifier. You've seen three classifiers in this course so far: k-NN, logistic regression, and the decision tree. 11 | # You will now be introduced to a fourth one - the Support Vector Machine, or SVM. For now, do not worry about how it 12 | # works under the hood. It works exactly as you would expect of the scikit-learn estimators that you have worked 13 | # with previously, in that it has the same .fit() and .predict() methods as before. 
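# Illustrative aside: the Imputer transformer used below comes from older scikit-learn releases;
# newer versions provide SimpleImputer in sklearn.impute instead. A hedged sketch of the
# equivalent pipeline under that assumption:
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

steps_modern = [('imputation', SimpleImputer(missing_values=np.nan, strategy='most_frequent')),
                ('SVM', SVC())]
pipeline_modern = Pipeline(steps_modern)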
14 | 15 | 16 | # Import the Imputer module 17 | from sklearn.preprocessing import Imputer 18 | from sklearn.svm import SVC 19 | 20 | # Setup the Imputation transformer: imp 21 | imp = Imputer(missing_values='NaN', strategy='most_frequent', axis=0) 22 | 23 | # Instantiate the SVC classifier: clf 24 | clf = SVC() 25 | 26 | # Setup the pipeline with the required steps: steps 27 | steps = [('imputation', imp), 28 | ('SVM', clf)] 29 | 30 | # -------------- 31 | # Import necessary modules 32 | import pandas as pd 33 | from sklearn.preprocessing import Imputer 34 | from sklearn.pipeline import Pipeline 35 | from sklearn.svm import SVC 36 | from sklearn.metrics import classification_report 37 | from sklearn.model_selection import train_test_split 38 | from helper import path 39 | 40 | # Read 'white-wine.csv' into a DataFrame: df 41 | df = pd.read_csv(path + 'white-wine.csv') 42 | 43 | X = df.drop('quality', axis=1) 44 | y = df['quality'] 45 | # Setup the pipeline steps: steps 46 | steps = [('imputation', Imputer(missing_values='NaN', strategy='most_frequent', axis=0)), 47 | ('SVM', SVC())] 48 | 49 | # Create the pipeline: pipeline 50 | pipeline = Pipeline(steps) 51 | 52 | # Create training and test sets 53 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42) 54 | 55 | # Fit the pipeline to the train set 56 | pipeline.fit(X_train, y_train) 57 | 58 | # Predict the labels of the test set 59 | y_pred = pipeline.predict(X_test) 60 | 61 | # Compute metrics 62 | print(classification_report(y_test, y_pred)) 63 | -------------------------------------------------------------------------------- /src/ml-supervised/py_hyperparamter_tuning_hold-out_set_with_GridSearchCV-1.py: -------------------------------------------------------------------------------- 1 | # Hold-out set in practice I: Classification 2 | # 3 | # You will now practice evaluating a model with tuned hyperparameters on a 4 | # hold-out set. The feature array and target variable array from the diabetes dataset have been pre-loaded as X and y. 5 | # 6 | # In addition to C, logistic regression has a 'penalty' hyperparameter which specifies whether to use 'l1' or 'l2' 7 | # regularization. Your job in this exercise is to create a hold-out set, tune the 'C' and 'penalty' hyperparameters 8 | # of a logistic regression classifier using GridSearchCV on the training set, and then evaluate its performance 9 | # against the hold-out set. 
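# Illustrative aside: the exercise text asks for evaluation on the hold-out set, which the script
# below stops short of; assuming logreg_cv, X_test and y_test as defined in that script, the final
# step could look like:
#     print("Hold-out Set Accuracy: {}".format(logreg_cv.score(X_test, y_test)))
# (a refit GridSearchCV scores with the best estimator found on the training data). In newer
# scikit-learn versions the 'l1' penalty also needs a compatible solver, for example
# LogisticRegression(solver='liblinear').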
10 | 11 | import numpy as np 12 | import pandas as pd 13 | from sklearn.linear_model import LogisticRegression 14 | from sklearn.model_selection import GridSearchCV 15 | from sklearn.model_selection import train_test_split 16 | 17 | from helper import path 18 | 19 | # Read the CSV file into a DataFrame: df 20 | df = pd.read_csv(path + 'diabetes.csv') 21 | 22 | # Create arrays for features and target variable 23 | X = df.drop('diabetes', axis=1) 24 | y = df['diabetes'] 25 | 26 | # Create the hyperparameter grid 27 | c_space = np.logspace(-5, 8, 15) 28 | param_grid = {'C': c_space, 'penalty': ['l1', 'l2']} 29 | 30 | # Instantiate the logistic regression classifier: logreg 31 | logreg = LogisticRegression() 32 | 33 | # Create train and test sets 34 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42) 35 | 36 | # Instantiate the GridSearchCV object: logreg_cv 37 | logreg_cv = GridSearchCV(logreg, param_grid, cv=5) 38 | 39 | # Fit it to the training data 40 | logreg_cv.fit(X_train, y_train) 41 | 42 | # Print the optimal parameters and best score 43 | print("Tuned Logistic Regression Parameter: {}".format(logreg_cv.best_params_)) 44 | print("Tuned Logistic Regression Accuracy: {}".format(logreg_cv.best_score_)) 45 | -------------------------------------------------------------------------------- /src/ml-supervised/py_hyperparamter_tuning_hold-out_set_with_GridSearchCV-2.py: -------------------------------------------------------------------------------- 1 | # Hold-out set in practice II: Regression 2 | # 3 | # Remember lasso and ridge regression from the previous chapter? Lasso used 4 | # the L1 penalty to regularize, while ridge used the L2 penalty. There is another type of regularized regression 5 | # known as the elastic net. In elastic net regularization, the penalty term is a linear combination of the L1 and L2 6 | # penalties: 7 | # 8 | # a∗L1+b∗L2 In scikit-learn, this term is represented by the 'l1_ratio' parameter: An 'l1_ratio' of 1 corresponds to 9 | # an L1 penalty, and anything lower is a combination of L1 and L2. 10 | # 11 | # In this exercise, you will GridSearchCV to tune the 'l1_ratio' of an elastic net model trained on the Gapminder 12 | # data. As in the previous exercise, use a hold-out set to evaluate your model's performance. 
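# Illustrative aside: a rough numeric picture of the a*L1 + b*L2 penalty described above, written
# in terms of scikit-learn's alpha / l1_ratio parameterization (approximate form; toy numbers only).
import numpy as np

w = np.array([0.5, -1.0, 2.0])                 # hypothetical model coefficients
alpha, l1_ratio = 1.0, 0.5
penalty = alpha * (l1_ratio * np.abs(w).sum()                 # L1 part
                   + 0.5 * (1 - l1_ratio) * (w ** 2).sum())   # L2 part
print(penalty)   # l1_ratio = 1 gives a pure L1 (lasso-style) penalty; lower values mix in L2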
13 | 14 | import numpy as np 15 | import pandas as pd 16 | from sklearn.linear_model import ElasticNet 17 | from sklearn.metrics import mean_squared_error 18 | from sklearn.model_selection import GridSearchCV 19 | from sklearn.model_selection import train_test_split 20 | 21 | from helper import path 22 | 23 | # Read the CSV file into a DataFrame: df 24 | df = pd.read_csv(path + 'diabetes.csv') 25 | 26 | # Create arrays for features and target variable 27 | X = df.drop('diabetes', axis=1) 28 | y = df['diabetes'] 29 | 30 | # Create train and test sets 31 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42) 32 | 33 | # Create the hyperparameter grid 34 | l1_space = np.linspace(0, 1, 30) 35 | param_grid = {'l1_ratio': l1_space} 36 | 37 | # Instantiate the ElasticNet regressor: elastic_net 38 | elastic_net = ElasticNet() 39 | 40 | # Setup the GridSearchCV object: gm_cv 41 | gm_cv = GridSearchCV(elastic_net, param_grid, cv=5) 42 | 43 | # Fit it to the training data 44 | gm_cv.fit(X_train, y_train) 45 | 46 | # Predict on the test set and compute metrics 47 | y_pred = gm_cv.predict(X_test) 48 | r2 = gm_cv.score(X_test, y_test) 49 | mse = mean_squared_error(y_test, y_pred) 50 | print("Tuned ElasticNet l1 ratio: {}".format(gm_cv.best_params_)) 51 | print("Tuned ElasticNet R squared: {}".format(r2)) 52 | print("Tuned ElasticNet MSE: {}".format(mse)) 53 | -------------------------------------------------------------------------------- /src/ml-supervised/py_hyperparamter_tuning_with_GridSearchCV.py: -------------------------------------------------------------------------------- 1 | # Hyperparameter tuning with GridSearchCV 2 | # 3 | # Hugo demonstrated how to use to tune the n_neighbors parameter of the 4 | # KNeighborsClassifier() using GridSearchCV on the voting dataset. You will now practice this yourself, but by using 5 | # logistic regression on the diabetes dataset instead! 6 | # 7 | # Like the alpha parameter of lasso and ridge regularization that you saw earlier, logistic regression also has a 8 | # regularization parameter: C. C controls the inverse of the regularization strength, and this is what you will tune 9 | # in this exercise. A large C can lead to an overfit model, while a small C can lead to an underfit model. 10 | # 11 | # The hyperparameter space for C has been setup for you. Your job is to use GridSearchCV and logistic regression to 12 | # find the optimal C in this hyperparameter space. The feature array is available as X and target variable array is 13 | # available as y. 14 | # 15 | # You may be wondering why you aren't asked to split the data into training and test sets. Good observation! Here, 16 | # we want you to focus on the process of setting up the hyperparameter grid and performing grid-search 17 | # cross-validation. In practice, you will indeed want to hold out a portion of your data for evaluation purposes, 18 | # and you will learn all about this in the next video! 
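# Illustrative aside: besides best_params_ and best_score_, a fitted GridSearchCV exposes the mean
# cross-validated score of every candidate in cv_results_. Assuming logreg_cv has been fit as in
# the script below, the per-C scores could be inspected with:
#     results = pd.DataFrame(logreg_cv.cv_results_)
#     print(results[['param_C', 'mean_test_score']])   # one row per value of C in c_space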
19 | 20 | import numpy as np 21 | import pandas as pd 22 | # Import necessary modules 23 | from sklearn.linear_model import LogisticRegression 24 | from sklearn.model_selection import GridSearchCV 25 | 26 | from helper import path 27 | 28 | # Read the CSV file into a DataFrame: df 29 | df = pd.read_csv(path + 'diabetes.csv') 30 | 31 | # Create arrays for features and target variable 32 | X = df.drop('diabetes', axis=1) 33 | y = df['diabetes'] 34 | 35 | # Setup the hyperparameter grid 36 | c_space = np.logspace(-5, 8, 15) 37 | param_grid = {'C': c_space} 38 | 39 | # Instantiate a logistic regression classifier: logreg 40 | logreg = LogisticRegression() 41 | 42 | # Instantiate the GridSearchCV object: logreg_cv 43 | logreg_cv = GridSearchCV(logreg, param_grid, cv=5) 44 | 45 | # Fit it to the data 46 | logreg_cv.fit(X, y) 47 | 48 | # Print the tuned parameters and score 49 | print("Tuned Logistic Regression Parameters: {}".format(logreg_cv.best_params_)) 50 | print("Best score is {}".format(logreg_cv.best_score_)) 51 | -------------------------------------------------------------------------------- /src/ml-supervised/py_hyperparamter_tuning_with_RandomizedSearchCV.py: -------------------------------------------------------------------------------- 1 | # Hyperparameter tuning with RandomizedSearchCV 2 | # 3 | # GridSearchCV can be computationally expensive, especially if you are 4 | # searching over a large hyperparameter space and dealing with multiple hyperparameters. A solution to this is to use 5 | # RandomizedSearchCV, in which not all hyperparameter values are tried out. Instead, a fixed number of 6 | # hyperparameter settings is sampled from specified probability distributions. You'll practice using 7 | # RandomizedSearchCV in this exercise and see how this works. 8 | # 9 | # Here, you'll also be introduced to a new model: the Decision Tree. Don't worry about the specifics of how this 10 | # model works. Just like k-NN, linear regression, and logistic regression, decision trees in scikit-learn have .fit() 11 | # and .predict() methods that you can use in exactly the same way as before. Decision trees have many parameters 12 | # that can be tuned, such as max_features, max_depth, and min_samples_leaf: This makes it an ideal use case for 13 | # RandomizedSearchCV. 14 | # 15 | # As before, the feature array X and target variable array y of the diabetes dataset have been pre-loaded. The 16 | # hyperparameter settings have been specified for you. Your goal is to use RandomizedSearchCV to find the optimal 17 | # hyperparameters. Go for it! 
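# Illustrative aside: "sampled from specified probability distributions" means each candidate
# setting is a random draw; scipy's randint(1, 9), for example, yields integers 1 through 8.
# Illustration only, with arbitrary values:
from scipy.stats import randint

max_features_dist = randint(1, 9)
print(max_features_dist.rvs(size=5, random_state=42))   # five random candidate values for max_features
# RandomizedSearchCV draws a fixed number of such settings, controlled by its n_iter argument
# (10 by default), instead of trying every combination as GridSearchCV does.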
18 | 19 | # Import the necessary modules 20 | import pandas as pd 21 | from scipy.stats import randint 22 | from sklearn.model_selection import RandomizedSearchCV 23 | from sklearn.tree import DecisionTreeClassifier 24 | 25 | from helper import path 26 | 27 | # Read the CSV file into a DataFrame: df 28 | df = pd.read_csv(path + 'diabetes.csv') 29 | 30 | # Create arrays for features and target variable 31 | X = df.drop('diabetes', axis=1) 32 | y = df['diabetes'] 33 | 34 | # Setup the parameters and distributions to sample from: param_dist 35 | param_dist = {"max_depth": [3, None], 36 | "max_features": randint(1, 9), 37 | "min_samples_leaf": randint(1, 9), 38 | "criterion": ["gini", "entropy"]} 39 | 40 | # Instantiate a Decision Tree classifier: tree 41 | tree = DecisionTreeClassifier() 42 | 43 | # Instantiate the RandomizedSearchCV object: tree_cv 44 | tree_cv = RandomizedSearchCV(tree, param_dist, cv=5) 45 | 46 | # Fit it to the data 47 | tree_cv.fit(X, y) 48 | 49 | # Print the tuned parameters and score 50 | print("Tuned Decision Tree Parameters: {}".format(tree_cv.best_params_)) 51 | print("Best score is {}".format(tree_cv.best_score_)) 52 | -------------------------------------------------------------------------------- /src/ml-supervised/py_knn_classifier_modal.py: -------------------------------------------------------------------------------- 1 | # 2 | # k-Nearest Neighbors: Fit Having explored the Congressional voting records dataset, it is time now to build your 3 | # first classifier. In this exercise, you will fit a k-Nearest Neighbors classifier to the voting dataset, 4 | # which has once again been pre-loaded for you into a DataFrame df. 5 | # 6 | # In the video, Hugo discussed the importance of ensuring your data adheres to the format required by the 7 | # scikit-learn API. The features need to be in an array where each column is a feature and each row a different 8 | # observation or data point - in this case, a Congressman's voting record. The target needs to be a single column 9 | # with the same number of observations as the feature data. We have done this for you in this exercise. Notice we 10 | # named the feature array X and response variable y: This is in accordance with the common scikit-learn practice. 11 | # 12 | # Your job is to create an instance of a k-NN classifier with 6 neighbors (by specifying the n_neighbors parameter) 13 | # and then fit it to the data. The data has been pre-loaded into a DataFrame called df. # 14 | 15 | import pandas as pd 16 | from sklearn.neighbors import KNeighborsClassifier 17 | 18 | from helper import path 19 | 20 | # this dataset won't work. Can't run this program. 
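# Illustrative aside (hypothetical fix, not part of the original exercise): the raw CSV stores the
# votes as the strings 'y', 'n' and '?', which KNeighborsClassifier cannot fit directly, which is
# why the note above says the script won't run. One way to make it runnable would be to encode the
# features numerically first, along the lines of:
#     df = df.replace({'y': 1, 'n': 0, '?': np.nan}).dropna()
# (with numpy imported as np) before building X and y.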
21 | file = 'house-votes-84.csv' 22 | 23 | df = pd.read_csv(path + file) 24 | 25 | # Explore Data 26 | print(df.describe()) 27 | 28 | # Create arrays for the features(X) and the response variable/target(y) 29 | X = df.drop('party', axis=1).values 30 | y = df['party'].values 31 | 32 | # Create a k-NN classifier with 6 neighbors 33 | knn = KNeighborsClassifier(n_neighbors=60) 34 | 35 | # Fit the classifier to the data 36 | knn.fit(X, y) 37 | 38 | # Predict the labels for the training data X 39 | y_pred = knn.predict(X) 40 | 41 | # This is our prediction based of knn-classifier - Prediction: ['democrat'] 42 | print(y_pred) 43 | -------------------------------------------------------------------------------- /src/ml-supervised/py_knn_classifier_modal_train_test.py: -------------------------------------------------------------------------------- 1 | # Import necessary modules 2 | import matplotlib.pyplot as plt 3 | import numpy as np 4 | from sklearn import datasets 5 | from sklearn.model_selection import train_test_split 6 | from sklearn.neighbors import KNeighborsClassifier 7 | 8 | # Load the digits dataset: digits 9 | digits = datasets.load_digits() 10 | 11 | # Create feature and target arrays 12 | X = digits.data 13 | y = digits.target 14 | 15 | # Split into training and test set 16 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y) 17 | 18 | # Create a k-NN classifier with 7 neighbors: knn 19 | knn = KNeighborsClassifier(n_neighbors=7) 20 | 21 | # Fit the classifier to the training data 22 | knn.fit(X_train, y_train) 23 | 24 | # Print the accuracy 25 | print(knn.score(X_test, y_test)) 26 | 27 | # -------------------------# 28 | 29 | # Test model accuracy on Training data and Test data and plot a graph 30 | 31 | # Setup arrays to store train and test accuracies 32 | neighbors = np.arange(1, 9) 33 | train_accuracy = np.empty(len(neighbors)) 34 | test_accuracy = np.empty(len(neighbors)) 35 | 36 | # Loop over different values of k 37 | for i, k in enumerate(neighbors): 38 | # Setup a k-NN Classifier with k neighbors: knn 39 | knn = KNeighborsClassifier(n_neighbors=k) 40 | 41 | # Fit the classifier to the training data 42 | knn.fit(X_train, y_train) 43 | 44 | # Compute accuracy on the training set 45 | train_accuracy[i] = knn.score(X_train, y_train) 46 | 47 | # Compute accuracy on the testing set 48 | test_accuracy[i] = knn.score(X_test, y_test) 49 | 50 | # Generate plot 51 | plt.title('k-NN: Varying Number of Neighbors') 52 | plt.plot(neighbors, test_accuracy, label='Testing Accuracy') 53 | plt.plot(neighbors, train_accuracy, label='Training Accuracy') 54 | plt.legend() 55 | plt.xlabel('Number of Neighbors') 56 | plt.ylabel('Accuracy') 57 | plt.show() 58 | -------------------------------------------------------------------------------- /src/ml-supervised/py_knn_classifiers_performance_metrics.py: -------------------------------------------------------------------------------- 1 | # Metrics for classification 2 | # 3 | # In Chapter 1, you evaluated the performance of your k-NN classifier based on its 4 | # accuracy. However, as Andy discussed, accuracy is not always an informative metric. In this exercise, you will dive 5 | # more deeply into evaluating the performance of binary classifiers by computing a confusion matrix and generating a 6 | # classification report. 7 | # 8 | # You may have noticed in the video that the classification report consisted of three rows, and an additional support 9 | # column. 
The support gives the number of samples of the true response that lie in that class - so in the video 10 | # example, the support was the number of Republicans or Democrats in the test set on which the classification report 11 | # was computed. The precision, recall, and f1-score columns, then, gave the respective metrics for that particular 12 | # class. 13 | # 14 | # Here, you'll work with the PIMA Indians dataset obtained from the UCI Machine Learning Repository. The goal is to 15 | # predict whether or not a given female patient will contract diabetes based on features such as BMI, age, 16 | # and number of pregnancies. Therefore, it is a binary classification problem. A target value of 0 indicates that the 17 | # patient does not have diabetes, while a value of 1 indicates that the patient does have diabetes. As in Chapters 1 18 | # and 2, the dataset has been preprocessed to deal with missing values. 19 | # 20 | # The dataset has been loaded into a DataFrame df and the feature and target variable arrays X and y have been 21 | # created for you. In addition, sklearn.model_selection.train_test_split and sklearn.neighbors.KNeighborsClassifier 22 | # have already been imported. 23 | # 24 | # Your job is to train a k-NN classifier to the data and evaluate its performance by generating a confusion matrix 25 | # and classification report. 26 | 27 | 28 | # Import numpy and pandas 29 | import pandas as pd 30 | from sklearn.model_selection import train_test_split 31 | # Import necessary modules 32 | from sklearn.neighbors import KNeighborsClassifier 33 | 34 | from helper import path 35 | 36 | # Read the CSV file into a DataFrame: df 37 | df = pd.read_csv(path + 'diabetes.csv') 38 | 39 | # Create arrays for features and target variable 40 | X = df['age'].values 41 | y = df['diabetes'].values 42 | 43 | # Reshape X and y 44 | X = X.reshape(-1, 1) 45 | y = y.reshape(-1, 1) 46 | 47 | # Import necessary modules 48 | from sklearn.metrics import classification_report 49 | from sklearn.metrics import confusion_matrix 50 | 51 | # Create training and test set 52 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42) 53 | 54 | # Instantiate a k-NN classifier: knn 55 | knn = KNeighborsClassifier(n_neighbors=6) 56 | 57 | # Fit the classifier to the training data 58 | knn.fit(X_train, y_train) 59 | 60 | # Predict the labels of the test data: y_pred 61 | y_pred = knn.predict(X_test) 62 | 63 | # Generate the confusion matrix and classification report 64 | print(confusion_matrix(y_test, y_pred)) 65 | print(classification_report(y_test, y_pred)) 66 | -------------------------------------------------------------------------------- /src/ml-supervised/py_lasso_regularized_linear_regression.py: -------------------------------------------------------------------------------- 1 | # What is Lasso Regression? 2 | # 3 | # http://www.statisticshowto.com/lasso-regression/ 4 | # 5 | # Lasso regression is a type of linear 6 | # regression that uses shrinkage. Shrinkage is where data values are shrunk towards a central point, like the mean. 7 | # The lasso procedure encourages simple, sparse models (i.e. models with fewer parameters). This particular type of 8 | # regression is well-suited for models showing high levels of muticollinearity or when you want to automate certain 9 | # parts of model selection, like variable selection/parameter elimination. 10 | # 11 | # The acronym “LASSO” stands for Least Absolute Shrinkage and Selection Operator. 
Lasso regression is what is called 12 | # the Penalized regression method, often used in machine learning to select the subset of variables. It is a 13 | # supervised machine learning method. Specifically, LASSO is a Shrinkage and Variable Selection method for linear 14 | # regression models. LASSO, is actually an acronym for Least Absolute Selection and Shrinkage Operator. 15 | # 16 | # 0:34 17 | # 18 | # The LASSO imposes a constraint on the sum of the absolute values of the model parameters, where the sum has a 19 | # specified constant as an upper bound. This constraint causes regression coefficients for some variables to shrink 20 | # towards zero. This is the shrinkage process. The shrinkage process allows for better interpretation of the model 21 | # and identifies the variables most strongly associated with the target corresponds variable. That is the variable 22 | # selection process. It goes to obtain the subset of predictors that minimizes prediction error. 23 | # 24 | # So why use Lasso instead of just using ordinary least squares multiple regression? 25 | # 26 | # Well, first, it can provide greater prediction accuracy. If the true relationship between the response variable and 27 | # the predictors is approximately linear and you have a large number of observations, then OLS regression parameter 28 | # estimates will have low bias and low variance. However, if you have a relatively small number of observations and a 29 | # large number of predictors, then the variance of the OLS perimeter estimates will be higher. In this case, 30 | # Lasso Regression is useful because shrinking the regression coefficient can reduce variance without a substantial 31 | # increase in bias. 1:43 Second, Lasso Regression can increase model interpretability. Often times, at least some of 32 | # the explanatory variables in an OLS multiple regression analysis are not really associated with the response 33 | # variable. As a result, we often end up with a model that's over fitted and more difficult to interpret. With Lasso 34 | # Regression, the regression coefficients for unimportant variables are reduced to zero which effectively removes 35 | # them from the model and produces a simpler model that selects only the most important predictors. In Lasso 36 | # Regression, a tuning parameter called lambda is applied to the regression model to control the strength of the 37 | # penalty. As lambda increases, more coefficients are reduced to zero that is fewer predictors are selected and there 38 | # is more shrinkage of the non-zero coefficient. With Lasso Regression where lambda is equal to zero then we have an 39 | # OLS regression analysis. Bias increases and variance decreases as lambda increases. To demonstrate how lasso 40 | # regression works, let's use and example from the ad help data set in which our goal is to identify a set of 41 | # variables that best predicts the extent to which students feel connected to their school. We will use the same 42 | # ad-health data set that we used for the decision tree in random forced machine learning applications. The response 43 | # or target variable is a quantitative variable that measures school connectedness. The response values range from 6 44 | # to 38, where higher values indicate a greater connection with the school. There are a total of 23 Categorical and 45 | # Quantitative predictor variables. 
This is a pretty large number of predictor variables, so using OLS multiple 46 | # regression analysis would not be ideal, particularly if the goal is to identify a smaller subset of these 47 | # predictors that most accurately predicts school connectedness. Categorical predictors include gender and race and 48 | # ethnicity. Although Lasso Regression models can handle categorical variables with more than two levels In 49 | # conducting my data management, I created a series of five binary categorical variables for race and ethnicity, 50 | # Hispanic, White, Black, Native American, and Asian. I did this to improve interpratability of the selected model. 51 | # Binary substitutes variables for measure with individual questions of about whether the adolescent had ever used 52 | # alcohol, marijuana, cocaine, or inhalants. Additional categorical variables include the availability of cigarettes 53 | # in the home, whether or not either parent was on public assistance, and any experience with being expelled from 54 | # school. Finally, quantitative predictive variables include age, alcohol problems, and a measure of deviance. That 55 | # includes such behaviors as vandalism, other property damage, lying, stealing, running away, 56 | 57 | import matplotlib.pyplot as plt 58 | # Import the necessary modules 59 | import pandas as pd 60 | # Import Lasso 61 | from sklearn.linear_model import Lasso 62 | 63 | from helper import path 64 | 65 | # Read the CSV file into a DataFrame: df 66 | df = pd.read_csv(path + 'gm_2008_region.csv') 67 | print(df.info()) 68 | print(df.describe()) 69 | print(df.head()) 70 | 71 | # Create arrays for features and target variable 72 | X = df['population'].values 73 | y = df['life'].values 74 | 75 | # Reshape X and y 76 | X = X.reshape(-1, 1) 77 | # y = y.reshape(-1, 1) 78 | 79 | # Instantiate a lasso regressor: lasso 80 | lasso = Lasso(alpha=0.4, normalize=True) 81 | 82 | # Fit the regressor to the data 83 | lasso.fit(X, y) 84 | 85 | # Compute and print the coefficients 86 | lasso_coef = lasso.coef_ 87 | print(lasso_coef) 88 | 89 | df_columns = df.keys() 90 | print(df_columns) 91 | 92 | # Plot the coefficients 93 | plt.plot(range(len(df_columns)), lasso_coef) 94 | plt.xticks(range(len(df_columns)), df_columns.values, rotation=60) 95 | plt.margins(0.02) 96 | plt.show() 97 | -------------------------------------------------------------------------------- /src/ml-supervised/py_linear_regression_modal.py: -------------------------------------------------------------------------------- 1 | # Fit & predict for regression 2 | # 3 | # Now, you will fit a linear regression and predict life expectancy using just one 4 | # feature. You saw Andy do this earlier using the 'RM' feature of the Boston housing dataset. In this exercise, 5 | # you will use the 'fertility' feature of the Gapminder dataset. Since the goal is to predict life expectancy, 6 | # the target variable here is 'life'. The array for the target variable has been pre-loaded as y and the array for 7 | # 'fertility' has been pre-loaded as X_fertility. 8 | # 9 | # A scatter plot with 'fertility' on the x-axis and 'life' on the y-axis has been generated. As you can see, 10 | # there is a strongly negative correlation, so a linear regression should be able to capture this trend. Your job is 11 | # to fit a linear regression and then predict the life expectancy, overlaying these predicted values on the plot to 12 | # generate a regression line. You will also compute and print the R2 score using sckit-learn's .score() method. 
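# Illustrative aside: a fitted LinearRegression is just a slope and an intercept, which is what the
# overlaid regression line draws. Toy sketch with made-up data, not the Gapminder values:
import numpy as np
from sklearn.linear_model import LinearRegression

x_toy = np.array([[1.0], [2.0], [3.0], [4.0]])
y_toy = np.array([[5.0], [4.1], [2.9], [2.0]])        # roughly y = -x + 6, a negative correlation
toy_reg = LinearRegression().fit(x_toy, y_toy)
print(toy_reg.coef_[0][0], toy_reg.intercept_[0])     # slope close to -1, intercept close to 6
print(toy_reg.score(x_toy, y_toy))                    # R^2 near 1 for a nearly linear relationship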
13 | 14 | # Import numpy and pandas 15 | import numpy as np 16 | import pandas as pd 17 | from matplotlib import pyplot as plt 18 | 19 | # Import LinearRegression 20 | from sklearn.linear_model import LinearRegression 21 | 22 | from helper import path 23 | 24 | # Read the CSV file into a DataFrame: df 25 | df = pd.read_csv(path + 'gm_2008_region.csv') 26 | 27 | # Create arrays for features and target variable 28 | X_fertility = df['fertility'].values 29 | y = df['life'].values 30 | 31 | # Print the dimensions of X and y before reshaping 32 | print("Dimensions of y before reshaping: {}".format(y.shape)) 33 | print("Dimensions of X before reshaping: {}".format(X_fertility.shape)) 34 | 35 | # Reshape X and y 36 | y = y.reshape(-1, 1) 37 | X_fertility = X_fertility.reshape(-1, 1) 38 | 39 | # Print the dimensions of X and y after reshaping 40 | print("Dimensions of y after reshaping: {}".format(y.shape)) 41 | print("Dimensions of X after reshaping: {}".format(X_fertility.shape)) 42 | 43 | # Create the regressor: reg 44 | reg = LinearRegression() 45 | 46 | # Create the prediction space 47 | prediction_space = np.linspace(min(X_fertility), max(X_fertility)).reshape(-1, 1) 48 | 49 | # Fit the model to the data 50 | reg.fit(X_fertility, y) 51 | 52 | # Compute predictions over the prediction space: y_pred 53 | y_pred = reg.predict(prediction_space) 54 | 55 | # Print R^2 56 | print(reg.score(X_fertility, y)) 57 | 58 | # Plot regression line over original scatter plot 59 | plt.scatter(X_fertility, y, color='blue') 60 | plt.plot(prediction_space, y_pred, color='black', linewidth=3) 61 | plt.show() 62 | -------------------------------------------------------------------------------- /src/ml-supervised/py_linear_regression_modal_train_test.py: -------------------------------------------------------------------------------- 1 | # Fit & predict for regression 2 | # Import numpy and pandas 3 | import numpy as np 4 | import pandas as pd 5 | # Import necessary modules 6 | from sklearn.linear_model import LinearRegression 7 | from sklearn.metrics import mean_squared_error 8 | from sklearn.model_selection import train_test_split 9 | 10 | from helper import path 11 | 12 | # Read the CSV file into a DataFrame: df 13 | df = pd.read_csv(path + 'gm_2008_region.csv') 14 | 15 | # Create arrays for features and target variable 16 | X_fertility = df['fertility'].values 17 | y = df['life'].values 18 | 19 | # Reshape X and y 20 | X_fertility = X_fertility.reshape(-1, 1) 21 | y = y.reshape(-1, 1) 22 | 23 | # Create training and test sets 24 | X_train, X_test, y_train, y_test = train_test_split(X_fertility, y, test_size=0.3, random_state=42) 25 | 26 | # Create the regressor: reg_all 27 | reg_all = LinearRegression() 28 | 29 | # Fit the regressor to the training data 30 | reg_all.fit(X_train, y_train) 31 | 32 | # Predict on the test data: y_pred 33 | y_pred = reg_all.predict(X_test) 34 | 35 | # Compute and print R^2 and RMSE 36 | print("R^2: {}".format(reg_all.score(X_test, y_test))) 37 | rmse = np.sqrt(mean_squared_error(y_test, y_pred)) 38 | print("Root Mean Squared Error: {}".format(rmse)) 39 | -------------------------------------------------------------------------------- /src/ml-supervised/py_logistic_regression_modal.py: -------------------------------------------------------------------------------- 1 | # Building a logistic regression model 2 | # 3 | # Time to build your first logistic regression model! 
As Hugo showed in the 4 | # video, scikit-learn makes it very easy to try different models, since the Train-Test-Split/Instantiate/Fit/Predict 5 | # paradigm applies to all classifiers and regressors - which are known in scikit-learn as 'estimators'. You'll see 6 | # this now for yourself as you train a logistic regression model on exactly the same data as in the previous 7 | # exercise. Will it outperform k-NN? There's only one way to find out! 8 | # 9 | # The feature and target variable arrays X and y have been pre-loaded, and train_test_split has been imported for you 10 | # from sklearn.model_selection. 11 | 12 | # Import the necessary modules 13 | import pandas as pd 14 | from sklearn.linear_model import LogisticRegression 15 | from sklearn.metrics import confusion_matrix, classification_report 16 | from sklearn.model_selection import train_test_split 17 | 18 | from helper import path 19 | 20 | # Read the CSV file into a DataFrame: df 21 | df = pd.read_csv(path + 'diabetes.csv') 22 | 23 | # Create arrays for features and target variable 24 | X = df['age'].values 25 | y = df['diabetes'].values 26 | 27 | # Reshape X and y 28 | X = X.reshape(-1, 1) 29 | y = y.reshape(-1, 1) 30 | 31 | # Create training and test sets 32 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42) 33 | 34 | # Create the classifier: logreg 35 | logreg = LogisticRegression() 36 | 37 | # Fit the classifier to the training data 38 | logreg.fit(X_train, y_train) 39 | 40 | # Predict the labels of the test set: y_pred 41 | y_pred = logreg.predict(X_test) 42 | 43 | # Compute and print the confusion matrix and classification report 44 | print(confusion_matrix(y_test, y_pred)) 45 | print(classification_report(y_test, y_pred)) 46 | 47 | # ------------------------- 48 | 49 | # Plotting an ROC curve 50 | # 51 | # Classification reports and confusion matrices are great methods to quantitatively evaluate model performance, 52 | # while ROC curves provide a way to visually evaluate models. As Hugo demonstrated in the video, most classifiers in 53 | # scikit-learn have a .predict_proba() method which returns the probability of a given sample being in a particular 54 | # class. Having built a logistic regression model, you'll now evaluate its performance by plotting an ROC curve. In 55 | # doing so, you'll make use of the .predict_proba() method and become familiar with its functionality. 56 | # 57 | # Here, you'll continue working with the PIMA Indians diabetes dataset. 
The classifier has already been fit to the 58 | # training data and is available as logreg 59 | 60 | # Import necessary modules 61 | from sklearn.metrics import roc_curve 62 | from matplotlib import pyplot as plt 63 | 64 | # Compute predicted probabilities: y_pred_prob 65 | y_pred_prob = logreg.predict_proba(X_test)[:, 1] 66 | 67 | # Generate ROC curve values: fpr, tpr, thresholds 68 | fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob) 69 | 70 | # Plot ROC curve 71 | plt.plot([0, 1], [0, 1], 'k--') 72 | plt.plot(fpr, tpr) 73 | plt.xlabel('False Positive Rate') 74 | plt.ylabel('True Positive Rate') 75 | plt.title('ROC Curve') 76 | plt.show() 77 | 78 | # ------------------------- 79 | 80 | # Calculating ROC AUC score 81 | # Larger area under ROC curve = better model 82 | 83 | from sklearn.metrics import roc_auc_score 84 | 85 | print(roc_auc_score(y_test, y_pred_prob)) 86 | -------------------------------------------------------------------------------- /src/ml-supervised/py_ridge_regularized_linear_regression.py: -------------------------------------------------------------------------------- 1 | # Regularization II: Ridge 2 | # 3 | # Lasso is great for feature selection, but when building regression models, 4 | # Ridge regression should be your first choice. 5 | # 6 | # Recall that lasso performs regularization by adding to the loss function a penalty term of the absolute value of 7 | # each coefficient multiplied by some alpha. This is also known as L1 regularization because the regularization term 8 | # is the L1 norm of the coefficients. This is not the only way to regularize, however. 9 | # 10 | # If instead you took the sum of the squared values of the coefficients multiplied by some alpha - like in Ridge 11 | # regression - you would be computing the L2 norm. 
In this exercise, you will practice fitting ridge regression 12 | # models over a range of different alphas, and plot cross-validated R2 scores for each, using this function that we 13 | # have defined for you, which plots the R2 score as well as standard error for each alpha: 14 | 15 | 16 | def display_plot(cv_scores, cv_scores_std): 17 | fig = plt.figure() 18 | ax = fig.add_subplot(1, 1, 1) 19 | ax.plot(alpha_space, cv_scores) 20 | 21 | std_error = cv_scores_std / np.sqrt(10) 22 | 23 | ax.fill_between(alpha_space, cv_scores + std_error, cv_scores - std_error, alpha=0.2) 24 | ax.set_ylabel('CV Score +/- Std Error') 25 | ax.set_xlabel('Alpha') 26 | ax.axhline(np.max(cv_scores), linestyle='--', color='.5') 27 | ax.set_xlim([alpha_space[0], alpha_space[-1]]) 28 | ax.set_xscale('log') 29 | plt.show() 30 | 31 | 32 | import matplotlib.pyplot as plt 33 | import numpy as np 34 | import pandas as pd 35 | from sklearn.linear_model import Ridge 36 | from sklearn.model_selection import cross_val_score 37 | 38 | from helper import path 39 | 40 | # Read the CSV file into a DataFrame: df 41 | df = pd.read_csv(path + 'gm_2008_region.csv') 42 | print(df.info()) 43 | print(df.describe()) 44 | print(df.head()) 45 | 46 | # Create arrays for features and target variable 47 | X = df['population'].values 48 | y = df['life'].values 49 | 50 | # Reshape X and y 51 | X = X.reshape(-1, 1) 52 | # y = y.reshape(-1, 1) 53 | 54 | # Setup the array of alphas and lists to store scores 55 | alpha_space = np.logspace(-4, 0, 50) 56 | ridge_scores = [] 57 | ridge_scores_std = [] 58 | 59 | # Create a ridge regressor: ridge 60 | ridge = Ridge(normalize=True) 61 | 62 | # Compute scores over range of alphas 63 | for alpha in alpha_space: 64 | # Specify the alpha value to use: ridge.alpha 65 | ridge.alpha = alpha 66 | 67 | # Perform 10-fold CV: ridge_cv_scores 68 | ridge_cv_scores = cross_val_score(ridge, X, y, cv=10) 69 | 70 | # Append the mean of ridge_cv_scores to ridge_scores 71 | ridge_scores.append(np.mean(ridge_cv_scores)) 72 | 73 | # Append the std of ridge_cv_scores to ridge_scores_std 74 | ridge_scores_std.append(np.std(ridge_cv_scores)) 75 | 76 | # Display the plot 77 | display_plot(ridge_scores, ridge_scores_std) 78 | -------------------------------------------------------------------------------- /src/ml-supervised/py_sklearn_digits_dataset.py: -------------------------------------------------------------------------------- 1 | # The digits recognition dataset Up until now, you have been performing binary classification, since the target 2 | # variable had two possible outcomes. Hugo, however, got to perform multi-class classification in the videos, 3 | # where the target variable could take on three possible outcomes. Why does he get to have all the fun?! In the 4 | # following exercises, you'll be working with the MNIST digits recognition dataset, which has 10 classes, the digits 5 | # 0 through 9! A reduced version of the MNIST dataset is one of scikit-learn's included datasets, and that is the one 6 | # we will use in this exercise. 7 | # 8 | # Each sample in this scikit-learn dataset is an 8x8 image representing a handwritten digit. Each pixel is 9 | # represented by an integer in the range 0 to 16, indicating varying levels of black. Recall that scikit-learn's 10 | # built-in datasets are of type Bunch, which are dictionary-like objects. Helpfully for the MNIST dataset, 11 | # scikit-learn provides an 'images' key in addition to the 'data' and 'target' keys that you have seen with the Iris 12 | # data. 
Because it is a 2D array of the images corresponding to each sample, this 'images' key is useful for 13 | # visualizing the images, as you'll see in this exercise (for more on plotting 2D arrays, see Chapter 2 of DataCamp's 14 | # course on Data Visualization with Python). On the other hand, the 'data' key contains the feature array - that is, 15 | # the images as a flattened array of 64 pixels. 16 | # 17 | # Notice that you can access the keys of these Bunch objects in two different ways: By using the . notation, 18 | # as in digits.images, or the [] notation, as in digits['images']. 19 | # 20 | # For more on the MNIST data, check out this exercise in Part 1 of DataCamp's Importing Data in Python course. There, 21 | # the full version of the MNIST dataset is used, in which the images are 28x28. It is a famous dataset in machine 22 | # learning and computer vision, and frequently used as a benchmark to evaluate the performance of a new model. 23 | 24 | import matplotlib.pyplot as plt 25 | # Import necessary modules 26 | from sklearn import datasets 27 | 28 | # Load the digits dataset: digits 29 | digits = datasets.load_digits() 30 | 31 | # Print the keys and DESCR of the dataset 32 | print(digits.keys()) 33 | print(digits.DESCR) 34 | 35 | # Print the shape of the images and data keys 36 | print(digits.images.shape) 37 | print(digits.data.shape) 38 | 39 | # Display digit 1010 40 | plt.imshow(digits.images[1010], cmap=plt.cm.gray_r, interpolation='nearest') 41 | plt.show() -------------------------------------------------------------------------------- /src/ml-unsupervised/01-clustering-for-dataset-exploration/01-how-many-clusters.py: -------------------------------------------------------------------------------- 1 | """ 2 | How many clusters? 3 | 4 | You are given an array points of size 300x2, where each row gives the (x, y) co-ordinates of a point on a map. Make a 5 | scatter plot of these points, and use the scatter plot to guess how many clusters there are. 6 | 7 | matplotlib.pyplot has already been imported as plt. In the IPython Shell: 8 | 9 | Create an array called xs that contains the values of points[:,0] - that is, column 0 of points. 10 | Create an array called ys that contains the values of points[:,1] - that is, column 1 of points. 11 | Make a scatter plot by passing xs and ys to the plt.scatter() function. 12 | Call the plt.show() function to show your plot. 13 | How many clusters do you see? 14 | """ 15 | import matplotlib.pyplot as plt 16 | import numpy as np 17 | 18 | from helper import points 19 | 20 | print(type(points)) 21 | points = np.array(points) 22 | 23 | xs = points[:, 0] 24 | 25 | ys = points[:, 1] 26 | 27 | plt.scatter(xs, ys, alpha=0.5) 28 | 29 | plt.show() 30 | -------------------------------------------------------------------------------- /src/ml-unsupervised/01-clustering-for-dataset-exploration/02-clustering-2d-points.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Clustering 2D points 3 | 4 | From the scatter plot of the previous exercise, you saw that the points seem to separate into 3 clusters. You'll now 5 | create a KMeans model to find 3 clusters, and fit it to the data points from the previous exercise. After the model 6 | has been fit, you'll obtain the cluster labels for some new points using the .predict() method. 7 | 8 | You are given the array points from the previous exercise, and also an array new_points. 
9 | 10 | INSTRUCTIONS 11 | 100XP 12 | INSTRUCTIONS 13 | 100XP 14 | Import KMeans from sklearn.cluster. 15 | Using KMeans(), create a KMeans instance called model to find 3 clusters. To specify the number of clusters, use the n_clusters keyword argument. 16 | Use the .fit() method of model to fit the model to the array of points points. 17 | Use the .predict() method of model to predict the cluster labels of new_points, assigning the result to labels. 18 | Hit 'Submit Answer' to see the cluster labels of new_points. 19 | ''' 20 | import numpy as np 21 | # Import KMeans 22 | from sklearn.cluster import KMeans 23 | 24 | from helper import points, new_points, smart_print 25 | 26 | # Convert to np-arrays 27 | points = np.array(points) 28 | new_points = np.array(new_points) 29 | 30 | # Create a KMeans instance with 3 clusters: model 31 | model = KMeans(n_clusters=3) 32 | 33 | # Fit model to points 34 | model.fit(points) 35 | 36 | # Determine the cluster labels of new_points: labels 37 | labels = model.predict(new_points) 38 | 39 | # Print cluster labels of new_points 40 | smart_print(labels) 41 | -------------------------------------------------------------------------------- /src/ml-unsupervised/01-clustering-for-dataset-exploration/03-inspect-your-clustering.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Inspect your clustering 3 | 4 | Let's now inspect the clustering you performed in the previous exercise! 5 | 6 | A solution to the previous exercise has already run, so new_points is an array of points and labels is the array of their cluster labels. 7 | 8 | INSTRUCTIONS 9 | 100XP 10 | Import matplotlib.pyplot as plt. 11 | Assign column 0 of new_points to xs, and column 1 of new_points to ys. 12 | Make a scatter plot of xs and ys, specifying the c=labels keyword arguments to color the points by their cluster label. Also specify alpha=0.5. 13 | Compute the coordinates of the centroids using the .cluster_centers_ attribute of model. 14 | Assign column 0 of centroids to centroids_x, and column 1 of centroids to centroids_y. 15 | Make a scatter plot of centroids_x and centroids_y, using 'D' (a diamond) as a marker by specifying the marker parameter. Set the size of the markers to be 50 using s=50. 
16 | ''' 17 | # Import pyplot 18 | import matplotlib.pyplot as plt 19 | from numpy import array 20 | from sklearn.cluster import KMeans 21 | 22 | from helper import points, new_points 23 | 24 | # Convert to np-arrays 25 | points = array(points) 26 | new_points = array(new_points) 27 | 28 | # Create a KMeans instance with 3 clusters: model 29 | model = KMeans(n_clusters=3) 30 | 31 | # Fit model to points 32 | model.fit(points) 33 | 34 | # Determine the cluster labels of new_points: labels 35 | labels = model.predict(new_points) 36 | # Assign the columns of new_points: xs and ys 37 | xs = new_points[:, 0] 38 | ys = new_points[:, 1] 39 | 40 | # Make a scatter plot of xs and ys, using labels to define the colors 41 | plt.scatter(xs, ys, alpha=0.5, c=labels) 42 | 43 | # Assign the cluster centers: centroids 44 | centroids = model.cluster_centers_ 45 | 46 | # Assign the columns of centroids: centroids_x, centroids_y 47 | centroids_x = centroids[:, 0] 48 | centroids_y = centroids[:, 1] 49 | 50 | # Make a scatter plot of centroids_x and centroids_y 51 | plt.scatter(centroids_x, centroids_y, marker='D', s=50) 52 | plt.show() 53 | -------------------------------------------------------------------------------- /src/ml-unsupervised/01-clustering-for-dataset-exploration/04-how-many-clusters-of-grain.py: -------------------------------------------------------------------------------- 1 | ''' 2 | How many clusters of grain? 3 | 4 | In the video, you learned how to choose a good number of clusters for a dataset using the k-means inertia graph. You 5 | are given an array samples containing the measurements (such as area, perimeter, length, and several others) of 6 | samples of grain. What's a good number of clusters in this case? 7 | 8 | KMeans and PyPlot (plt) have already been imported for you. 9 | 10 | This dataset was sourced from the UCI Machine Learning Repository. 11 | 12 | INSTRUCTIONS 13 | 100XP 14 | For each of the given values of k, perform the following steps: 15 | Create a KMeans instance called model with k clusters. 16 | Fit the model to the grain data samples. 17 | Append the value of the inertia_ attribute of model to the list inertias. 18 | The code to plot ks vs inertias has been written for you, so hit 'Submit Answer' to see the plot! 19 | ''' 20 | # Import pyplot 21 | import matplotlib.pyplot as plt 22 | import numpy as np 23 | from sklearn.cluster import KMeans 24 | 25 | from helper import points 26 | 27 | samples = np.array(points) 28 | 29 | ks = range(1, 6) 30 | inertias = [] 31 | 32 | for k in ks: 33 | # Create a KMeans instance with k clusters: model 34 | model = KMeans(n_clusters=k) 35 | 36 | # Fit model to samples 37 | model.fit(samples) 38 | 39 | # Append the inertia to the list of inertias 40 | inertias.append(model.inertia_) 41 | 42 | # Plot ks vs inertias 43 | plt.plot(ks, inertias, '-o') 44 | plt.xlabel('number of clusters, k') 45 | plt.ylabel('inertia') 46 | plt.xticks(ks) 47 | plt.show() 48 | -------------------------------------------------------------------------------- /src/ml-unsupervised/01-clustering-for-dataset-exploration/05-evaluating-the-grain-clustering.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Evaluating the grain clustering 3 | 4 | In the previous exercise, you observed from the inertia plot that 3 is a good number of clusters for the grain data. 5 | In fact, the grain samples come from a mix of 3 different grain varieties: "Kama", "Rosa" and "Canadian". 
In this 6 | exercise, cluster the grain samples into three clusters, and compare the clusters to the grain varieties using a 7 | cross-tabulation. 8 | 9 | You have the array samples of grain samples, and a list varieties giving the grain variety for each sample. Pandas ( 10 | pd) and KMeans have already been imported for you. 11 | 12 | INSTRUCTIONS 100XP Create a KMeans model called model with 3 clusters. Use the .fit_predict() method of model to fit 13 | it to samples and derive the cluster labels. Using .fit_predict() is the same as using .fit() followed by .predict(). 14 | Create a DataFrame df with two columns named 'labels' and 'varieties', using labels and varieties, respectively, 15 | for the column values. This has been done for you. Use the pd.crosstab() function on df['labels'] and df['varieties'] 16 | to count the number of times each grain variety coincides with each cluster label. Assign the result to ct. Hit 17 | 'Submit Answer' to see the cross-tabulation! ''' 18 | 19 | # Import pyplot 20 | import numpy as np 21 | import pandas as pd 22 | from sklearn.cluster import KMeans 23 | 24 | from helper import points 25 | 26 | samples = np.array(points) 27 | 28 | # Create a KMeans model with 3 clusters: model 29 | model = KMeans(n_clusters=3) 30 | 31 | # Use fit_predict to fit model and obtain cluster labels: labels 32 | labels = model.fit_predict(samples) 33 | 34 | # Create a DataFrame with labels and varieties as columns: df 35 | df = pd.DataFrame({'labels': labels, 'varieties': varieties}) 36 | 37 | # Create crosstab: ct 38 | ct = pd.crosstab(df['labels'], df['varieties']) 39 | 40 | # Display ct 41 | print(ct) 42 | -------------------------------------------------------------------------------- /src/ml-unsupervised/01-clustering-for-dataset-exploration/06-07-scaling-&-clustering-the-fish-data.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Scaling fish data for clustering 3 | 4 | You are given an array samples giving measurements of fish. Each row represents an individual fish. The measurements, such as weight in grams, length in centimeters, and the percentage ratio of height to length, have very different scales. In order to cluster this data effectively, you'll need to standardize these features first. In this exercise, you'll build a pipeline to standardize and cluster the data. 5 | 6 | These fish measurement data were sourced from the Journal of Statistics Education. 7 | 8 | INSTRUCTIONS 9 | 100XP 10 | INSTRUCTIONS 11 | 100XP 12 | Import: 13 | make_pipeline from sklearn.pipeline. 14 | StandardScaler from sklearn.preprocessing. 15 | KMeans from sklearn.cluster. 16 | Create an instance of StandardScaler called scaler. 17 | Create an instance of KMeans with 4 clusters called kmeans. 18 | Create a pipeline called pipeline that chains scaler and kmeans. To do this, you just need to pass them in as arguments to make_pipeline(). 
19 | ''' 20 | from sklearn.cluster import KMeans 21 | # Perform the necessary imports 22 | from sklearn.pipeline import make_pipeline 23 | from sklearn.preprocessing import StandardScaler 24 | 25 | # Create scaler: scaler 26 | scaler = StandardScaler() 27 | 28 | # Create KMeans instance: kmeans 29 | kmeans = KMeans(n_clusters=4) 30 | 31 | # Create pipeline: pipeline 32 | pipeline = make_pipeline(scaler, kmeans) 33 | 34 | ''' 35 | Clustering the fish data 36 | 37 | You'll now use your standardization and clustering pipeline from the previous exercise to cluster the fish by their measurements, and then create a cross-tabulation to compare the cluster labels with the fish species. 38 | 39 | As before, samples is the 2D array of fish measurements. Your pipeline is available as pipeline, and the species of every fish sample is given by the list species. 40 | 41 | INSTRUCTIONS 42 | 100XP 43 | Import pandas as pd. 44 | Fit the pipeline to the fish measurements samples. 45 | Obtain the cluster labels for samples by using the .predict() method of pipeline. 46 | Using pd.DataFrame(), create a DataFrame df with two columns named 'labels' and 'species', using labels and species, respectively, for the column values. 47 | Using pd.crosstab(), create a cross-tabulation ct of df['labels'] and df['species']. 48 | ''' 49 | # Import pandas 50 | import pandas as pd 51 | import numpy as np 52 | 53 | from helper import points 54 | 55 | samples = np.array(points) 56 | 57 | # Fit the pipeline to samples 58 | pipeline.fit(samples) 59 | 60 | # Calculate the cluster labels: labels 61 | labels = pipeline.predict(samples) 62 | 63 | # Create a DataFrame with labels and species as columns: df 64 | df = pd.DataFrame({'labels': labels, 'species': species}) 65 | 66 | # Create crosstab: ct 67 | ct = pd.crosstab(df['labels'], df['species']) 68 | 69 | # Display ct 70 | print(ct) 71 | -------------------------------------------------------------------------------- /src/ml-unsupervised/01-clustering-for-dataset-exploration/08-09-scaling-&-clustering-which-stocks-move-together.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Clustering stocks using KMeans 3 | 4 | In this exercise, you'll cluster companies using their daily stock price movements (i.e. the dollar difference between the closing and opening prices for each trading day). You are given a NumPy array movements of daily price movements from 2010 to 2015 (obtained from Yahoo! Finance), where each row corresponds to a company, and each column corresponds to a trading day. 5 | 6 | Some stocks are more expensive than others. To account for this, include a Normalizer at the beginning of your pipeline. The Normalizer will separately transform each company's stock price to a relative scale before the clustering begins. 7 | 8 | Note that Normalizer() is different to StandardScaler(), which you used in the previous exercise. While StandardScaler() standardizes features (such as the features of the fish data from the previous exercise) by removing the mean and scaling to unit variance, Normalizer() rescales each sample - here, each company's stock price - independently of the other. 9 | 10 | KMeans and make_pipeline have already been imported for you. 11 | 12 | INSTRUCTIONS 13 | 100XP 14 | INSTRUCTIONS 15 | 100XP 16 | Import Normalizer from sklearn.preprocessing. 17 | Create an instance of Normalizer called normalizer. 18 | Create an instance of KMeans called kmeans with 10 clusters. 
19 | Using make_pipeline(), create a pipeline called pipeline that chains normalizer and kmeans. 20 | Fit the pipeline to the movements array. 21 | ''' 22 | from sklearn.cluster import KMeans 23 | # Import Normalizer 24 | from sklearn.preprocessing import Normalizer 25 | 26 | # Create a normalizer: normalizer 27 | normalizer = Normalizer() 28 | 29 | # Create a KMeans model with 10 clusters: kmeans 30 | kmeans = KMeans(n_clusters=10) 31 | 32 | # Make a pipeline chaining normalizer and kmeans: pipeline 33 | pipeline = make_pipeline(normalizer, kmeans) 34 | 35 | # Fit pipeline to the daily price movements 36 | pipeline.fit(movements) 37 | 38 | ''' 39 | Which stocks move together? 40 | 41 | In the previous exercise, you clustered companies by their daily stock price movements. So which company have stock 42 | prices that tend to change in the same way? You'll now inspect the cluster labels from your clustering to find out. 43 | 44 | Your solution to the previous exercise has already been run. Recall that you constructed a Pipeline pipeline 45 | containing a KMeans model and fit it to the NumPy array movements of daily stock movements. In addition, 46 | a list companies of the company names is available. 47 | 48 | INSTRUCTIONS 100XP INSTRUCTIONS 100XP Import pandas as pd. Use the .predict() method of the pipeline to predict the 49 | labels for movements. Align the cluster labels with the list of company names companies by creating a DataFrame df 50 | with labels and companies as columns. This has been done for you. Use the .sort_values() method of df to sort the 51 | DataFrame by the 'labels' column, and print the result. Hit 'Submit Answer' and take a moment to see which companies 52 | are together in each cluster! ''' 53 | # Import pandas 54 | import pandas as pd 55 | 56 | # Predict the cluster labels: labels 57 | labels = pipeline.predict(movements) 58 | 59 | # Create a DataFrame aligning labels and companies: df 60 | df = pd.DataFrame({'labels': labels, 'companies': companies}) 61 | 62 | # Display df sorted by cluster label 63 | print(df.sort_values('labels')) 64 | -------------------------------------------------------------------------------- /src/ml-unsupervised/01-clustering-for-dataset-exploration/ch1_slides.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saranshbansal/data-science-with-python/651e67d93f8aa65ce386978684523f7e14cd3b2a/src/ml-unsupervised/01-clustering-for-dataset-exploration/ch1_slides.pdf -------------------------------------------------------------------------------- /src/ml-unsupervised/01-clustering-for-dataset-exploration/chapter-details.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saranshbansal/data-science-with-python/651e67d93f8aa65ce386978684523f7e14cd3b2a/src/ml-unsupervised/01-clustering-for-dataset-exploration/chapter-details.png -------------------------------------------------------------------------------- /src/ml-unsupervised/01-clustering-for-dataset-exploration/chapter-details.rtf: -------------------------------------------------------------------------------- 1 | {\rtf1\ansi\ansicpg1252\cocoartf1561\cocoasubrtf200 2 | {\fonttbl\f0\fswiss\fcharset0 Helvetica;} 3 | {\colortbl;\red255\green255\blue255;\red85\green92\blue98;\red255\green255\blue255;} 4 | {\*\expandedcolortbl;;\cssrgb\c40784\c43529\c45882;\cssrgb\c100000\c100000\c100000;} 5 | \margl1440\margr1440\vieww10800\viewh8400\viewkind0 6 | \deftab720 7 
| \pard\pardeftab720\partightenfactor0 8 | 9 | \f0\fs30 \cf2 \cb3 \expnd0\expndtw0\kerning0 10 | Learn how to discover the underlying groups (or "clusters") in a dataset. By the end of this chapter, you'll be clustering companies using their stock market prices, and distinguishing different species by clustering their measurements.} -------------------------------------------------------------------------------- /src/ml-unsupervised/02-visualization-with-hierarchical-clustering-and-t-sne/01-hierarchical-clustering-of-the-grain-data.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Hierarchical clustering of the grain data 3 | 4 | In the video, you learned that the SciPy linkage() function performs hierarchical clustering on an array of samples. 5 | Use the linkage() function to obtain a hierarchical clustering of the grain samples, and use dendrogram() to 6 | visualize the result. A sample of the grain measurements is provided in the array samples, while the variety of each 7 | grain sample is given by the list varieties. 8 | 9 | INSTRUCTIONS 100XP Import: linkage and dendrogram from scipy.cluster.hierarchy. matplotlib.pyplot as plt. Perform 10 | hierarchical clustering on samples using the linkage() function with the method='complete' keyword argument. Assign 11 | the result to mergings. Plot a dendrogram using the dendrogram() function on mergings. Specify the keyword arguments 12 | labels=varieties, leaf_rotation=90, and leaf_font_size=6. ''' 13 | 14 | import matplotlib.pyplot as plt 15 | # Perform the necessary imports 16 | from scipy.cluster.hierarchy import linkage, dendrogram 17 | 18 | # Calculate the linkage: mergings 19 | mergings = linkage(samples, method='complete') 20 | 21 | # Plot the dendrogram, using varieties as labels 22 | dendrogram(mergings, 23 | labels=varieties, 24 | leaf_rotation=90, 25 | leaf_font_size=6, 26 | ) 27 | plt.show() 28 | -------------------------------------------------------------------------------- /src/ml-unsupervised/02-visualization-with-hierarchical-clustering-and-t-sne/02-hierarchies-of-stocks.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Hierarchies of stocks 3 | 4 | In chapter 1, you used k-means clustering to cluster companies according to their stock price movements. Now, 5 | you'll perform hierarchical clustering of the companies. You are given a NumPy array of price movements movements, 6 | where the rows correspond to companies, and a list of the company names companies. SciPy hierarchical clustering 7 | doesn't fit into a sklearn pipeline, so you'll need to use the normalize() function from sklearn.preprocessing 8 | instead of Normalizer. 9 | 10 | linkage and dendrogram have already been imported from sklearn.cluster.hierarchy, and PyPlot has been imported as plt. 11 | 12 | INSTRUCTIONS 13 | 100XP 14 | INSTRUCTIONS 15 | 100XP 16 | Import normalize from sklearn.preprocessing. 17 | Rescale the price movements for each stock by using the normalize() function on movements. 18 | Apply the linkage() function to normalized_movements, using 'complete' linkage, to calculate the hierarchical clustering. Assign the result to mergings. 19 | Plot a dendrogram of the hierarchical clustering, using the list companies of company names as the labels. In addition, specify the leaf_rotation=90, and leaf_font_size=6 keyword arguments as you did in the previous exercise. 
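Note: movements and companies are pre-loaded on DataCamp and are not defined in this repository. A hedged sketch for recreating them locally from the bundled _datasets/company-stock-movements-2010-2015-incl.csv is shown here; it assumes the company names sit in the first column and the daily price movements in the remaining columns, which this repo does not confirm:

import pandas as pd

# Assumed layout: column 0 = company name (used as the index),
# remaining columns = daily price movements for 2010-2015.
stocks = pd.read_csv('../../../_datasets/company-stock-movements-2010-2015-incl.csv', index_col=0)
companies = list(stocks.index)
movements = stocks.values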
20 | ''' 21 | # Import normalize 22 | from sklearn.preprocessing import normalize 23 | 24 | # Normalize the movements: normalized_movements 25 | normalized_movements = normalize(movements) 26 | 27 | # Calculate the linkage: mergings 28 | mergings = linkage(normalized_movements, method='complete') 29 | 30 | # Plot the dendrogram 31 | dendrogram(mergings, labels=companies, leaf_rotation=90, leaf_font_size=6) 32 | plt.show() 33 | -------------------------------------------------------------------------------- /src/ml-unsupervised/02-visualization-with-hierarchical-clustering-and-t-sne/03-different-linkage-different-hierarchical-clustering.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Different linkage, different hierarchical clustering! 3 | 4 | In the video, you saw a hierarchical clustering of the voting countries at the Eurovision song contest using 5 | 'complete' linkage. Now, perform a hierarchical clustering of the voting countries with 'single' linkage, and compare 6 | the resulting dendrogram with the one in the video. Different linkage, different hierarchical clustering! 7 | 8 | You are given an array samples. Each row corresponds to a voting country, and each column corresponds to a 9 | performance that was voted for. The list country_names gives the name of each voting country. This dataset was 10 | obtained from Eurovision. 11 | 12 | INSTRUCTIONS 13 | 100XP 14 | INSTRUCTIONS 15 | 100XP 16 | Import: 17 | linkage and dendrogram from scipy.cluster.hierarchy. 18 | matplotlib.pyplot as plt. 19 | Perform hierarchical clustering on samples using the linkage() function with the method='single' keyword argument. Assign the result to mergings. 20 | Plot a dendrogram of the hierarchical clustering, using the list country_names as the labels. In addition, specify the leaf_rotation=90, and leaf_font_size=6 keyword arguments as you have done earlier. 21 | ''' 22 | # Perform the necessary imports 23 | import matplotlib.pyplot as plt 24 | # samples and country_names (the Eurovision voting data and the list of voting country names) are pre-loaded in the exercise environment 25 | from scipy.cluster.hierarchy import linkage, dendrogram 26 | 27 | # Calculate the linkage: mergings 28 | mergings = linkage(samples, method='single') 29 | 30 | # Plot the dendrogram 31 | dendrogram(mergings, labels=country_names, leaf_rotation=90, leaf_font_size=6) 32 | plt.show() 33 | -------------------------------------------------------------------------------- /src/ml-unsupervised/02-visualization-with-hierarchical-clustering-and-t-sne/04-extracting-the-cluster-labels.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Extracting the cluster labels 3 | 4 | In the previous exercise, you saw that the intermediate clustering of the grain samples at height 6 has 3 clusters. 5 | Now, use the fcluster() function to extract the cluster labels for this intermediate clustering, and compare the 6 | labels with the grain varieties using a cross-tabulation. 7 | 8 | The hierarchical clustering has already been performed and mergings is the result of the linkage() function. The list 9 | varieties gives the variety of each grain sample. 10 | 11 | INSTRUCTIONS 12 | 100XP 13 | Import: 14 | pandas as pd. 15 | fcluster from scipy.cluster.hierarchy. 16 | Perform a flat hierarchical clustering by using the fcluster() function on mergings. Specify a maximum height of 6 and the keyword argument criterion='distance'.
17 | Create a DataFrame df with two columns named 'labels' and 'varieties', using labels and varieties, respectively, for the column values. This has been done for you. 18 | Create a cross-tabulation ct between df['labels'] and df['varieties'] to count the number of times each grain variety coincides with each cluster label. 19 | ''' 20 | # Perform the necessary imports 21 | import pandas as pd 22 | from scipy.cluster.hierarchy import fcluster 23 | 24 | # Use fcluster to extract labels: labels 25 | labels = fcluster(mergings, 6, criterion='distance') 26 | 27 | # Create a DataFrame with labels and varieties as columns: df 28 | df = pd.DataFrame({'labels': labels, 'varieties': varieties}) 29 | 30 | # Create crosstab: ct 31 | ct = pd.crosstab(df['labels'], df['varieties']) 32 | 33 | # Display ct 34 | print(ct) 35 | -------------------------------------------------------------------------------- /src/ml-unsupervised/02-visualization-with-hierarchical-clustering-and-t-sne/05-tsne-visualization-of-grain-dataset.py: -------------------------------------------------------------------------------- 1 | ''' 2 | t-SNE visualization of grain dataset 3 | 4 | In the video, you saw t-SNE applied to the iris dataset. In this exercise, you'll apply t-SNE to the grain samples 5 | data and inspect the resulting t-SNE features using a scatter plot. You are given an array samples of grain samples 6 | and a list variety_numbers giving the variety number of each grain sample. 7 | 8 | INSTRUCTIONS 100XP Import TSNE from sklearn.manifold. Create a TSNE instance called model with learning_rate=200. 9 | Apply the .fit_transform() method of model to samples. Assign the result to tsne_features. Select the column 0 of 10 | tsne_features. Assign the result to xs. Select the column 1 of tsne_features. Assign the result to ys. Make a scatter 11 | plot of the t-SNE features xs and ys. To color the points by the grain variety, specify the additional keyword 12 | argument c=variety_numbers. ''' 13 | # Import TSNE 14 | from sklearn.manifold import TSNE 15 | 16 | # Create a TSNE instance: model 17 | model = TSNE(learning_rate=200) 18 | 19 | # Apply fit_transform to samples: tsne_features 20 | tsne_features = model.fit_transform(samples) 21 | 22 | # Select the 0th feature: xs 23 | xs = tsne_features[:, 0] 24 | 25 | # Select the 1st feature: ys 26 | ys = tsne_features[:, 1] 27 | 28 | # Scatter plot, coloring by variety_numbers 29 | plt.scatter(xs, ys, c=variety_numbers) 30 | plt.show() 31 | -------------------------------------------------------------------------------- /src/ml-unsupervised/02-visualization-with-hierarchical-clustering-and-t-sne/06-a-tsne-map-of-the-stock-market.py: -------------------------------------------------------------------------------- 1 | ''' 2 | A t-SNE map of the stock market 3 | 4 | t-SNE provides great visualizations when the individual samples can be labeled. In this exercise, you'll apply t-SNE to the company stock price data. A scatter plot of the resulting t-SNE features, labeled by the company names, gives you a map of the stock market! The stock price movements for each company are available as the array normalized_movements (these have already been normalized for you). The list companies gives the name of each company. PyPlot (plt) has been imported for you. 5 | 6 | INSTRUCTIONS 7 | 100XP 8 | INSTRUCTIONS 9 | 100XP 10 | Import TSNE from sklearn.manifold. 11 | Create a TSNE instance called model with learning_rate=50. 12 | Apply the .fit_transform() method of model to normalized_movements. 
Assign the result to tsne_features. 13 | Select column 0 and column 1 of tsne_features. 14 | Make a scatter plot of the t-SNE features xs and ys. Specify the additional keyword argument alpha=0.5. 15 | Code to label each point with its company name has been written for you using plt.annotate(), so just hit 'Submit Answer' to see the visualization! 16 | ''' 17 | # Import TSNE 18 | from sklearn.manifold import TSNE 19 | 20 | # Create a TSNE instance: model 21 | model = TSNE(learning_rate=50) 22 | 23 | # Apply fit_transform to normalized_movements: tsne_features 24 | tsne_features = model.fit_transform(normalized_movements) 25 | 26 | # Select the 0th feature: xs 27 | xs = tsne_features[:,0] 28 | 29 | # Select the 1th feature: ys 30 | ys = tsne_features[:,1] 31 | 32 | # Scatter plot 33 | plt.scatter(xs, ys, alpha=0.5) 34 | 35 | # Annotate the points 36 | for x, y, company in zip(xs, ys, companies): 37 | plt.annotate(company, (x, y), fontsize=5, alpha=0.75) 38 | plt.show() 39 | -------------------------------------------------------------------------------- /src/ml-unsupervised/02-visualization-with-hierarchical-clustering-and-t-sne/ch2_slides.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saranshbansal/data-science-with-python/651e67d93f8aa65ce386978684523f7e14cd3b2a/src/ml-unsupervised/02-visualization-with-hierarchical-clustering-and-t-sne/ch2_slides.pdf -------------------------------------------------------------------------------- /src/ml-unsupervised/02-visualization-with-hierarchical-clustering-and-t-sne/chapter-details.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saranshbansal/data-science-with-python/651e67d93f8aa65ce386978684523f7e14cd3b2a/src/ml-unsupervised/02-visualization-with-hierarchical-clustering-and-t-sne/chapter-details.png -------------------------------------------------------------------------------- /src/ml-unsupervised/02-visualization-with-hierarchical-clustering-and-t-sne/chapter-details.rtf: -------------------------------------------------------------------------------- 1 | {\rtf1\ansi\ansicpg1252\cocoartf1561\cocoasubrtf200 2 | {\fonttbl\f0\fswiss\fcharset0 Helvetica;} 3 | {\colortbl;\red255\green255\blue255;\red85\green92\blue98;\red255\green255\blue255;} 4 | {\*\expandedcolortbl;;\cssrgb\c40784\c43529\c45882;\cssrgb\c100000\c100000\c100000;} 5 | \margl1440\margr1440\vieww10800\viewh8400\viewkind0 6 | \deftab720 7 | \pard\pardeftab720\partightenfactor0 8 | 9 | \f0\fs30 \cf2 \cb3 \expnd0\expndtw0\kerning0 10 | In this chapter, you'll learn about two unsupervised learning techniques for data visualization, hierarchical clustering and t-SNE. Hierarchical clustering merges the data samples into ever-coarser clusters, yielding a tree visualization of the resulting cluster hierarchy. t-SNE maps the data samples into 2d space so that the proximity of the samples to one another can be visualized.} -------------------------------------------------------------------------------- /src/ml-unsupervised/03-decorrelating-your-data-and-dimension-reduction/01-correlated-data-in-nature.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Correlated data in nature 3 | 4 | You are given an array grains giving the width and length of samples of grain. You suspect that width and length will be correlated. 
To confirm this, make a scatter plot of width vs length and measure their Pearson correlation. 5 | 6 | INSTRUCTIONS 7 | 100XP 8 | Import: 9 | matplotlib.pyplot as plt. 10 | pearsonr from scipy.stats. 11 | Assign column 0 of grains to width and column 1 of grains to length. 12 | Make a scatter plot with width on the x-axis and length on the y-axis. 13 | Use the pearsonr() function to calculate the Pearson correlation of width and length. 14 | ''' 15 | 16 | # Perform the necessary imports 17 | import matplotlib.pyplot as plt 18 | from grains_data_from_dataset import grains 19 | from scipy.stats import pearsonr 20 | 21 | # Assign the 0th column of grains: width 22 | width = grains[:, 0] 23 | 24 | # Assign the 1st column of grains: length 25 | length = grains[:, 1] 26 | # Scatter plot width vs length 27 | plt.scatter(width, length) 28 | plt.axis('equal') 29 | plt.show() 30 | 31 | # Calculate the Pearson correlation 32 | correlation, pvalue = pearsonr(width, length) 33 | 34 | # Display the correlation 35 | print(correlation) 36 | -------------------------------------------------------------------------------- /src/ml-unsupervised/03-decorrelating-your-data-and-dimension-reduction/02-decorrelating-the-grain-measurements-with-pca.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Decorrelating the grain measurements with PCA 3 | 4 | You observed in the previous exercise that the width and length measurements of the grain are correlated. Now, you'll use PCA to decorrelate these measurements, then plot the decorrelated points and measure their Pearson correlation. 5 | 6 | INSTRUCTIONS 7 | 100XP 8 | Import PCA from sklearn.decomposition. 9 | Create an instance of PCA called model. 10 | Use the .fit_transform() method of model to apply the PCA transformation to grains. Assign the result to pca_features. 11 | The subsequent code to extract, plot, and compute the Pearson correlation of the first two columns pca_features has been written for you, so hit 'Submit Answer' to see the result! 12 | ''' 13 | import matplotlib.pyplot as plt 14 | from scipy.stats import pearsonr 15 | # Import PCA 16 | from sklearn.decomposition import PCA 17 | 18 | from grains_data_from_dataset import grains 19 | 20 | # Create PCA instance: model 21 | model = PCA() 22 | 23 | # Apply the fit_transform method of model to grains: pca_features 24 | pca_features = model.fit_transform(grains) 25 | 26 | # Assign 0th column of pca_features: xs 27 | xs = pca_features[:, 0] 28 | 29 | # Assign 1st column of pca_features: ys 30 | ys = pca_features[:, 1] 31 | 32 | # Scatter plot xs vs ys 33 | plt.scatter(xs, ys) 34 | plt.axis('equal') 35 | plt.show() 36 | 37 | # Calculate the Pearson correlation of xs and ys 38 | correlation, pvalue = pearsonr(xs, ys) 39 | 40 | # Display the correlation 41 | print(correlation) 42 | -------------------------------------------------------------------------------- /src/ml-unsupervised/03-decorrelating-your-data-and-dimension-reduction/03-the-first-principal-component.py: -------------------------------------------------------------------------------- 1 | ''' 2 | The first principal component 3 | 4 | The first principal component of the data is the direction in which the data varies the most. In this exercise, 5 | your job is to use PCA to find the first principal component of the length and width measurements of the grain 6 | samples, and represent it as an arrow on the scatter plot. 7 | 8 | The array grains gives the length and width of the grain samples. 
PyPlot (plt) and PCA have already been imported for 9 | you. 10 | 11 | INSTRUCTIONS 12 | 100XP 13 | INSTRUCTIONS 14 | 100XP 15 | Make a scatter plot of the grain measurements. This has been done for you. 16 | Create a PCA instance called model. 17 | Fit the model to the grains data. 18 | Extract the coordinates of the mean of the data using the .mean_ attribute of model. 19 | Get the first principal component of model using the .components_[0,:] attribute. 20 | Plot the first principal component as an arrow on the scatter plot, using the plt.arrow() function. You have to specify the first two arguments - mean[0] and mean[1]. 21 | ''' 22 | import matplotlib.pyplot as plt 23 | from sklearn.decomposition import PCA 24 | 25 | from grains_data_from_dataset import grains 26 | 27 | # Make a scatter plot of the untransformed points 28 | plt.scatter(grains[:, 0], grains[:, 1]) 29 | 30 | # Create a PCA instance: model 31 | model = PCA() 32 | 33 | # Fit model to points 34 | model.fit(grains) 35 | 36 | # Get the mean of the grain samples: mean 37 | mean = model.mean_ 38 | 39 | # Get the first principal component: first_pc 40 | first_pc = model.components_[0, :] 41 | 42 | # Plot first_pc as an arrow, starting at mean 43 | plt.arrow(mean[0], mean[1], first_pc[0], first_pc[1], color='red', width=0.01) 44 | 45 | # Keep axes on same scale 46 | plt.axis('equal') 47 | plt.show() 48 | -------------------------------------------------------------------------------- /src/ml-unsupervised/03-decorrelating-your-data-and-dimension-reduction/04-variance-of-the-pca-features.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Variance of the PCA features 3 | 4 | The fish dataset is 6-dimensional. But what is its intrinsic dimension? Make a plot of the variances of the PCA 5 | features to find out. As before, samples is a 2D array, where each row represents a fish. You'll need to standardize 6 | the features first. 7 | 8 | INSTRUCTIONS 100XP Create an instance of StandardScaler called scaler. Create a PCA instance called pca. Use the 9 | make_pipeline() function to create a pipeline chaining scaler and pca. Use the .fit() method of pipeline to fit it to 10 | the fish samples samples. Extract the number of components used using the .n_components_ attribute of pca. Place this 11 | inside a range() function and store the result as features. Use the plt.bar() function to plot the explained 12 | variances, with features on the x-axis and pca.explained_variance_ on the y-axis. 
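A useful follow-up once the pipeline below has been fitted (not part of the exercise): the fitted PCA step also exposes explained_variance_ratio_, and its cumulative sum shows what share of the total variance the first k components retain, which makes the intrinsic dimension easier to read off than the raw bar heights:

import numpy as np

# pca.explained_variance_ratio_ sums to 1 after pipeline.fit(samples);
# the running total shows how much variance the first k components keep.
cumulative = np.cumsum(pca.explained_variance_ratio_)
print(cumulative)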
''' 13 | import matplotlib.pyplot as plt 14 | import numpy as np 15 | # Perform the necessary imports 16 | from sklearn.decomposition import PCA 17 | from sklearn.pipeline import make_pipeline 18 | from sklearn.preprocessing import StandardScaler 19 | 20 | from helper import points 21 | 22 | samples = np.array(points) 23 | 24 | # Create scaler: scaler 25 | scaler = StandardScaler() 26 | 27 | # Create a PCA instance: pca 28 | pca = PCA() 29 | 30 | # Create pipeline: pipeline 31 | pipeline = make_pipeline(scaler, pca) 32 | 33 | # Fit the pipeline to 'samples' 34 | pipeline.fit(samples) 35 | 36 | # Plot the explained variances 37 | features = range(pca.n_components_) 38 | plt.bar(features, pca.explained_variance_) 39 | plt.xlabel('PCA feature') 40 | plt.ylabel('variance') 41 | plt.xticks(features) 42 | plt.show() 43 | -------------------------------------------------------------------------------- /src/ml-unsupervised/03-decorrelating-your-data-and-dimension-reduction/05-dimension-reduction-of-the-fish-measuremenys.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Dimension reduction of the fish measurements 3 | 4 | In a previous exercise, you saw that 2 was a reasonable choice for the "intrinsic dimension" of the fish 5 | measurements. Now use PCA for dimensionality reduction of the fish measurements, retaining only the 2 most important 6 | components. 7 | 8 | The fish measurements have already been scaled for you, and are available as scaled_samples. 9 | 10 | INSTRUCTIONS 11 | 100XP 12 | Import PCA from sklearn.decomposition. 13 | Create a PCA instance called pca with n_components=2. 14 | Use the .fit() method of pca to fit it to the scaled fish measurements scaled_samples. 15 | Use the .transform() method of pca to transform the scaled_samples. Assign the result to pca_features. 16 | ''' 17 | # Import PCA 18 | from sklearn.decomposition import PCA 19 | 20 | from helper import scaled_samples 21 | 22 | # Create a PCA instance: pca 23 | pca = PCA() 24 | 25 | # Create a PCA model with 2 components: pca 26 | pca = PCA(n_components=2) 27 | 28 | # Fit the PCA instance to the scaled samples 29 | pca.fit(scaled_samples) 30 | 31 | # Transform the scaled samples: pca_features 32 | pca_features = pca.transform(scaled_samples) 33 | 34 | # Print the shape of pca_features 35 | print(pca_features.shape) 36 | -------------------------------------------------------------------------------- /src/ml-unsupervised/03-decorrelating-your-data-and-dimension-reduction/06-a-tfidf-word-frequency-array.py: -------------------------------------------------------------------------------- 1 | ''' 2 | A tf-idf word-frequency array 3 | 4 | In this exercise, you'll create a tf-idf word frequency array for a toy collection of documents. For this, 5 | use the TfidfVectorizer from sklearn. It transforms a list of documents into a word frequency array, which it outputs 6 | as a csr_matrix. It has fit() and transform() methods like other sklearn objects. 7 | 8 | You are given a list documents of toy documents about pets. Its contents have been printed in the IPython Shell. 9 | 10 | INSTRUCTIONS 11 | 100XP 12 | Import TfidfVectorizer from sklearn.feature_extraction.text. 13 | Create a TfidfVectorizer instance called tfidf. 14 | Apply .fit_transform() method of tfidf to documents and assign the result to csr_mat. This is a word-frequency array in csr_matrix format. 15 | Inspect csr_mat by calling its .toarray() method and printing the result. This has been done for you. 
16 | The columns of the array correspond to words. Get the list of words by calling the .get_feature_names() method of tfidf, and assign the result to words. 17 | ''' 18 | # Import TfidfVectorizer 19 | from sklearn.feature_extraction.text import TfidfVectorizer 20 | 21 | documents = ['cats say meow', 'dogs say woof', 'dogs chase cats'] 22 | 23 | # Create a TfidfVectorizer: tfidf 24 | tfidf = TfidfVectorizer() 25 | 26 | # Apply fit_transform to document: csr_mat 27 | csr_mat = tfidf.fit_transform(documents) 28 | 29 | # Print result of toarray() method 30 | print(csr_mat.toarray()) 31 | 32 | # Get the words: words 33 | words = tfidf.get_feature_names() 34 | 35 | # Print words 36 | print(words) 37 | -------------------------------------------------------------------------------- /src/ml-unsupervised/03-decorrelating-your-data-and-dimension-reduction/07-clustering-wikipedia-part-1.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Clustering Wikipedia part I 3 | 4 | You saw in the video that TruncatedSVD is able to perform PCA on sparse arrays in csr_matrix format, such as word-frequency arrays. Combine your knowledge of TruncatedSVD and k-means to cluster some popular pages from Wikipedia. In this exercise, build the pipeline. In the next exercise, you'll apply it to the word-frequency array of some Wikipedia articles. 5 | 6 | Create a Pipeline object consisting of a TruncatedSVD followed by KMeans. (This time, we've precomputed the word-frequency matrix for you, so there's no need for a TfidfVectorizer). 7 | 8 | The Wikipedia dataset you will be working with was obtained from here. 9 | 10 | INSTRUCTIONS 11 | 100XP 12 | Import: 13 | TruncatedSVD from sklearn.decomposition. 14 | KMeans from sklearn.cluster. 15 | make_pipeline from sklearn.pipeline. 16 | Create a TruncatedSVD instance called svd with n_components=50. 17 | Create a KMeans instance called kmeans with n_clusters=6. 18 | Create a pipeline called pipeline consisting of svd and kmeans. 19 | ''' 20 | from sklearn.cluster import KMeans 21 | # Perform the necessary imports 22 | from sklearn.decomposition import TruncatedSVD 23 | from sklearn.pipeline import make_pipeline 24 | 25 | # Create a TruncatedSVD instance: svd 26 | svd = TruncatedSVD(n_components=50) 27 | 28 | # Create a KMeans instance: kmeans 29 | kmeans = KMeans(n_clusters=6) 30 | 31 | # Create a pipeline: pipeline 32 | pipeline = make_pipeline(svd, kmeans) 33 | 34 | -------------- 35 | 36 | ''' 37 | Clustering Wikipedia part II 38 | 39 | It is now time to put your pipeline from the previous exercise to work! You are given an array articles of tf-idf 40 | word-frequencies of some popular Wikipedia articles, and a list titles of their titles. Use your pipeline to cluster 41 | the Wikipedia articles. 42 | 43 | A solution to the previous exercise has been pre-loaded for you, so a Pipeline pipeline chaining TruncatedSVD with 44 | KMeans is available. 45 | 46 | INSTRUCTIONS 47 | 100XP 48 | Import pandas as pd. 49 | Fit the pipeline to the word-frequency array articles. 50 | Predict the cluster labels. 51 | Align the cluster labels with the list titles of article titles by creating a DataFrame df with labels and titles as columns. This has been done for you. 52 | Use the .sort_values() method of df to sort the DataFrame by the 'label' column, and print the result. 53 | Hit 'Submit Answer' and take a moment to investigate your amazing clustering of Wikipedia pages! 
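Note: the tf-idf array articles and the list titles are pre-loaded on DataCamp, and the Wikipedia text itself is not shipped with this repository, so this script will not run as-is. If you have the raw article bodies, a hedged sketch of producing an equivalent sparse matrix with TfidfVectorizer (as in the earlier tf-idf exercise) would be:

from sklearn.feature_extraction.text import TfidfVectorizer

# raw_texts and titles are placeholders for data not included in this repo.
tfidf = TfidfVectorizer(stop_words='english')
articles = tfidf.fit_transform(raw_texts)   # csr_matrix, ready for the TruncatedSVD + KMeans pipeline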
54 | ''' 55 | # Import pandas 56 | import pandas as pd 57 | from helper import titles 58 | 59 | # Fit the pipeline to articles 60 | pipeline.fit(articles) 61 | 62 | # Calculate the cluster labels: labels 63 | labels = pipeline.predict(articles) 64 | 65 | # Create a DataFrame aligning labels and titles: df 66 | df = pd.DataFrame({'label': labels, 'article': titles}) 67 | 68 | # Display df sorted by cluster label 69 | print(df.sort_values('label')) 70 | -------------------------------------------------------------------------------- /src/ml-unsupervised/03-decorrelating-your-data-and-dimension-reduction/08-clustering-wikipedia-part-2.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Clustering Wikipedia part II 3 | 4 | It is now time to put your pipeline from the previous exercise to work! You are given an array articles of tf-idf 5 | word-frequencies of some popular Wikipedia articles, and a list titles of their titles. Use your pipeline to cluster 6 | the Wikipedia articles. 7 | 8 | A solution to the previous exercise has been pre-loaded for you, so a Pipeline pipeline chaining TruncatedSVD with 9 | KMeans is available. 10 | 11 | INSTRUCTIONS 12 | 100XP 13 | Import pandas as pd. 14 | Fit the pipeline to the word-frequency array articles. 15 | Predict the cluster labels. 16 | Align the cluster labels with the list titles of article titles by creating a DataFrame df with labels and titles as columns. This has been done for you. 17 | Use the .sort_values() method of df to sort the DataFrame by the 'label' column, and print the result. 18 | Hit 'Submit Answer' and take a moment to investigate your amazing clustering of Wikipedia pages! 19 | ''' 20 | # Import pandas 21 | import pandas as pd 22 | # Fit the pipeline to articles 23 | pipeline.fit(articles) 24 | 25 | # Calculate the cluster labels: labels 26 | labels = pipeline.predict(articles) 27 | 28 | # Create a DataFrame aligning labels and titles: df 29 | df = pd.DataFrame({'label': labels, 'article': titles}) 30 | 31 | # Display df sorted by cluster label 32 | print(df.sort_values('label')) 33 | -------------------------------------------------------------------------------- /src/ml-unsupervised/03-decorrelating-your-data-and-dimension-reduction/ch3_slides.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saranshbansal/data-science-with-python/651e67d93f8aa65ce386978684523f7e14cd3b2a/src/ml-unsupervised/03-decorrelating-your-data-and-dimension-reduction/ch3_slides.pdf -------------------------------------------------------------------------------- /src/ml-unsupervised/03-decorrelating-your-data-and-dimension-reduction/chapter-details.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saranshbansal/data-science-with-python/651e67d93f8aa65ce386978684523f7e14cd3b2a/src/ml-unsupervised/03-decorrelating-your-data-and-dimension-reduction/chapter-details.png -------------------------------------------------------------------------------- /src/ml-unsupervised/03-decorrelating-your-data-and-dimension-reduction/chapter-details.rtf: -------------------------------------------------------------------------------- 1 | {\rtf1\ansi\ansicpg1252\cocoartf1561\cocoasubrtf200 2 | {\fonttbl\f0\fswiss\fcharset0 Helvetica;} 3 | {\colortbl;\red255\green255\blue255;\red85\green92\blue98;\red255\green255\blue255;} 4 | 
{\*\expandedcolortbl;;\cssrgb\c40784\c43529\c45882;\cssrgb\c100000\c100000\c100000;} 5 | \margl1440\margr1440\vieww10800\viewh8400\viewkind0 6 | \deftab720 7 | \pard\pardeftab720\partightenfactor0 8 | 9 | \f0\fs30 \cf2 \cb3 \expnd0\expndtw0\kerning0 10 | Dimension reduction summarizes a dataset using its common occuring patterns. In this chapter, you'll learn about the most fundamental of dimension reduction techniques, "Principal Component Analysis" ("PCA"). PCA is often used before supervised learning to improve model performance and generalization. It can also be useful for unsupervised learning. For example, you'll employ a variant of PCA will allow you to cluster Wikipedia articles by their content!\cf2 \cb3 .} -------------------------------------------------------------------------------- /src/ml-unsupervised/04-discovering-interpretable-features/01-nmf-applied-to-wikipedia-articles.py: -------------------------------------------------------------------------------- 1 | ''' 2 | NMF applied to Wikipedia articles 3 | 4 | In the video, you saw NMF applied to transform a toy word-frequency array. Now it's your turn to apply NMF, this time using the tf-idf word-frequency array of Wikipedia articles, given as a csr matrix articles. Here, fit the model and transform the articles. In the next exercise, you'll explore the result. 5 | 6 | INSTRUCTIONS 7 | 100XP 8 | Import NMF from sklearn.decomposition. 9 | Create an NMF instance called model with 6 components. 10 | Fit the model to the word count data articles. 11 | Use the .transform() method of model to transform articles, and assign the result to nmf_features. 12 | Print nmf_features to get a first idea what it looks like. 13 | ''' 14 | # Import NMF 15 | from sklearn.decomposition import NMF 16 | 17 | # Create an NMF instance: model 18 | model = NMF(n_components=6) 19 | 20 | # Fit the model to articles 21 | model.fit(articles) 22 | 23 | # Transform the articles: nmf_features 24 | nmf_features = model.transform(articles) 25 | 26 | # Print the NMF features 27 | print(nmf_features) 28 | -------------------------------------------------------------------------------- /src/ml-unsupervised/04-discovering-interpretable-features/02-nmf-features-of-the-wikipedia-articles.py: -------------------------------------------------------------------------------- 1 | ''' 2 | NMF features of the Wikipedia articles 3 | 4 | Now you will explore the NMF features you created in the previous exercise. A solution to the previous exercise has been pre-loaded, so the array nmf_features is available. Also available is a list titles giving the title of each Wikipedia article. 5 | 6 | When investigating the features, notice that for both actors, the NMF feature 3 has by far the highest value. This means that both articles are reconstructed using mainly the 3rd NMF component. In the next video, you'll see why: NMF components represent topics (for instance, acting!). 7 | 8 | INSTRUCTIONS 9 | 100XP 10 | Import pandas as pd. 11 | Create a DataFrame df from nmf_features using pd.DataFrame(). Set the index to titles using index=titles. 12 | Use the .loc[] accessor of df to select the row with title 'Anne Hathaway', and print the result. These are the NMF features for the article about the actress Anne Hathaway. 13 | Repeat the last step for 'Denzel Washington' (another actor). 
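An optional sanity check, not part of the exercise: NMF factorises the tf-idf matrix into features times components, so multiplying an article's NMF feature row by model.components_ approximately reconstructs that article's tf-idf row. Assuming articles and the fitted model from the previous exercise are still in scope:

import numpy as np

i = 0                                          # any article index
approx = nmf_features[i] @ model.components_   # weighted sum of the topic vectors
print(np.round(approx, 2))                     # roughly matches articles[i].toarray()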
14 | ''' 15 | # Import pandas 16 | import pandas as pd 17 | 18 | # Create a pandas DataFrame: df 19 | df = pd.DataFrame(nmf_features, index=titles) 20 | 21 | # Print the row for 'Anne Hathaway' 22 | print(df.loc['Anne Hathaway']) 23 | 24 | # Print the row for 'Denzel Washington' 25 | print(df.loc['Denzel Washington']) 26 | -------------------------------------------------------------------------------- /src/ml-unsupervised/04-discovering-interpretable-features/03-nmf-learns-topics-of-documents.py: -------------------------------------------------------------------------------- 1 | ''' 2 | NMF learns topics of documents 3 | 4 | In the video, you learned when NMF is applied to documents, the components correspond to topics of documents, and the NMF features reconstruct the documents from the topics. Verify this for yourself for the NMF model that you built earlier using the Wikipedia articles. Previously, you saw that the 3rd NMF feature value was high for the articles about actors Anne Hathaway and Denzel Washington. In this exercise, identify the topic of the corresponding NMF component. 5 | 6 | The NMF model you built earlier is available as model, while words is a list of the words that label the columns of the word-frequency array. 7 | 8 | After you are done, take a moment to recognise the topic that the articles about Anne Hathaway and Denzel Washington have in common! 9 | 10 | INSTRUCTIONS 11 | 100XP 12 | Import pandas as pd. 13 | Create a DataFrame components_df from model.components_, setting columns=words so that columns are labeled by the words. 14 | Print components_df.shape to check the dimensions of the DataFrame. 15 | Use the .iloc[] accessor on the DataFrame components_df to select row 3. Assign the result to component. 16 | Call the .nlargest() method of component, and print the result. This gives the five words with the highest values for that component. 17 | ''' 18 | # Import pandas 19 | import pandas as pd 20 | 21 | # Create a DataFrame: components_df 22 | components_df = pd.DataFrame(model.components_, columns=words) 23 | 24 | # Print the shape of the DataFrame 25 | print(components_df.shape) 26 | 27 | # Select row 3: component 28 | component = components_df.iloc[3] 29 | 30 | # Print result of nlargest 31 | print(component.nlargest()) -------------------------------------------------------------------------------- /src/ml-unsupervised/04-discovering-interpretable-features/04-explore-the-led-digits-dataset.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Explore the LED digits dataset 3 | 4 | In the following exercises, you'll use NMF to decompose grayscale images into their commonly occurring patterns. Firstly, explore the image dataset and see how it is encoded as an array. You are given 100 images as a 2D array samples, where each row represents a single 13x8 image. The images in your dataset are pictures of a LED digital display. 5 | 6 | INSTRUCTIONS 7 | 100XP 8 | Import matplotlib.pyplot as plt. 9 | Select row 0 of samples and assign the result to digit. For example, to select column 2 of an array a, you could use a[:,2]. Remember that since samples is a NumPy array, you can't use the .loc[] or iloc[] accessors to select specific rows or columns. 10 | Print digit. This has been done for you. Notice that it is a 1D array of 0s and 1s. 11 | Use the .reshape() method of digit to get a 2D array with shape (13, 8). Assign the result to bitmap. 12 | Print bitmap, and notice that the 1s show the digit 7! 
13 | Use the plt.imshow() function to display bitmap as an image. 14 | ''' 15 | import csv 16 | 17 | import numpy as np 18 | # Import pyplot 19 | from matplotlib import pyplot as plt 20 | 21 | from helper import path 22 | 23 | with open('../' + path + 'lcd-digits.csv', 'r') as f: 24 | samples = list(csv.reader(f, delimiter=',')) 25 | samples = np.array(samples).astype(np.float) 26 | 27 | # Select the 0th row: digit 28 | digit = samples[0, :] 29 | 30 | # Print digit 31 | print(digit) 32 | 33 | # Reshape digit to a 13x8 array: bitmap 34 | bitmap = digit.reshape(13, 8) 35 | 36 | # Print bitmap 37 | print(bitmap) 38 | 39 | # Use plt.imshow to display bitmap 40 | plt.imshow(bitmap, cmap='gray', interpolation='nearest') 41 | plt.colorbar() 42 | plt.show() 43 | -------------------------------------------------------------------------------- /src/ml-unsupervised/04-discovering-interpretable-features/05-nmf-learns-the-parts-of-images.py: -------------------------------------------------------------------------------- 1 | ''' 2 | NMF learns the parts of images 3 | 4 | Now use what you've learned about NMF to decompose the digits dataset. You are again given the digit images as a 2D array samples. This time, you are also provided with a function show_as_image() that displays the image encoded by any 1D array: 5 | 6 | def show_as_image(sample): 7 | bitmap = sample.reshape((13, 8)) 8 | plt.figure() 9 | plt.imshow(bitmap, cmap='gray', interpolation='nearest') 10 | plt.colorbar() 11 | plt.show() 12 | After you are done, take a moment to look through the plots and notice how NMF has expressed the digit as a sum of the components! 13 | 14 | INSTRUCTIONS 15 | 100XP 16 | Import NMF from sklearn.decomposition. 17 | Create an NMF instance called model with 7 components. (7 is the number of cells in an LED display). 18 | Apply the .fit_transform() method of model to samples. Assign the result to features. 19 | To each component of the model (accessed via model.components_), apply the show_as_image() function to that component inside the loop. 20 | Assign the row 0 of features to digit_features. 21 | Print digit_features. 22 | ''' 23 | import csv 24 | 25 | import numpy as np 26 | # Import pyplot 27 | from matplotlib import pyplot as plt 28 | # Import NMF 29 | from sklearn.decomposition import NMF 30 | 31 | from helper import path 32 | 33 | with open('../' + path + 'lcd-digits.csv', 'r') as f: 34 | samples = list(csv.reader(f, delimiter=',')) 35 | samples = np.array(samples).astype(np.float) 36 | 37 | 38 | def show_as_image(sample): 39 | bitmap = sample.reshape((13, 8)) 40 | plt.figure() 41 | plt.imshow(bitmap, cmap='gray', interpolation='nearest') 42 | plt.colorbar() 43 | plt.show() 44 | 45 | 46 | # Create an NMF model: model 47 | model = NMF(n_components=7) 48 | 49 | # Apply fit_transform to samples: features 50 | features = model.fit_transform(samples) 51 | 52 | # Call show_as_image on each component 53 | for component in model.components_: 54 | show_as_image(component) 55 | 56 | # Assign the 0th row of features: digit_features 57 | digit_features = features[0, :] 58 | 59 | # Print digit_features 60 | print(digit_features) 61 | -------------------------------------------------------------------------------- /src/ml-unsupervised/04-discovering-interpretable-features/06-pca-doesnt-learn-parts.py: -------------------------------------------------------------------------------- 1 | ''' 2 | PCA doesn't learn parts 3 | 4 | Unlike NMF, PCA doesn't learn the parts of things. 
Its components do not correspond to topics (in the case of documents) or to parts of images, when trained on images. Verify this for yourself by inspecting the components of a PCA model fit to the dataset of LED digit images from the previous exercise. The images are available as a 2D array samples. Also available is a modified version of the show_as_image() function which colors a pixel red if the value is negative. 5 | 6 | After submitting the answer, notice that the components of PCA do not represent meaningful parts of images of LED digits! 7 | 8 | INSTRUCTIONS 9 | 100XP 10 | Import PCA from sklearn.decomposition. 11 | Create a PCA instance called model with 7 components. 12 | Apply the .fit_transform() method of model to samples. Assign the result to features. 13 | To each component of the model (accessed via model.components_), apply the show_as_image() function to that component inside the loop. 14 | ''' 15 | import csv 16 | 17 | import numpy as np 18 | # Import pyplot 19 | from matplotlib import pyplot as plt 20 | # Import PCA 21 | from sklearn.decomposition import PCA 22 | 23 | from helper import path 24 | 25 | with open('../' + path + 'lcd-digits.csv', 'r') as f: 26 | samples = list(csv.reader(f, delimiter=',')) 27 | samples = np.array(samples).astype(np.float) 28 | 29 | 30 | def show_as_image(sample): 31 | bitmap = sample.reshape((13, 8)) 32 | bitmap[bitmap >= 0] = 1 33 | bitmap[bitmap < 0] = 0 34 | plt.figure() 35 | plt.imshow(bitmap, cmap='gist_yarg', interpolation='nearest', vmin=-.1, vmax=1.1) 36 | plt.colorbar() 37 | plt.show() 38 | 39 | 40 | # Create a PCA instance: model 41 | model = PCA(n_components=7) 42 | 43 | # Apply fit_transform to samples: features 44 | features = model.fit_transform(samples) 45 | 46 | # Call show_as_image on each component 47 | for component in model.components_: 48 | show_as_image(component) 49 | -------------------------------------------------------------------------------- /src/ml-unsupervised/04-discovering-interpretable-features/07-which-articles-are-similar-to-cristiano-ronaldo.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Which articles are similar to 'Cristiano Ronaldo'? 3 | 4 | In the video, you learned how to use NMF features and the cosine similarity to find similar articles. Apply this to your NMF model for popular Wikipedia articles, by finding the articles most similar to the article about the footballer Cristiano Ronaldo. The NMF features you obtained earlier are available as nmf_features, while titles is a list of the article titles. 5 | 6 | INSTRUCTIONS 7 | 100XP 8 | Import normalize from sklearn.preprocessing. 9 | Apply the normalize() function to nmf_features. Store the result as norm_features. 10 | Create a DataFrame df from norm_features, using titles as an index. 11 | Use the .loc[] accessor of df to select the row of 'Cristiano Ronaldo'. Assign the result to article. 12 | Apply the .dot() method of df to article to calculate the cosine similarity of every row with article. 13 | Print the result of the .nlargest() method of similarities to display the most similiar articles. This has been done for you, so hit 'Submit Answer' to see the result! 
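Why this works: after normalize(), every row of norm_features has unit length, so the dot products computed below are exactly the cosine similarities between articles. As an optional cross-check (not part of the exercise), scikit-learn's cosine_similarity applied to the raw NMF features should reproduce the same numbers:

from sklearn.metrics.pairwise import cosine_similarity

# sims[i, j] equals the dot product of the normalized rows i and j,
# i.e. the values in the 'similarities' Series computed below.
sims = cosine_similarity(nmf_features)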
14 | ''' 15 | # Perform the necessary imports 16 | import pandas as pd 17 | from sklearn.preprocessing import normalize 18 | 19 | # Normalize the NMF features: norm_features 20 | norm_features = normalize(nmf_features) 21 | 22 | # Create a DataFrame: df 23 | df = pd.DataFrame(norm_features, index=titles) 24 | 25 | # Select the row corresponding to 'Cristiano Ronaldo': article 26 | article = df.loc['Cristiano Ronaldo'] 27 | 28 | # Compute the dot products: similarities 29 | similarities = df.dot(article) 30 | 31 | # Display those with the largest cosine similarity 32 | print(similarities.nlargest()) -------------------------------------------------------------------------------- /src/ml-unsupervised/04-discovering-interpretable-features/08-recommend-musical-artists-part-1.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Recommend musical artists part I 3 | 4 | In this exercise and the next, you'll use what you've learned about NMF to recommend popular music artists! You are given a sparse array artists whose rows correspond to artists and whose column correspond to users. The entries give the number of times each artist was listened to by each user. 5 | 6 | In this exercise, build a pipeline and transform the array into normalized NMF features. The first step in the pipeline, MaxAbsScaler, transforms the data so that all users have the same influence on the model, regardless of how many different artists they've listened to. In the next exercise, you'll use the resulting normalized NMF features for recommendation! 7 | 8 | This data is part of a larger dataset available here. 9 | 10 | INSTRUCTIONS 11 | 100XP 12 | Import: 13 | NMF from sklearn.decomposition. 14 | Normalizer and MaxAbsScaler from sklearn.preprocessing. 15 | make_pipeline from sklearn.pipeline. 16 | Create an instance of MaxAbsScaler called scaler. 17 | Create an NMF instance with 20 components called nmf. 18 | Create an instance of Normalizer called normalizer. 19 | Create a pipeline called pipeline that chains together scaler, nmf, and normalizer. 20 | Apply the .fit_transform() method of pipeline to artists. Assign the result to norm_features. 21 | ''' 22 | # Perform the necessary imports 23 | from sklearn.decomposition import NMF 24 | from sklearn.preprocessing import Normalizer, MaxAbsScaler 25 | from sklearn.pipeline import make_pipeline 26 | 27 | # Create a MaxAbsScaler: scaler 28 | scaler = MaxAbsScaler() 29 | 30 | # Create an NMF model: nmf 31 | nmf = NMF(n_components=20) 32 | 33 | # Create a Normalizer: normalizer 34 | normalizer = Normalizer() 35 | 36 | # Create a pipeline: pipeline 37 | pipeline = make_pipeline(scaler, nmf, normalizer) 38 | 39 | # Apply fit_transform to artists: norm_features 40 | norm_features = pipeline.fit_transform(artists) 41 | -------------------------------------------------------------------------------- /src/ml-unsupervised/04-discovering-interpretable-features/09-recommend-musical-artists-part-2.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Recommend musical artists part II 3 | 4 | Suppose you were a big fan of Bruce Springsteen - which other musicial artists might you like? Use your NMF features from the previous exercise and the cosine similarity to find similar musical artists. A solution to the previous exercise has been run, so norm_features is an array containing the normalized NMF features as rows. The names of the musical artists are available as the list artist_names. 
5 | 6 | INSTRUCTIONS 7 | 100XP 8 | Import pandas as pd. 9 | Create a DataFrame df from norm_features, using artist_names as an index. 10 | Use the .loc[] accessor of df to select the row of 'Bruce Springsteen'. Assign the result to artist. 11 | Apply the .dot() method of df to artist to calculate the dot product of every row with artist. Save the result as similarities. 12 | Print the result of the .nlargest() method of similarities to display the artists most similar to 'Bruce Springsteen'. 13 | ''' 14 | # Import pandas 15 | import pandas as pd 16 | 17 | # Create a DataFrame: df 18 | df = pd.DataFrame(norm_features, index=artist_names) 19 | 20 | # Select row of 'Bruce Springsteen': artist 21 | artist = df.loc['Bruce Springsteen'] 22 | 23 | # Compute cosine similarities: similarities 24 | similarities = df.dot(artist) 25 | 26 | # Display those with highest cosine similarity 27 | print(similarities.nlargest()) 28 | -------------------------------------------------------------------------------- /src/ml-unsupervised/04-discovering-interpretable-features/ch4_slides.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saranshbansal/data-science-with-python/651e67d93f8aa65ce386978684523f7e14cd3b2a/src/ml-unsupervised/04-discovering-interpretable-features/ch4_slides.pdf -------------------------------------------------------------------------------- /src/ml-unsupervised/04-discovering-interpretable-features/chapter-details.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saranshbansal/data-science-with-python/651e67d93f8aa65ce386978684523f7e14cd3b2a/src/ml-unsupervised/04-discovering-interpretable-features/chapter-details.png -------------------------------------------------------------------------------- /src/ml-unsupervised/04-discovering-interpretable-features/chapter-details.rtf: -------------------------------------------------------------------------------- 1 | {\rtf1\ansi\ansicpg1252\cocoartf1561\cocoasubrtf200 2 | {\fonttbl\f0\fswiss\fcharset0 Helvetica;} 3 | {\colortbl;\red255\green255\blue255;\red44\green44\blue44;\red255\green255\blue255;} 4 | {\*\expandedcolortbl;;\cssrgb\c22745\c22745\c22745;\cssrgb\c100000\c100000\c100000;} 5 | \margl1440\margr1440\vieww10800\viewh8400\viewkind0 6 | \deftab720 7 | \pard\pardeftab720\partightenfactor0 8 | 9 | \f0\fs30 \cf2 \cb3 \expnd0\expndtw0\kerning0 10 | In this chapter, you'll learn about a dimension reduction technique called "Non-negative matrix factorization" ("NMF") that expresses samples as combinations of interpretable parts. For example, it expresses documents as combinations of topics, and images in terms of commonly occurring visual patterns. 
You'll also learn to use NMF to build recommender systems that can find you similar articles to read, or musical artists that match your listening history!} -------------------------------------------------------------------------------- /src/ml-unsupervised/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saranshbansal/data-science-with-python/651e67d93f8aa65ce386978684523f7e14cd3b2a/src/ml-unsupervised/__init__.py -------------------------------------------------------------------------------- /src/ml-unsupervised/course-description.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saranshbansal/data-science-with-python/651e67d93f8aa65ce386978684523f7e14cd3b2a/src/ml-unsupervised/course-description.png -------------------------------------------------------------------------------- /src/ml-unsupervised/k-means_clustering.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from sklearn.cluster import KMeans 4 | 5 | from helper import path 6 | 7 | # Read the CSV file into a DataFrame: df 8 | df = pd.read_csv(path + 'data_1024.csv', sep='\t') 9 | 10 | f1 = df['Distance_Feature'].values 11 | f2 = df['Speeding_Feature'].values 12 | 13 | # Stack the two features into an (n_samples, 2) array; np.matrix(zip(...)) fails on Python 3 because zip returns a lazy iterator 14 | X = np.column_stack((f1, f2)) 15 | kmeans = KMeans(n_clusters=2).fit(X) 16 | -------------------------------------------------------------------------------- /src/python_core/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saranshbansal/data-science-with-python/651e67d93f8aa65ce386978684523f7e14cd3b2a/src/python_core/__init__.py -------------------------------------------------------------------------------- /src/python_core/output_questions/1.py: -------------------------------------------------------------------------------- 1 | # what will be the output? 2 | 3 | n = [1, 2, 3, 4, 5, 10, 3, 100, 9, 24] 4 | 5 | n1 = [i for i in n if i > 5] 6 | 7 | for e in n: 8 | print('inter-> ') 9 | if e < 5: 10 | print('removing: {}'.format(e)) 11 | n.remove(e) 12 | print('list after removal: {}'.format(n)) 13 | 14 | print(n) 15 | print(n1) 16 | -------------------------------------------------------------------------------- /src/python_core/output_questions/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saranshbansal/data-science-with-python/651e67d93f8aa65ce386978684523f7e14cd3b2a/src/python_core/output_questions/__init__.py --------------------------------------------------------------------------------
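A note on src/python_core/output_questions/1.py above: the flattened listing drops indentation, so assume the conventional layout (the 'removing' print and n.remove(e) inside the if block, the final two prints outside the loop). Because the list is mutated while it is being iterated, each removal shifts the remaining items one slot to the left and the iterator skips the element that slides into the freed position, so 2 and 4 are never examined even though they are below 5. Under that assumption the script ends with n == [2, 4, 5, 10, 100, 9, 24] and n1 == [10, 100, 9, 24]. A safer idiom is to build a new filtered list instead of removing in place:

n = [1, 2, 3, 4, 5, 10, 3, 100, 9, 24]
n = [i for i in n if i >= 5]   # keep everything the loop intended to keep
print(n)                       # [5, 10, 100, 9, 24]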