├── scripts └── install_requirements.sh ├── decision_trees ├── __pycache__ │ ├── data.cpython-35.pyc │ └── utils.cpython-35.pyc ├── data.py ├── model.py └── utils.py ├── neural_network ├── __pycache__ │ ├── data.cpython-35.pyc │ └── utils.cpython-35.pyc ├── data.py ├── model.py └── utils.py ├── network_analysis ├── __pycache__ │ ├── data.cpython-35.pyc │ └── utils.cpython-35.pyc ├── data.py ├── model.py └── utils.py ├── working_with_data ├── __pycache__ │ ├── data.cpython-35.pyc │ └── utils.cpython-35.pyc ├── comma_delimited_stock_prices.csv ├── model.py └── data.py ├── k_means_clustering ├── __pycache__ │ ├── data.cpython-35.pyc │ └── utils.cpython-35.pyc ├── data.py ├── model.py ├── Understanding the algorithm.md └── utils.py ├── k_nearest_neighbors ├── __pycache__ │ ├── data.cpython-35.pyc │ └── utils.cpython-35.pyc ├── model.py ├── Understanding the algorithm.md ├── data.py └── utils.py ├── logistic_regression ├── __pycache__ │ ├── data.cpython-35.pyc │ └── utils.cpython-35.pyc ├── utils.py ├── model.py └── data.py ├── multiple_regression ├── __pycache__ │ ├── data.cpython-35.pyc │ └── utils.cpython-35.pyc ├── model.py ├── utils.py └── data.py ├── recommender_systems ├── __pycache__ │ ├── data.cpython-35.pyc │ └── utils.cpython-35.pyc ├── data.py ├── model.py └── utils.py ├── LDA scikit-learn ├── __pycache__ │ ├── preprocess.cpython-34.pyc │ ├── load20newsgroups.cpython-34.pyc │ └── nmf_lda_scikitlearn.cpython-34.pyc ├── load20newsgroups.py ├── nmf_lda_scikitlearn.py ├── displaytopics.py └── preprocess.py ├── natural_language_processing ├── __pycache__ │ ├── data.cpython-35.pyc │ └── utils.cpython-35.pyc ├── model.py ├── data.py └── utils.py ├── .github ├── dependabot.yml └── ISSUE_TEMPLATE │ ├── feature_request.md │ └── bug_report.md ├── SECURITY.md ├── naive_bayes_classfier ├── naivebayesclassifier.py ├── utils.py └── model.py ├── requirements.txt ├── mnist-deep-learning.py ├── logistic_regression_banking ├── utils.py └── binary_logisitic_regression.py ├── LICENSE ├── simple_linear_regression ├── model.py ├── utils.py └── data.py ├── helpers ├── machine_learning.py ├── linear_algebra.py ├── probabilty.py ├── gradient_descent.py └── stats.py ├── .gitignore ├── NN_churn_prediction.py ├── telecom_churn_prediction.py ├── Anamoly_Detection_notes.md ├── regression_intro.py ├── hparams_grid_search_keras_nn.py ├── CODE_OF_CONDUCT.md ├── use_cases_insurnace.md ├── Understanding Vanishing Gradient.md ├── CONTRIBUTING.md ├── prec_rec_curve.py ├── Understanding SQL Queries.md ├── hypothesis_inference.py ├── friendster_network.py ├── README.md └── sonar_clf_rf.py /scripts/install_requirements.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | pip install -r requirements.txt 4 | -------------------------------------------------------------------------------- /decision_trees/__pycache__/data.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devAmoghS/Machine-Learning-with-Python/HEAD/decision_trees/__pycache__/data.cpython-35.pyc -------------------------------------------------------------------------------- /decision_trees/__pycache__/utils.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devAmoghS/Machine-Learning-with-Python/HEAD/decision_trees/__pycache__/utils.cpython-35.pyc -------------------------------------------------------------------------------- 
/neural_network/__pycache__/data.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devAmoghS/Machine-Learning-with-Python/HEAD/neural_network/__pycache__/data.cpython-35.pyc -------------------------------------------------------------------------------- /neural_network/__pycache__/utils.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devAmoghS/Machine-Learning-with-Python/HEAD/neural_network/__pycache__/utils.cpython-35.pyc -------------------------------------------------------------------------------- /network_analysis/__pycache__/data.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devAmoghS/Machine-Learning-with-Python/HEAD/network_analysis/__pycache__/data.cpython-35.pyc -------------------------------------------------------------------------------- /network_analysis/__pycache__/utils.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devAmoghS/Machine-Learning-with-Python/HEAD/network_analysis/__pycache__/utils.cpython-35.pyc -------------------------------------------------------------------------------- /working_with_data/__pycache__/data.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devAmoghS/Machine-Learning-with-Python/HEAD/working_with_data/__pycache__/data.cpython-35.pyc -------------------------------------------------------------------------------- /k_means_clustering/__pycache__/data.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devAmoghS/Machine-Learning-with-Python/HEAD/k_means_clustering/__pycache__/data.cpython-35.pyc -------------------------------------------------------------------------------- /k_means_clustering/__pycache__/utils.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devAmoghS/Machine-Learning-with-Python/HEAD/k_means_clustering/__pycache__/utils.cpython-35.pyc -------------------------------------------------------------------------------- /k_nearest_neighbors/__pycache__/data.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devAmoghS/Machine-Learning-with-Python/HEAD/k_nearest_neighbors/__pycache__/data.cpython-35.pyc -------------------------------------------------------------------------------- /k_nearest_neighbors/__pycache__/utils.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devAmoghS/Machine-Learning-with-Python/HEAD/k_nearest_neighbors/__pycache__/utils.cpython-35.pyc -------------------------------------------------------------------------------- /logistic_regression/__pycache__/data.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devAmoghS/Machine-Learning-with-Python/HEAD/logistic_regression/__pycache__/data.cpython-35.pyc -------------------------------------------------------------------------------- /logistic_regression/__pycache__/utils.cpython-35.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/devAmoghS/Machine-Learning-with-Python/HEAD/logistic_regression/__pycache__/utils.cpython-35.pyc -------------------------------------------------------------------------------- /multiple_regression/__pycache__/data.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devAmoghS/Machine-Learning-with-Python/HEAD/multiple_regression/__pycache__/data.cpython-35.pyc -------------------------------------------------------------------------------- /multiple_regression/__pycache__/utils.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devAmoghS/Machine-Learning-with-Python/HEAD/multiple_regression/__pycache__/utils.cpython-35.pyc -------------------------------------------------------------------------------- /recommender_systems/__pycache__/data.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devAmoghS/Machine-Learning-with-Python/HEAD/recommender_systems/__pycache__/data.cpython-35.pyc -------------------------------------------------------------------------------- /recommender_systems/__pycache__/utils.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devAmoghS/Machine-Learning-with-Python/HEAD/recommender_systems/__pycache__/utils.cpython-35.pyc -------------------------------------------------------------------------------- /working_with_data/__pycache__/utils.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devAmoghS/Machine-Learning-with-Python/HEAD/working_with_data/__pycache__/utils.cpython-35.pyc -------------------------------------------------------------------------------- /LDA scikit-learn/__pycache__/preprocess.cpython-34.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devAmoghS/Machine-Learning-with-Python/HEAD/LDA scikit-learn/__pycache__/preprocess.cpython-34.pyc -------------------------------------------------------------------------------- /working_with_data/comma_delimited_stock_prices.csv: -------------------------------------------------------------------------------- 1 | 6/20/2014,AAPL,90.91 2 | 6/20/2014,MSFT,41.68 3 | 6/20/3014,FB,64.5 4 | 6/19/2014,AAPL,91.86 5 | 6/19/2014,MSFT,n/a 6 | 6/19/2014,FB,64.34 -------------------------------------------------------------------------------- /natural_language_processing/__pycache__/data.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devAmoghS/Machine-Learning-with-Python/HEAD/natural_language_processing/__pycache__/data.cpython-35.pyc -------------------------------------------------------------------------------- /LDA scikit-learn/__pycache__/load20newsgroups.cpython-34.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devAmoghS/Machine-Learning-with-Python/HEAD/LDA scikit-learn/__pycache__/load20newsgroups.cpython-34.pyc -------------------------------------------------------------------------------- /natural_language_processing/__pycache__/utils.cpython-35.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/devAmoghS/Machine-Learning-with-Python/HEAD/natural_language_processing/__pycache__/utils.cpython-35.pyc -------------------------------------------------------------------------------- /LDA scikit-learn/__pycache__/nmf_lda_scikitlearn.cpython-34.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devAmoghS/Machine-Learning-with-Python/HEAD/LDA scikit-learn/__pycache__/nmf_lda_scikitlearn.cpython-34.pyc -------------------------------------------------------------------------------- /LDA scikit-learn/load20newsgroups.py: -------------------------------------------------------------------------------- 1 | from sklearn.datasets import fetch_20newsgroups 2 | 3 | dataset = fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers', 'footers', 'quotes')) 4 | documents = dataset.data 5 | -------------------------------------------------------------------------------- /k_means_clustering/data.py: -------------------------------------------------------------------------------- 1 | inputs = [[-14, -5], 2 | [13, 13], 3 | [20, 23], 4 | [-19, -11], 5 | [-9, -16], 6 | [21, 27], 7 | [-49, 15], 8 | [26, 13], 9 | [-46, 5], 10 | [-34, -1], 11 | [11, 15], 12 | [-49, 0], 13 | [-22, -16], 14 | [19, 28], 15 | [-12, -8], 16 | [-13, -19], 17 | [-41, 8], 18 | [-11, -6], 19 | [-25, -9], 20 | [-18, -3]] 21 | -------------------------------------------------------------------------------- /LDA scikit-learn/nmf_lda_scikitlearn.py: -------------------------------------------------------------------------------- 1 | from sklearn.decomposition import NMF, LatentDirichletAllocation 2 | from preprocess import tfidf, tf 3 | num_topics = 20 4 | 5 | # Run NMF 6 | nmf = NMF(n_components=num_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf) 7 | 8 | # Run LDA 9 | lda = LatentDirichletAllocation(n_topics=num_topics, max_iter=5, learning_method='online', 10 | learning_offset=50, random_state=0).fit(tf) 11 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # To get started with Dependabot version updates, you'll need to specify whic 2 | # package ecosystems to update and where the package manifests are located. 
3 | # Please see the documentation for all configuration options:
4 | # https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates
5 | 
6 | version: 2
7 | updates:
8 |   - package-ecosystem: "pip" # pip ecosystem: this repo's manifest is the requirements.txt at the root
9 |     directory: "/" # Location of package manifests
10 |     schedule:
11 |       interval: "weekly"
12 | 
--------------------------------------------------------------------------------
/LDA scikit-learn/displaytopics.py:
--------------------------------------------------------------------------------
1 | from nmf_lda_scikitlearn import nmf, lda
2 | from preprocess import tfidf_feature_names, tf_feature_names
3 | 
4 | 
5 | def display_topics(model, feature_names, num_top_words):
6 |     for topic_idx, topic in enumerate(model.components_):
7 |         print("Topic %d:" % (topic_idx))
8 |         print(" ".join([feature_names[i]
9 |                         for i in topic.argsort()[:-num_top_words - 1:-1]])
10 |               )
11 | 
12 | 
13 | num_top_words = 10
14 | display_topics(nmf, tfidf_feature_names, num_top_words)
15 | display_topics(lda, tf_feature_names, num_top_words)
16 | 
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Feature request
3 | about: Suggest an idea for this project
4 | title: ''
5 | labels: ''
6 | assignees: ''
7 | 
8 | ---
9 | 
10 | **Is your feature request related to a problem? Please describe.**
11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
12 | 
13 | **Describe the solution you'd like**
14 | A clear and concise description of what you want to happen.
15 | 
16 | **Describe alternatives you've considered**
17 | A clear and concise description of any alternative solutions or features you've considered.
18 | 
19 | **Additional context**
20 | Add any other context or screenshots about the feature request here.
21 | 
--------------------------------------------------------------------------------
/SECURITY.md:
--------------------------------------------------------------------------------
1 | # Security Policy
2 | 
3 | ## Supported Versions
4 | 
5 | Use this section to tell people about which versions of your project are
6 | currently being supported with security updates.
7 | 
8 | | Version | Supported          |
9 | | ------- | ------------------ |
10 | | 5.1.x   | :white_check_mark: |
11 | | 5.0.x   | :x:                |
12 | | 4.0.x   | :white_check_mark: |
13 | | < 4.0   | :x:                |
14 | 
15 | ## Reporting a Vulnerability
16 | 
17 | Use this section to tell people how to report a vulnerability.
18 | 
19 | Tell them where to go, how often they can expect to get an update on a
20 | reported vulnerability, what to expect if the vulnerability is accepted or
21 | declined, etc.
22 | -------------------------------------------------------------------------------- /network_analysis/data.py: -------------------------------------------------------------------------------- 1 | users = [ 2 | { "id": 0, "name": "Hero" }, 3 | { "id": 1, "name": "Dunn" }, 4 | { "id": 2, "name": "Sue" }, 5 | { "id": 3, "name": "Chi" }, 6 | { "id": 4, "name": "Thor" }, 7 | { "id": 5, "name": "Clive" }, 8 | { "id": 6, "name": "Hicks" }, 9 | { "id": 7, "name": "Devin" }, 10 | { "id": 8, "name": "Kate" }, 11 | { "id": 9, "name": "Klein" } 12 | ] 13 | 14 | friendships = [(0, 1), (0, 2), (1, 2), (1, 3), (2, 3), (3, 4), 15 | (4, 5), (5, 6), (5, 7), (6, 8), (7, 8), (8, 9)] 16 | 17 | endorsements = [(0, 1), (1, 0), (0, 2), (2, 0), (1, 2), 18 | (2, 1), (1, 3), (2, 3), (3, 4), (5, 4), 19 | (5, 6), (7, 5), (6, 8), (8, 7), (8, 9)] -------------------------------------------------------------------------------- /LDA scikit-learn/preprocess.py: -------------------------------------------------------------------------------- 1 | from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer 2 | from load20newsgroups import documents 3 | 4 | num_features = 1000 5 | 6 | # NMF is able to use tf-idf 7 | tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=num_features, stop_words='english') 8 | tfidf = tfidf_vectorizer.fit_transform(documents) 9 | tfidf_feature_names = tfidf_vectorizer.get_feature_names() 10 | 11 | # LDA can only use raw term counts 12 | # because it is a probablistic graphical model 13 | tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=num_features, stop_words='english') 14 | tf = tf_vectorizer.fit_transform(documents) 15 | tf_feature_names = tf_vectorizer.get_feature_names() 16 | -------------------------------------------------------------------------------- /network_analysis/model.py: -------------------------------------------------------------------------------- 1 | from network_analysis.data import users 2 | from network_analysis.utils import eigenvector_centralities, page_rank 3 | 4 | if __name__ == '__main__': 5 | 6 | print("Betweenness Centrality") 7 | for user in users: 8 | print(user["id"], user["betweenness_centrality"]) 9 | print() 10 | 11 | print("Closeness Centrality") 12 | for user in users: 13 | print(user["id"], user["closeness_centrality"]) 14 | print() 15 | 16 | print("Eigenvector Centrality") 17 | for user_id, centrality in enumerate(eigenvector_centralities): 18 | print(user_id, centrality) 19 | print() 20 | 21 | print("PageRank") 22 | for user_id, pr in page_rank(users).items(): 23 | print(user_id, pr) -------------------------------------------------------------------------------- /naive_bayes_classfier/naivebayesclassifier.py: -------------------------------------------------------------------------------- 1 | from naive_bayes_classfier.utils import count_words, word_probabilities, spam_probability 2 | 3 | 4 | class NaiveBayesClassifier: 5 | 6 | def __init__(self, k=0.5): 7 | self.k = k 8 | self.word_probs = [] 9 | 10 | def train(self, training_set): 11 | 12 | # count spam and non-spam messages 13 | num_spams = len([is_spam 14 | for message, is_spam in training_set 15 | if is_spam]) 16 | num_non_spams = len(training_set) - num_spams 17 | 18 | # run training data through a "pipeline" 19 | word_counts = count_words(training_set) 20 | self.word_probs = word_probabilities(word_counts, num_spams, num_non_spams, self.k) 21 | 22 | def classify(self, message): 23 | return spam_probability(self.word_probs, message) 24 | 
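25 | 
26 | if __name__ == "__main__":
27 |     # Minimal usage sketch (illustrative only, not repository data): the four
28 |     # toy messages below are made up purely to show the (message, is_spam)
29 |     # pair format that train() expects; classify() returns P(spam | message).
30 |     toy_training_set = [
31 |         ("win a free prize now", True),
32 |         ("cheap meds limited offer", True),
33 |         ("meeting notes for tomorrow", False),
34 |         ("are we still on for lunch", False),
35 |     ]
36 | 
37 |     demo_classifier = NaiveBayesClassifier(k=0.5)
38 |     demo_classifier.train(toy_training_set)
39 |     print(demo_classifier.classify("claim your free prize"))   # high spam probability
40 |     print(demo_classifier.classify("notes from the meeting"))  # low spam probability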
-------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | Keras==2.13.1 2 | Keras-Preprocessing==1.0.5 3 | PySocks==1.6.8 4 | Pygments==2.15.0 5 | Quandl==3.4.5 6 | asn1crypto==0.24.0 7 | backcall==0.1.0 8 | beautifulsoup4==4.6.3 9 | certifi==2023.7.22 10 | cffi==1.11.5 11 | chardet==3.0.4 12 | cryptography==44.0.1 13 | cycler==0.10.0 14 | h5py==2.9.0 15 | idna==3.7 16 | inflection==0.3.1 17 | ipython==8.10.0 18 | jedi==0.13.2 19 | kiwisolver==1.0.1 20 | matplotlib==3.0.0 21 | more-itertools==5.0.0 22 | numpy==1.22.0 23 | pandas==0.23.4 24 | patsy==0.5.0 25 | pexpect==4.6.0 26 | pickleshare==0.7.5 27 | pip==23.3 28 | ptyprocess==0.6.0 29 | pyOpenSSL==18.0.0 30 | pycparser==2.19 31 | pyparsing==2.2.1 32 | python-dateutil==2.7.3 33 | pytz==2018.5 34 | requests>=2.20.0 35 | scikit-learn==1.5.0 36 | scipy==1.10.0 37 | seaborn==0.9.0 38 | setuptools==70.0.0 39 | six==1.11.0 40 | statsmodels==0.9.0 41 | tornado==6.4.2 42 | traitlets==4.3.2 43 | wcwidth==0.1.7 44 | wheel==0.38.1 45 | -------------------------------------------------------------------------------- /neural_network/data.py: -------------------------------------------------------------------------------- 1 | raw_digits = [ 2 | """11111 3 | 1...1 4 | 1...1 5 | 1...1 6 | 11111""", 7 | 8 | """..1.. 9 | ..1.. 10 | ..1.. 11 | ..1.. 12 | ..1..""", 13 | 14 | """11111 15 | ....1 16 | 11111 17 | 1.... 18 | 11111""", 19 | 20 | """11111 21 | ....1 22 | 11111 23 | ....1 24 | 11111""", 25 | 26 | """1...1 27 | 1...1 28 | 11111 29 | ....1 30 | ....1""", 31 | 32 | """11111 33 | 1.... 34 | 11111 35 | ....1 36 | 11111""", 37 | 38 | """11111 39 | 1.... 40 | 11111 41 | 1...1 42 | 11111""", 43 | 44 | """11111 45 | ....1 46 | ....1 47 | ....1 48 | ....1""", 49 | 50 | """11111 51 | 1...1 52 | 11111 53 | 1...1 54 | 11111""", 55 | 56 | """11111 57 | 1...1 58 | 11111 59 | ....1 60 | 11111"""] -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 1. Go to '...' 16 | 2. Click on '....' 17 | 3. Scroll down to '....' 18 | 4. See error 19 | 20 | **Expected behavior** 21 | A clear and concise description of what you expected to happen. 22 | 23 | **Screenshots** 24 | If applicable, add screenshots to help explain your problem. 25 | 26 | **Desktop (please complete the following information):** 27 | - OS: [e.g. iOS] 28 | - Browser [e.g. chrome, safari] 29 | - Version [e.g. 22] 30 | 31 | **Smartphone (please complete the following information):** 32 | - Device: [e.g. iPhone6] 33 | - OS: [e.g. iOS8.1] 34 | - Browser [e.g. stock browser, safari] 35 | - Version [e.g. 22] 36 | 37 | **Additional context** 38 | Add any other context about the problem here. 
39 | -------------------------------------------------------------------------------- /recommender_systems/data.py: -------------------------------------------------------------------------------- 1 | users_interests = [ 2 | ["Hadoop", "Big Data", "HBase", "Java", "Spark", "Storm", "Cassandra"], 3 | ["NoSQL", "MongoDB", "Cassandra", "HBase", "Postgres"], 4 | ["Python", "scikit-learn", "scipy", "numpy", "statsmodels", "pandas"], 5 | ["R", "Python", "statistics", "regression", "probability"], 6 | ["machine learning", "regression", "decision trees", "libsvm"], 7 | ["Python", "R", "Java", "C++", "Haskell", "programming languages"], 8 | ["statistics", "probability", "mathematics", "theory"], 9 | ["machine learning", "scikit-learn", "Mahout", "neural networks"], 10 | ["neural networks", "deep learning", "Big Data", "artificial intelligence"], 11 | ["Hadoop", "Java", "MapReduce", "Big Data"], 12 | ["statistics", "R", "statsmodels"], 13 | ["C++", "deep learning", "artificial intelligence", "probability"], 14 | ["pandas", "R", "Python"], 15 | ["databases", "HBase", "Postgres", "MySQL", "MongoDB"], 16 | ["libsvm", "regression", "support vector machines"] 17 | ] 18 | -------------------------------------------------------------------------------- /mnist-deep-learning.py: -------------------------------------------------------------------------------- 1 | from keras.datasets import mnist 2 | (train_images, train_labels), (test_images, test_labels) = mnist.load_data() 3 | 4 | # 1. Prepare the Data 5 | from keras.utils import to_categorical 6 | train_images = train_images.reshape((60000, 28 * 28)).astype('float32') / 255 7 | test_images = test_images.reshape((10000, 28 * 28)).astype('float32') / 255 8 | train_labels = to_categorical(train_labels) 9 | test_labels = to_categorical(test_labels) 10 | 11 | # 2. Set up network architecture 12 | from keras import models, layers 13 | network = models.Sequential() 14 | network.add(layers.Dense(512, activation='relu', input_shape=(28 * 28,))) 15 | network.add(layers.Dense(10, activation='softmax')) 16 | 17 | # 3/4. Pick Optimizer and Loss 18 | network.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy']) 19 | network.fit(train_images, train_labels, epochs=5, batch_size=128) 20 | 21 | # 5. 
Measure on test 22 | test_loss, test_acc = network.evaluate(test_images, test_labels) 23 | print('test_acc:', test_acc) 24 | -------------------------------------------------------------------------------- /logistic_regression_banking/utils.py: -------------------------------------------------------------------------------- 1 | import seaborn as sns 2 | import matplotlib.pyplot as plt 3 | 4 | plt.rc("font", size=14) 5 | sns.set(style="white") 6 | sns.set(style="whitegrid", color_codes=True) 7 | 8 | 9 | def plot_data(data): 10 | # barplot for the depencent variable 11 | sns.countplot(x='y', data=data, palette='hls') 12 | plt.show() 13 | 14 | # check the missing values 15 | print(data.isnull().sum()) 16 | 17 | # customer distribution plot 18 | sns.countplot(y='job', data=data) 19 | plt.show() 20 | 21 | # customer marital status distribution 22 | sns.countplot(x='marital', data=data) 23 | plt.show() 24 | 25 | # barplot for credit in default 26 | sns.countplot(x='default', data=data) 27 | plt.show() 28 | 29 | # barptot for housing loan 30 | sns.countplot(x='housing', data=data) 31 | plt.show() 32 | 33 | # barplot for personal loan 34 | sns.countplot(x='loan', data=data) 35 | plt.show() 36 | 37 | # barplot for previous marketing campaign outcome 38 | sns.countplot(x='poutcome', data=data) 39 | plt.show() 40 | 41 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Amogh Singhal 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /simple_linear_regression/model.py: -------------------------------------------------------------------------------- 1 | import random 2 | from helpers.gradient_descent import minimize_stochastic 3 | from simple_linear_regression.data import num_friends_good, daily_minutes_good 4 | from simple_linear_regression.utils import least_squares_fit, r_squared, squared_error, squared_error_gradient 5 | 6 | if __name__ == '__main__': 7 | 8 | alpha, beta = least_squares_fit(num_friends_good, daily_minutes_good) 9 | print("alpha", alpha) 10 | print("beta", beta) 11 | 12 | print("r-squared", r_squared(alpha, beta, num_friends_good, daily_minutes_good)) 13 | 14 | print() 15 | 16 | print("gradient descent:") 17 | # choose random value to start 18 | random.seed(0) 19 | theta = [random.random(), random.random()] 20 | alpha, beta = minimize_stochastic(squared_error, 21 | squared_error_gradient, 22 | num_friends_good, 23 | daily_minutes_good, 24 | theta, 25 | 0.0001) 26 | print("alpha", alpha) 27 | print("beta", beta) 28 | -------------------------------------------------------------------------------- /decision_trees/data.py: -------------------------------------------------------------------------------- 1 | inputs = [ 2 | ({'level': 'Senior', 'lang': 'Java', 'tweets': 'no', 'phd': 'no'}, False), 3 | ({'level': 'Senior', 'lang': 'Java', 'tweets': 'no', 'phd': 'yes'}, False), 4 | ({'level': 'Mid', 'lang': 'Python', 'tweets': 'no', 'phd': 'no'}, True), 5 | ({'level': 'Junior', 'lang': 'Python', 'tweets': 'no', 'phd': 'no'}, True), 6 | ({'level': 'Junior', 'lang': 'R', 'tweets': 'yes', 'phd': 'no'}, True), 7 | ({'level': 'Junior', 'lang': 'R', 'tweets': 'yes', 'phd': 'yes'}, False), 8 | ({'level': 'Mid', 'lang': 'R', 'tweets': 'yes', 'phd': 'yes'}, True), 9 | ({'level': 'Senior', 'lang': 'Python', 'tweets': 'no', 'phd': 'no'}, False), 10 | ({'level': 'Senior', 'lang': 'R', 'tweets': 'yes', 'phd': 'no'}, True), 11 | ({'level': 'Junior', 'lang': 'Python', 'tweets': 'yes', 'phd': 'no'}, True), 12 | ({'level': 'Senior', 'lang': 'Python', 'tweets': 'yes', 'phd': 'yes'}, True), 13 | ({'level': 'Mid', 'lang': 'Python', 'tweets': 'no', 'phd': 'yes'}, True), 14 | ({'level': 'Mid', 'lang': 'Java', 'tweets': 'yes', 'phd': 'no'}, True), 15 | ({'level': 'Junior', 'lang': 'Python', 'tweets': 'no', 'phd': 'yes'}, False) 16 | ] -------------------------------------------------------------------------------- /k_means_clustering/model.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | from k_means_clustering.data import inputs 4 | from k_means_clustering.utils import KMeans, bottom_up_cluster, \ 5 | generate_clusters, get_values 6 | 7 | if __name__ == '__main__': 8 | random.seed(0) 9 | cluster = KMeans(3) 10 | cluster.train(inputs=inputs) 11 | print("3-means:") 12 | print(cluster.means) 13 | print() 14 | 15 | random.seed(0) 16 | cluster = KMeans(2) 17 | cluster.train(inputs=inputs) 18 | print("2-means:") 19 | print(cluster.means) 20 | print() 21 | 22 | # for k in range(1, len(inputs) + 1): 23 | # print(k, squared_clustering_errors(inputs, k)) 24 | # print() 25 | 26 | # recolor_image('/home/amogh/Pictures/symantec.png') 27 | 28 | print("bottom up hierarchical clustering") 29 | 30 | base_cluster = bottom_up_cluster(inputs) 31 | print(base_cluster) 32 | 33 | print() 34 | print("three clusters, min:") 35 | for cluster in generate_clusters(base_cluster, 3): 36 | 
print(get_values(cluster)) 37 | 38 | print() 39 | print("three clusters, max:") 40 | base_cluster = bottom_up_cluster(inputs, max) 41 | for cluster in generate_clusters(base_cluster, 3): 42 | print(get_values(cluster)) -------------------------------------------------------------------------------- /k_nearest_neighbors/model.py: -------------------------------------------------------------------------------- 1 | import random 2 | from k_nearest_neighbors.data import cities 3 | from k_nearest_neighbors.utils import knn_classify, random_distances 4 | from helpers.stats import mean 5 | 6 | if __name__ == "__main__": 7 | 8 | # try several different values for k 9 | for k in [1, 3, 5, 7]: 10 | num_correct = 0 11 | 12 | for location, actual_language in cities: 13 | 14 | other_cities = [other_city 15 | for other_city in cities 16 | if other_city != (location, actual_language)] 17 | 18 | predicted_language = knn_classify(k, other_cities, location) 19 | 20 | if predicted_language == actual_language: 21 | num_correct += 1 22 | 23 | print(k, "neighbor[s]:", num_correct, "correct out of", len(cities)) 24 | 25 | dimensions = range(1, 101, 5) 26 | 27 | avg_distances = [] 28 | min_distances = [] 29 | 30 | random.seed(0) 31 | for dim in dimensions: 32 | distances = random_distances(dim, 10000) # 10,000 random pairs 33 | avg_distances.append(mean(distances)) # track the average 34 | min_distances.append(min(distances)) # track the minimum 35 | print(dim, min(distances), mean(distances), min(distances) / mean(distances)) 36 | -------------------------------------------------------------------------------- /logistic_regression/utils.py: -------------------------------------------------------------------------------- 1 | import math 2 | from functools import reduce 3 | 4 | from helpers.linear_algebra import vector_add, dot 5 | 6 | 7 | def logistic(x): 8 | return 1.0 / (1 + math.exp(-x)) 9 | 10 | 11 | def logistic_prime(x): 12 | return logistic(x) * (1 - logistic(x)) 13 | 14 | 15 | def logistic_log_likelihood_i(x_i, y_i, beta): 16 | if y_i == 1: 17 | return math.log(logistic(dot(x_i, beta))) 18 | else: 19 | return math.log(1 - logistic(dot(x_i, beta))) 20 | 21 | 22 | def logistic_log_likelihood(x, y, beta): 23 | return sum(logistic_log_likelihood_i(x_i, y_i, beta) 24 | for x_i, y_i in zip(x, y)) 25 | 26 | 27 | def logistic_log_partial_ij(x_i, y_i, beta, j): 28 | """here i is the index of the data point, 29 | j the index of the derivative""" 30 | 31 | return (y_i - logistic(dot(x_i, beta))) * x_i[j] 32 | 33 | 34 | def logistic_log_gradient_i(x_i, y_i, beta): 35 | """the gradient of the log likelihood 36 | corresponding to the i-th data point""" 37 | 38 | return [logistic_log_partial_ij(x_i, y_i, beta, j) 39 | for j, _ in enumerate(beta)] 40 | 41 | 42 | def logistic_log_gradient(x, y, beta): 43 | return reduce(vector_add, 44 | [logistic_log_gradient_i(x_i, y_i, beta) 45 | for x_i, y_i in zip(x,y)]) -------------------------------------------------------------------------------- /simple_linear_regression/utils.py: -------------------------------------------------------------------------------- 1 | from helpers.stats import correlation, standard_deviation, mean, de_mean 2 | 3 | 4 | def predict(alpha, beta, x_i): 5 | return beta * x_i + alpha 6 | 7 | 8 | def error(alpha, beta, x_i, y_i): 9 | return y_i - predict(alpha, beta, x_i) 10 | 11 | 12 | def sum_of_squared_errors(alpha, beta, x, y): 13 | return sum(error(alpha, beta, x_i, y_i) ** 2 14 | for x_i, y_i in zip(x, y)) 15 | 16 | 17 | def least_squares_fit(x, y): 
18 | beta = correlation(x, y) * standard_deviation(y) / standard_deviation(x) 19 | alpha = mean(y) - beta * mean(x) 20 | return alpha, beta 21 | 22 | 23 | def total_sum_of_squares(y): 24 | """The total squared variation of y_i's from their mean""" 25 | return sum(v ** 2 for v in de_mean(y)) 26 | 27 | 28 | def r_squared(alpha, beta, x, y): 29 | """the fraction of variation in y captured by the model""" 30 | return 1 - sum_of_squared_errors(alpha, beta, x, y) / total_sum_of_squares(y) 31 | 32 | 33 | def squared_error(x_i, y_i, theta): 34 | alpha, beta = theta 35 | return error(alpha, beta, x_i, y_i) ** 2 36 | 37 | 38 | def squared_error_gradient(x_i, y_i, theta): 39 | alpha, beta = theta 40 | return [-2 * error(alpha, beta, x_i, y_i), # alpha partial derivative 41 | -2 * error(alpha, beta, x_i, y_i) * x_i] # beta partial derivative 42 | 43 | 44 | 45 | -------------------------------------------------------------------------------- /helpers/machine_learning.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | # 4 | # data splitting 5 | # 6 | 7 | 8 | def split_data(data, prob): 9 | """split data into fractions [prob, 1 - prob]""" 10 | results = [], [] 11 | for row in data: 12 | results[0 if random.random() < prob else 1].append(row) 13 | return results 14 | 15 | 16 | def train_test_split(x, y, test_pct): 17 | data = list(zip(x, y)) # pair corresponding values 18 | train, test = split_data(data, 1 - test_pct) # split the data-set of pairs 19 | x_train, y_train = list(zip(*train)) # magical un-zip trick 20 | x_test, y_test = list(zip(*test)) 21 | return x_train, x_test, y_train, y_test 22 | 23 | # 24 | # correctness 25 | # 26 | 27 | 28 | def accuracy(tp, fp, fn, tn): 29 | correct = tp + tn 30 | total = tp + fp + fn + tn 31 | return correct / total 32 | 33 | 34 | def precision(tp, fp): 35 | return tp / (tp + fp) 36 | 37 | 38 | def recall(tp, fn): 39 | return tp / (tp + fn) 40 | 41 | 42 | def f1_score(tp, fp, fn): 43 | p = precision(tp, fp) 44 | r = recall(tp, fn) 45 | 46 | return 2 * p * r / (p + r) 47 | 48 | 49 | if __name__ == "__main__": 50 | 51 | print("accuracy(70, 4930, 13930, 981070)", accuracy(70, 4930, 13930, 981070)) 52 | print("precision(70, 4930, 13930, 981070)", precision(70, 4930)) 53 | print("recall(70, 4930, 13930, 981070)", recall(70, 13930)) 54 | print("f1_score(70, 4930, 13930, 981070)", f1_score(70, 4930, 13930)) 55 | -------------------------------------------------------------------------------- /decision_trees/model.py: -------------------------------------------------------------------------------- 1 | from decision_trees.data import inputs 2 | from decision_trees.utils import partition_entropy_by, build_tree_id3, classify 3 | 4 | if __name__ == "__main__": 5 | 6 | for key in ['level', 'lang', 'tweets', 'phd']: 7 | print(key, partition_entropy_by(inputs, key)) 8 | print() 9 | 10 | senior_inputs = [(input, label) 11 | for input, label in inputs if input["level"] == "Senior"] 12 | 13 | for key in ['lang', 'tweets', 'phd']: 14 | print(key, partition_entropy_by(senior_inputs, key)) 15 | print() 16 | 17 | print("building the tree") 18 | tree = build_tree_id3(inputs) 19 | print(tree) 20 | 21 | print("Junior / Java / tweets / no phd", classify(tree, 22 | {"level": "Junior", 23 | "lang": "Java", 24 | "tweets": "yes", 25 | "phd": "no"})) 26 | 27 | print("Junior / Java / tweets / phd", classify(tree, 28 | {"level": "Junior", 29 | "lang": "Java", 30 | "tweets": "yes", 31 | "phd": "yes"})) 32 | 33 | print("Intern", 
classify(tree, {"level": "Intern"})) 34 | print("Senior", classify(tree, {"level": "Senior"})) 35 | -------------------------------------------------------------------------------- /natural_language_processing/model.py: -------------------------------------------------------------------------------- 1 | from natural_language_processing.data import grammar, documents 2 | from natural_language_processing.utils import generate_sentence, topic_word_counts, document_topic_counts 3 | 4 | if __name__ == '__main__': 5 | # plot_resumes() 6 | 7 | # document = get_document() 8 | 9 | # bigrams = zip(document, document[1:]) # gives us precisely the pairs of consecutive elements of document 10 | # bigrams_transitions = defaultdict(list) 11 | # for prev, current in bigrams: 12 | # bigrams_transitions[prev].append(current) 13 | 14 | # trigrams = zip(document, document[1:], document[2:]) 15 | # trigrams_transitions = defaultdict(list) 16 | # starts = [] 17 | # 18 | # for prev, current, next in trigrams: 19 | # if prev == ".": # if previous word is a period 20 | # starts.append(current) # then this is start word 21 | # 22 | # trigrams_transitions[(prev, current)].append(next) 23 | # 24 | # print(generate_using_trigrams(starts, trigrams_transitions)) 25 | 26 | # print(generate_sentence(grammar=grammar)) 27 | 28 | for k, word_counts in enumerate(topic_word_counts): 29 | for word, count in word_counts.most_common(): 30 | if count > 0: 31 | print(k, word, count) 32 | 33 | topic_names = ["Big Data and programming languages", 34 | "databases", 35 | "machine learning", 36 | "statistics"] 37 | 38 | for document, topic_counts in zip(documents, document_topic_counts): 39 | for topic, count in topic_counts.most_common(): 40 | if count > 0: 41 | print(topic_names[topic], count) 42 | print() 43 | 44 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /recommender_systems/model.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | 3 | from recommender_systems.data import users_interests 4 | from recommender_systems.utils import make_user_interest_vector, cosine_similarity, most_similar_users_to, \ 5 | user_based_suggestions, most_similar_interests_to, item_based_suggestions 6 | 7 | if __name__ == '__main__': 8 | unique_interests = sorted(list({interest 9 | for user_interests in users_interests 10 | for interest in user_interests})) 11 | 12 | print("unique interests") 13 | print(unique_interests) 14 | 15 | user_interest_matrix = map(partial(make_user_interest_vector, unique_interests), users_interests) 16 | 17 | user_similarities = [[cosine_similarity(interest_vector_i, interest_vector_j) 18 | for interest_vector_j in user_interest_matrix] 19 | for interest_vector_i in user_interest_matrix] 20 | 21 | print(most_similar_users_to(user_similarities, 0)) 22 | 23 | print(user_based_suggestions(user_similarities, users_interests, 0)) 24 | 25 | # item-based 26 | interest_user_matrix = [[user_interest_vector[j] 27 | for user_interest_vector in user_interest_matrix] 28 | for j, _ in enumerate(unique_interests)] 29 | 30 | interest_similarities = [[cosine_similarity(user_vector_i, user_vector_j) 31 | for user_vector_j in interest_user_matrix] 32 | for user_vector_i in interest_user_matrix] 33 | 34 | print(most_similar_interests_to(interest_similarities, 0, unique_interests)) 35 | 36 | print(item_based_suggestions(interest_similarities, users_interests, user_interest_matrix, unique_interests, 0)) -------------------------------------------------------------------------------- /natural_language_processing/data.py: -------------------------------------------------------------------------------- 1 | data = [("big data", 100, 15), ("Hadoop", 95, 25), ("Python", 75, 50), 2 | ("R", 50, 40), ("machine learning", 80, 20), ("statistics", 20, 60), 3 | ("data science", 60, 70), ("analytics", 90, 3), 4 | ("team player", 85, 85), ("dynamic", 2, 90), ("synergies", 70, 0), 5 | ("actionable insights", 40, 30), ("think out of the box", 45, 10), 6 | ("self-starter", 30, 50), ("customer focus", 65, 15), 7 | ("thought leadership", 35, 35)] 8 | 9 | grammar = { 10 | "_S": ["_NP _VP"], 11 | "_NP": ["_N", "_A 
_NP _P _A _N"], 12 | "_VP": ["_V", "_V _NP"], 13 | "_N": ["data science", "Python", "regression"], 14 | "_A": ["big", "linear", "logistic"], 15 | "_P": ["about", "near"], 16 | "_V": ["learns", "trains", "tests", "is"] 17 | } 18 | 19 | documents = [ 20 | ["Hadoop", "Big Data", "HBase", "Java", "Spark", "Storm", "Cassandra"], 21 | ["NoSQL", "MongoDB", "Cassandra", "HBase", "Postgres"], 22 | ["Python", "scikit-learn", "scipy", "numpy", "statsmodels", "pandas"], 23 | ["R", "Python", "statistics", "regression", "probability"], 24 | ["machine learning", "regression", "decision trees", "libsvm"], 25 | ["Python", "R", "Java", "C++", "Haskell", "programming languages"], 26 | ["statistics", "probability", "mathematics", "theory"], 27 | ["machine learning", "scikit-learn", "Mahout", "neural networks"], 28 | ["neural networks", "deep learning", "Big Data", "artificial intelligence"], 29 | ["Hadoop", "Java", "MapReduce", "Big Data"], 30 | ["statistics", "R", "statsmodels"], 31 | ["C++", "deep learning", "artificial intelligence", "probability"], 32 | ["pandas", "R", "Python"], 33 | ["databases", "HBase", "Postgres", "MySQL", "MongoDB"], 34 | ["libsvm", "regression", "support vector machines"] 35 | ] 36 | -------------------------------------------------------------------------------- /simple_linear_regression/data.py: -------------------------------------------------------------------------------- 1 | num_friends_good = [49, 41, 40, 25, 21, 21, 19, 19, 18, 18, 16, 15, 15, 15, 15, 14, 14, 13, 13, 13, 13, 12, 12, 11, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] 2 | 3 | daily_minutes_good = [68.77, 51.25, 52.08, 38.36, 44.54, 57.13, 51.4, 41.42, 31.22, 34.76, 54.01, 38.79, 47.59, 49.1, 27.66, 41.03, 36.73, 48.65, 28.12, 46.62, 35.57, 32.98, 35, 26.07, 23.77, 39.73, 40.57, 31.65, 31.21, 36.32, 20.45, 21.93, 26.02, 27.34, 23.49, 46.94, 30.5, 33.8, 24.23, 21.4, 27.94, 32.24, 40.57, 25.07, 19.42, 22.39, 18.42, 46.96, 23.72, 26.41, 26.97, 36.76, 40.32, 35.02, 29.47, 30.2, 31, 38.11, 38.18, 36.31, 21.03, 30.86, 36.07, 28.66, 29.08, 37.28, 15.28, 24.17, 22.31, 30.17, 25.53, 19.85, 35.37, 44.6, 17.23, 13.47, 26.33, 35.02, 32.09, 24.81, 19.33, 28.77, 24.26, 31.98, 25.73, 24.86, 16.28, 34.51, 15.23, 39.72, 40.8, 26.06, 35.76, 34.76, 16.13, 44.04, 18.03, 19.65, 32.62, 35.59, 39.43, 14.18, 35.24, 40.13, 41.82, 35.45, 36.07, 43.67, 24.61, 20.9, 21.9, 18.79, 27.61, 27.21, 26.61, 29.77, 20.59, 27.53, 13.82, 33.2, 25, 33.1, 36.65, 18.63, 14.87, 22.2, 36.81, 25.53, 24.62, 26.25, 18.21, 28.08, 19.42, 29.79, 32.8, 35.99, 28.32, 27.79, 35.88, 29.06, 36.28, 14.1, 36.63, 37.49, 26.9, 18.58, 38.48, 24.48, 18.95, 33.55, 14.24, 29.04, 32.51, 25.63, 22.22, 19, 32.73, 15.16, 13.9, 27.2, 32.01, 29.27, 33, 13.74, 20.42, 27.32, 18.23, 35.35, 28.48, 9.08, 24.62, 20.12, 35.26, 19.92, 31.02, 16.49, 12.16, 30.7, 31.22, 34.65, 13.13, 27.51, 33.2, 31.57, 14.1, 33.42, 17.44, 10.12, 24.42, 9.82, 23.39, 30.93, 15.03, 21.67, 31.09, 33.29, 22.61, 26.89, 23.48, 8.38, 27.81, 32.35, 23.84] 4 | 
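5 | 
6 | if __name__ == "__main__":
7 |     # Hand-checkable sketch of the closed-form fit that
8 |     # simple_linear_regression/utils.py (least_squares_fit) applies to the
9 |     # lists above:
10 |     #     beta  = correlation(x, y) * stdev(y) / stdev(x)
11 |     #     alpha = mean(y) - beta * mean(x)
12 |     # The five (x, y) points below are made up for illustration; they satisfy
13 |     # y = 2 * x + 1 exactly, so the fit should recover beta == 2 and alpha == 1.
14 |     from statistics import mean, stdev
15 | 
16 |     x = [1, 2, 3, 4, 5]
17 |     y = [3, 5, 7, 9, 11]
18 | 
19 |     def correlation(xs, ys):
20 |         # sample covariance divided by the product of sample standard deviations
21 |         mx, my = mean(xs), mean(ys)
22 |         cov = sum((a - mx) * (b - my) for a, b in zip(xs, ys)) / (len(xs) - 1)
23 |         return cov / (stdev(xs) * stdev(ys))
24 | 
25 |     beta = correlation(x, y) * stdev(y) / stdev(x)
26 |     alpha = mean(y) - beta * mean(x)
27 |     print("alpha", alpha, "beta", beta)  # -> alpha 1.0 beta 2.0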
-------------------------------------------------------------------------------- /naive_bayes_classfier/utils.py: -------------------------------------------------------------------------------- 1 | import math 2 | import re 3 | from collections import defaultdict 4 | 5 | 6 | def tokenise(message): 7 | """Tokenise message into distinct words""" 8 | message = message.lower() # convert to lowercase 9 | all_words = re.findall("[a-z0-9']+", message) # extract the words 10 | return set(all_words) # remove duplicates 11 | 12 | 13 | def count_words(training_set): 14 | """training set consists of parts (meesage, is_spam)""" 15 | counts = defaultdict(lambda: [0, 0]) 16 | for message, is_spam in training_set: 17 | for word in tokenise(message): 18 | counts[word][0 if is_spam else 1] += 1 19 | return counts 20 | 21 | 22 | def word_probabilities(counts, total_spams, total_non_spams, k=0.5): 23 | """Turn the word_counts into a list of triplets: w, p(w|spam) and p(w|~spam)""" 24 | return [(w, 25 | (spam + k)/(total_spams + 2 * k), 26 | (non_spam + k)/(total_non_spams + 2 * k)) 27 | for w, (spam, non_spam) in counts.items()] 28 | 29 | 30 | def spam_probability(word_probs, message): 31 | """assigns word probabilities to messages""" 32 | message_words = tokenise(message) 33 | log_prob_if_spam = log_prob_if_not_spam = 0.0 34 | 35 | # iterate through each word in our vocabulary 36 | for word, prob_if_spam, prob_if_not_spam in word_probs: 37 | 38 | # if "word" appears in the message, 39 | # add the log probability of seeing it 40 | if word in message_words: 41 | log_prob_if_spam += math.log(prob_if_spam) 42 | log_prob_if_not_spam += math.log(prob_if_not_spam) 43 | 44 | # if the "word" doesn't appear in the message 45 | # add the log probability of not seeing it 46 | else: 47 | log_prob_if_spam += math.log(1.0 - prob_if_spam) 48 | log_prob_if_not_spam += math.log(1.0 - prob_if_not_spam) 49 | 50 | prob_if_spam = math.exp(log_prob_if_spam) 51 | prob_if_not_spam = math.exp(log_prob_if_not_spam) 52 | 53 | return prob_if_spam / (prob_if_spam + prob_if_not_spam) 54 | 55 | -------------------------------------------------------------------------------- /NN_churn_prediction.py: -------------------------------------------------------------------------------- 1 | """Importing the libraries""" 2 | from keras.models import Sequential 3 | from keras.layers import Dense 4 | import pandas as pd 5 | from sklearn.metrics import confusion_matrix, classification_report, accuracy_score 6 | from sklearn.model_selection import train_test_split 7 | from sklearn.preprocessing import LabelEncoder, StandardScaler 8 | 9 | """Loading the data""" 10 | dataset = pd.read_csv("/home/amogh/Downloads/Churn_Modelling.csv") 11 | 12 | # filtering features and labels 13 | X = dataset.iloc[:, 3:13].values 14 | y = dataset.iloc[:, 13].values 15 | 16 | """Preprocessing the data""" 17 | # encoding the Gender and Geography 18 | labelencoder_X_1 = LabelEncoder() 19 | X[:, 1] = labelencoder_X_1.fit_transform(X[:, 1]) # Column 4 [France, Germany, Spain] => [0, 1, 2] 20 | labelencoder_X_2 = LabelEncoder() 21 | X[:, 2] = labelencoder_X_2.fit_transform(X[:, 2]) # Column 5 [Male, Female] => [0, 1] 22 | 23 | # splitting the data into training and testing 24 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20) 25 | 26 | # scaling features 27 | sc = StandardScaler() 28 | X_train = sc.fit_transform(X_train) 29 | X_test = sc.fit_transform(X_test) 30 | 31 | """Building the neural network""" 32 | # initializing the neural network 33 | model = 
Sequential()
34 | # input and first hidden layer
35 | model.add(Dense(6, input_dim=10, activation='relu'))
36 | # second hidden layer
37 | model.add(Dense(6, activation='relu'))
38 | # output layer - probability of churning
39 | model.add(Dense(1, activation='sigmoid'))
40 | # compiling the model
41 | model.compile(optimizer='adam',
42 |               loss='binary_crossentropy',
43 |               metrics=['accuracy'])
44 | 
45 | """Running the model on the data"""
46 | # fitting the model
47 | model.fit(X_train, y_train, batch_size=10, epochs=100)
48 | 
49 | y_pred = model.predict(X_test)
50 | y_pred = (y_pred > 0.5)  # converting probabilities into binary
51 | 
52 | """Evaluating the results"""
53 | # generating the confusion matrix
54 | cm = confusion_matrix(y_test, y_pred)
55 | print(cm)
56 | 
57 | # determining the accuracy
58 | accuracy = accuracy_score(y_test, y_pred)
59 | print(accuracy)
60 | 
61 | # generating the classification report
62 | cr = classification_report(y_test, y_pred)
63 | print(cr)
64 | 
--------------------------------------------------------------------------------
/k_nearest_neighbors/Understanding the algorithm.md:
--------------------------------------------------------------------------------
1 | ### Introduction
2 | 
3 | K-nearest neighbors (kNN) is a supervised machine learning algorithm.
4 | 
5 | ### Problem Statement
6 | 
7 | Given some labelled data points, we have to classify a new data point according to its nearest neighbors.
8 | 
9 | **Example used here**
10 | 
11 | We have the data for a large social networking company which ran polls for their favorite programming language. The users come from a group of large cities. Now the VP of Community Engagement wants you to `predict the` **favorite programming language** `for the places that were` **not** `part of the survey`.
12 | 
13 | ### Intuition
14 | 
15 | * In kNN, k is the number of neighbors you will evaluate to decide which group a new data point will belong to.
16 | * The value of k is decided by plotting the error rate against different values of k.
17 | * Once the value of k is initialized, we take the k nearest neighbors of the data point.
18 | * The distance between the data points can be calculated using either `Euclidean Distance` or `Manhattan Distance`.
19 | * Once we calculate the distances of all the k nearest neighbors, we then look for the majority label among those neighbors.
20 | * The data point is assigned to the group which has the maximum number of neighbors.
21 | 
22 | ### Choosing K value
23 | * First divide the entire data set into a training set and a test set.
24 | * Apply the kNN algorithm to the training set and cross-validate it with the test set.
25 | * Let's assume you have a training set `xtrain` and a test set `xtest`.
26 | * Now create the model with `k` value `1` and predict on the test set data.
27 | * Check the accuracy and the other metrics, then repeat the same process after increasing the k value by 1 each time.
28 | 
29 | 
30 | Here I am increasing the k value by 1 from `1 to 29` and printing the accuracy for each respective `k` value.
31 | ![Code](https://qphs.fs.quoracdn.net/main-qimg-9e8fedc07dafba2106eb11f0bfd4ba7d.webp)
32 | 
33 | ### Note
34 | 
35 | * kNN is impacted by `imbalanced datasets`.
36 |   Suppose there are `m` instances of **class 1** and `n` instances of **class 2**, where `n << m`.
37 | In a case where `k > n`, then this may lead to counting of more instances of m and 38 | hence it will impact the majority election in k nearest neigbors 39 | 40 | * kNN is also very sensitve to `outliers` 41 | -------------------------------------------------------------------------------- /telecom_churn_prediction.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from IPython.display import display 4 | from sklearn.ensemble import RandomForestClassifier 5 | from sklearn.metrics import confusion_matrix, roc_curve 6 | from sklearn.model_selection import train_test_split 7 | import matplotlib.pyplot as plt 8 | 9 | df = pd.read_csv("/home/amogh/Downloads/Churn.csv") 10 | display(df.head(5)) 11 | 12 | """Data Exploration and Cleaning""" 13 | # print("Number of rows: ", df.shape[0]) 14 | # counts = df.describe().iloc[0] 15 | # display(pd.DataFrame(counts.tolist(), columns=["Count of values"], index=counts.index.values).transpose) 16 | 17 | """Feature Selection""" 18 | df = df.drop(["Phone", "Area Code", "State"], axis=1) 19 | features = df.drop(["Churn"], axis=1).columns 20 | 21 | """Fitting the model""" 22 | df_train, df_test = train_test_split(df, test_size=0.25) 23 | clf = RandomForestClassifier() 24 | clf.fit(df_train[features], df_train["Churn"]) 25 | 26 | # Make predictions 27 | preds = clf.predict(df_test[features]) 28 | probs = clf.predict_proba(df_test[features]) 29 | display(preds) 30 | 31 | """Evaluating the model""" 32 | score = clf.score(df_test[features], df_test["Churn"]) 33 | print("Accuracy: ", score) 34 | 35 | cf = pd.DataFrame(confusion_matrix(df_test["Churn"], preds), columns=["Predicted False", "Predicted True"], index=["Actual False", "Actual True"]) 36 | 37 | display(cf) 38 | 39 | # Plotting the ROC curve 40 | 41 | fpr, tpr, threshold = roc_curve(df_test["Churn"], probs[:, 1]) 42 | plt.title('Receiver Operating Characteristic') 43 | plt.plot(fpr, tpr, 'b') 44 | plt.plot([0, 1], [0, 1],'r--') 45 | plt.xlim([0, 1]) 46 | plt.ylim([0, 1]) 47 | plt.ylabel('True Positive Rate') 48 | plt.xlabel('False Positive Rate') 49 | plt.show() 50 | 51 | # Feature Importance Plot 52 | fig = plt.figure(figsize=(20, 18)) 53 | ax = fig.add_subplot(111) 54 | 55 | df_f = pd.DataFrame(clf.feature_importances_, columns=["importance"]) 56 | df_f["labels"] = features 57 | df_f.sort_values("importance", inplace=True, ascending=False) 58 | display(df_f.head(5)) 59 | 60 | index = np.arange(len(clf.feature_importances_)) 61 | bar_width = 0.5 62 | rects = plt.barh(index, df_f["importance"], bar_width, alpha=0.4, color='b', label='Main') 63 | plt.yticks(index, df_f["labels"]) 64 | plt.show() 65 | 66 | df_test["prob_true"] = probs[:, 1] 67 | df_risky = df_test[df_test["prob_true"] > 0.9] 68 | display(df_risky.head(5)[["prob_true"]]) 69 | -------------------------------------------------------------------------------- /multiple_regression/model.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | from helpers.linear_algebra import dot 4 | from helpers.stats import median, standard_deviation 5 | from multiple_regression.data import x, daily_minutes_good 6 | from multiple_regression.utils import estimate_beta, multiple_r_squared, bootstrap_statistic, estimate_sample_beta, \ 7 | p_value, estimate_beta_ridge 8 | 9 | if __name__ == '__main__': 10 | random.seed(0) 11 | beta = estimate_beta(x, daily_minutes_good) # [30.63, 0.972, -1.868, 0.911] 12 | print("beta", 
beta) 13 | print("r-squared", multiple_r_squared(x, daily_minutes_good, beta)) 14 | print() 15 | 16 | print("digression: the bootstrap") 17 | # 101 points all very close to 100 18 | close_to_100 = [99.5 + random.random() for _ in range(101)] 19 | 20 | # 101 points, 50 of them near 0, 50 of them near 200 21 | far_from_100 = ([99.5 + random.random()] + 22 | [random.random() for _ in range(50)] + 23 | [200 + random.random() for _ in range(50)]) 24 | 25 | print("bootstrap_statistic(close_to_100, median, 100):") 26 | print(bootstrap_statistic(close_to_100, median, 100)) 27 | print("bootstrap_statistic(far_from_100, median, 100):") 28 | print(bootstrap_statistic(far_from_100, median, 100)) 29 | print() 30 | 31 | random.seed(0) # so that you get the same results as me 32 | 33 | bootstrap_betas = bootstrap_statistic(list(zip(x, daily_minutes_good)), 34 | estimate_sample_beta, 35 | 100) 36 | 37 | bootstrap_standard_errors = [ 38 | standard_deviation([beta[i] for beta in bootstrap_betas]) 39 | for i in range(4)] 40 | 41 | print("bootstrap standard errors", bootstrap_standard_errors) 42 | print() 43 | 44 | print("p_value(30.63, 1.174)", p_value(30.63, 1.174)) 45 | print("p_value(0.972, 0.079)", p_value(0.972, 0.079)) 46 | print("p_value(-1.868, 0.131)", p_value(-1.868, 0.131)) 47 | print("p_value(0.911, 0.990)", p_value(0.911, 0.990)) 48 | print() 49 | 50 | print("regularization") 51 | 52 | random.seed(0) 53 | for alpha in [0.0, 0.01, 0.1, 1, 10]: 54 | beta = estimate_beta_ridge(x, daily_minutes_good, alpha=alpha) 55 | print("alpha", alpha) 56 | print("beta", beta) 57 | print("dot(beta[1:],beta[1:])", dot(beta[1:], beta[1:])) 58 | print("r-squared", multiple_r_squared(x, daily_minutes_good, beta)) 59 | print() -------------------------------------------------------------------------------- /naive_bayes_classfier/model.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import re 3 | from collections import Counter 4 | import random 5 | from naive_bayes_classfier.naivebayesclassifier import NaiveBayesClassifier 6 | 7 | 8 | def split_data(data, prob): 9 | """split data into fractions [prob, 1 - prob]""" 10 | results = [], [] 11 | for row in data: 12 | results[0 if random.random() < prob else 1].append(row) 13 | return results 14 | 15 | 16 | def get_subject_data(path): 17 | 18 | data = [] 19 | 20 | # regex for stripping out the leading "Subject:" and any spaces after it 21 | subject_regex = re.compile(r"^Subject:\s+") 22 | 23 | # glob.glob returns every filename that matches the wildcarded path 24 | for fn in glob.glob(path): 25 | is_spam = "ham" not in fn 26 | 27 | with open(fn, 'r', encoding='ISO-8859-1') as file: 28 | for line in file: 29 | if line.startswith("Subject:"): 30 | subject = subject_regex.sub("", line).strip() 31 | data.append((subject, is_spam)) 32 | 33 | return data 34 | 35 | 36 | def p_spam_given_word(word_prob): 37 | word, prob_if_spam, prob_if_not_spam = word_prob 38 | return prob_if_spam / (prob_if_spam + prob_if_not_spam) 39 | 40 | 41 | def train_and_test_model(path): 42 | 43 | data = get_subject_data(path) 44 | random.seed(0) # just so you get the same answers as me 45 | train_data, test_data = split_data(data, 0.75) 46 | 47 | classifier = NaiveBayesClassifier() 48 | classifier.train(train_data) 49 | 50 | classified = [(subject, is_spam, classifier.classify(subject)) 51 | for subject, is_spam in test_data] 52 | 53 | counts = Counter((is_spam, spam_probability > 0.5) # (actual, predicted) 54 | for _, is_spam, spam_probability in 
classified) 55 | 56 | print(counts) 57 | 58 | classified.sort(key=lambda row: row[2]) 59 | spammiest_hams = list(filter(lambda row: not row[1], classified))[-5:] 60 | hammiest_spams = list(filter(lambda row: row[1], classified))[:5] 61 | 62 | print("\nspammiest_hams", spammiest_hams) 63 | print("\nhammiest_spams", hammiest_spams) 64 | 65 | words = sorted(classifier.word_probs, key=p_spam_given_word) 66 | 67 | spammiest_words = words[-5:] 68 | hammiest_words = words[:5] 69 | 70 | print("\nspammiest_words", spammiest_words) 71 | print("\nhammiest_words", hammiest_words) 72 | 73 | 74 | if __name__ == "__main__": 75 | train_and_test_model(r"data/*/*") 76 | -------------------------------------------------------------------------------- /logistic_regression/model.py: -------------------------------------------------------------------------------- 1 | import random 2 | from functools import partial 3 | 4 | from helpers.gradient_descent import maximize_batch, maximize_stochastic 5 | from helpers.linear_algebra import dot 6 | from helpers.machine_learning import train_test_split 7 | from logistic_regression.data import data 8 | from logistic_regression.utils import logistic_log_likelihood, logistic_log_gradient, logistic_log_likelihood_i, \ 9 | logistic_log_gradient_i, logistic 10 | from multiple_regression.utils import estimate_beta 11 | from working_with_data.utils import rescale 12 | 13 | if __name__ == '__main__': 14 | x = [[1] + row[:2] for row in data] # each element is [1, experience, salary] 15 | y = [row[2] for row in data] # each element is paid_account 16 | 17 | print("linear regression:") 18 | 19 | rescaled_x = rescale(x) 20 | beta = estimate_beta(rescaled_x, y) 21 | print(beta) 22 | 23 | print("logistic regression:") 24 | 25 | random.seed(0) 26 | x_train, x_test, y_train, y_test = train_test_split(rescaled_x, y, 0.33) 27 | 28 | # want to maximize log likelihood on the training data 29 | fn = partial(logistic_log_likelihood, x_train, y_train) 30 | gradient_fn = partial(logistic_log_gradient, x_train, y_train) 31 | 32 | # pick a random starting point 33 | beta_0 = [1, 1, 1] 34 | 35 | # and maximize using gradient descent 36 | beta_hat = maximize_batch(fn, gradient_fn, beta_0) 37 | 38 | print("beta_batch", beta_hat) 39 | 40 | beta_0 = [1, 1, 1] 41 | beta_hat = maximize_stochastic(logistic_log_likelihood_i, 42 | logistic_log_gradient_i, 43 | x_train, y_train, beta_0) 44 | 45 | print("beta stochastic", beta_hat) 46 | 47 | true_positives = false_positives = true_negatives = false_negatives = 0 48 | 49 | for x_i, y_i in zip(x_test, y_test): 50 | predict = logistic(dot(beta_hat, x_i)) 51 | 52 | if y_i == 1 and predict >= 0.5: # TP: paid and we predict paid 53 | true_positives += 1 54 | elif y_i == 1: # FN: paid and we predict unpaid 55 | false_negatives += 1 56 | elif predict >= 0.5: # FP: unpaid and we predict paid 57 | false_positives += 1 58 | else: # TN: unpaid and we predict unpaid 59 | true_negatives += 1 60 | 61 | precision = true_positives / (true_positives + false_positives) 62 | recall = true_positives / (true_positives + false_negatives) 63 | 64 | print("precision", precision) 65 | print("recall", recall) -------------------------------------------------------------------------------- /neural_network/model.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | from neural_network.data import raw_digits 4 | from neural_network.utils import backpropagate, feed_forward 5 | 6 | if __name__ == "__main__": 7 | 8 | def 
make_digit(raw_digit): 9 | return [1 if c == '1' else 0 10 | for row in raw_digit.split("\n") 11 | for c in row.strip()] 12 | 13 | 14 | inputs = list(map(make_digit, raw_digits)) 15 | 16 | targets = [[1 if i == j else 0 for i in range(10)] 17 | for j in range(10)] 18 | 19 | random.seed(0) # to get repeatable results 20 | input_size = 25 # each input is a vector of length 25 21 | num_hidden = 5 # we'll have 5 neurons in the hidden layer 22 | output_size = 10 # we need 10 outputs for each input 23 | 24 | # each hidden neuron has one weight per input, plus a bias weight 25 | hidden_layer = [[random.random() for __ in range(input_size + 1)] 26 | for __ in range(num_hidden)] 27 | 28 | # each output neuron has one weight per hidden neuron, plus a bias weight 29 | output_layer = [[random.random() for __ in range(num_hidden + 1)] 30 | for __ in range(output_size)] 31 | 32 | # the network starts out with random weights 33 | network = [hidden_layer, output_layer] 34 | 35 | # 10,000 iterations seems enough to converge 36 | for __ in range(10000): 37 | for input_vector, target_vector in zip(inputs, targets): 38 | backpropagate(network, input_vector, target_vector) 39 | 40 | 41 | def predict(input): 42 | return feed_forward(network, input)[-1] 43 | 44 | 45 | for i, input in enumerate(inputs): 46 | outputs = predict(input) 47 | print(i, [round(p, 2) for p in outputs]) 48 | 49 | print(""".@@@. 50 | ...@@ 51 | ..@@. 52 | ...@@ 53 | .@@@.""") 54 | 55 | print([round(x, 2) for x in 56 | predict([0, 1, 1, 1, 0, # .@@@. 57 | 0, 0, 0, 1, 1, # ...@@ 58 | 0, 0, 1, 1, 0, # ..@@. 59 | 0, 0, 0, 1, 1, # ...@@ 60 | 0, 1, 1, 1, 0])]) # .@@@. 61 | print() 62 | 63 | print(""".@@@. 64 | @..@@ 65 | .@@@. 66 | @..@@ 67 | .@@@.""") 68 | 69 | print([round(x, 2) for x in 70 | predict([0, 1, 1, 1, 0, # .@@@. 71 | 1, 0, 0, 1, 1, # @..@@ 72 | 0, 1, 1, 1, 0, # .@@@. 73 | 1, 0, 0, 1, 1, # @..@@ 74 | 0, 1, 1, 1, 0])]) # .@@@. 
75 | print() 76 | -------------------------------------------------------------------------------- /recommender_systems/utils.py: -------------------------------------------------------------------------------- 1 | import math 2 | from collections import defaultdict 3 | 4 | from helpers.linear_algebra import dot 5 | 6 | 7 | def cosine_similarity(v, w): 8 | return dot(v, w) / math.sqrt(dot(v, v) * dot(w, w)) 9 | 10 | 11 | def make_user_interest_vector(interests, user_interests): 12 | return [1 if interest in user_interests else 0 13 | for interest in interests] 14 | 15 | 16 | def most_similar_users_to(user_similarities, user_id): 17 | pairs = [(other_user_id, similarity) 18 | for other_user_id, similarity in 19 | enumerate(user_similarities[user_id]) 20 | if user_id != other_user_id and similarity > 0] 21 | 22 | return sorted(pairs, key=lambda pair: pair[1], reverse=True) 23 | 24 | 25 | def most_similar_interests_to(interest_similarities, interest_id, unique_interests): 26 | pairs = [(unique_interests[other_interest_id], similarity) 27 | for other_interest_id, similarity in 28 | enumerate(interest_similarities[interest_id]) 29 | if interest_id != other_interest_id and similarity > 0] 30 | 31 | return sorted(pairs, key=lambda pair: pair[1], reverse=True) 32 | 33 | 34 | def user_based_suggestions(user_similarities, users_interests, user_id, include_current_interests=False): 35 | suggestions = defaultdict(float) 36 | for other_user_id, similarity in most_similar_users_to(user_similarities, user_id): 37 | for interest in users_interests[other_user_id]: 38 | suggestions[interest] += similarity 39 | 40 | suggestions = sorted(suggestions.items(), key=lambda pair: pair[1], reverse=True) 41 | 42 | if include_current_interests: 43 | return suggestions 44 | else: 45 | return [(suggestion, weight) 46 | for suggestion, weight in suggestions 47 | if suggestion not in users_interests[user_id]] 48 | 49 | 50 | def item_based_suggestions(interest_similarities, users_interests, user_interest_matrix, unique_interests, user_id, include_current_interests=False): 51 | suggestions = defaultdict(float) 52 | for interest_id, is_interested in enumerate(user_interest_matrix[user_id]): 53 | if is_interested == 1: 54 | for interest, similarity in most_similar_interests_to(interest_similarities, interest_id, unique_interests): 55 | suggestions[interest] += similarity 56 | 57 | suggestions = sorted(suggestions.items(), key=lambda pair: pair[1], reverse=True) 58 | 59 | if include_current_interests: 60 | return suggestions 61 | else: 62 | return [(suggestion, weight) 63 | for suggestion, weight in suggestions 64 | if suggestion not in users_interests[user_id]] 65 | 66 | -------------------------------------------------------------------------------- /logistic_regression_banking/binary_logisitic_regression.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import pandas as pd 3 | import seaborn as sns 4 | from sklearn.decomposition import PCA 5 | from sklearn.linear_model import LogisticRegression 6 | from sklearn.metrics import confusion_matrix, classification_report 7 | from sklearn.model_selection import train_test_split 8 | 9 | plt.rc("font", size=14) 10 | sns.set(style="white") 11 | sns.set(style="whitegrid", color_codes=True) 12 | 13 | if __name__ == '__main__': 14 | 15 | data = pd.read_csv('banking.csv', header=0) 16 | data = data.dropna() 17 | print(data.shape) 18 | print(list(data.columns)) 19 | 20 | # plot_data(data) 21 | 22 | # The 
prediction will be based on the variables selected in plot_data(), all other varaible are dropped 23 | 24 | data.drop(data.columns[[0, 3, 7, 8, 9, 10, 11, 12, 13, 15, 16, 17, 18, 19]], axis=1, inplace=True) 25 | 26 | # print(data.shape) 27 | # print(list(data.columns)) 28 | 29 | # Data preprocessing 30 | 31 | """dummy varaiable are variables with only two values: one or zero.""" 32 | 33 | data2 = pd.get_dummies(data, columns=['job', 'marital', 'default', 'housing', 'loan', 'poutcome']) 34 | 35 | # drop the unknown columns 36 | data2.drop(data2.columns[[12, 16, 18, 21, 24]], axis=1, inplace=True) 37 | 38 | print(data2.columns) 39 | 40 | # plot the correlation between variables 41 | # sns.heatmap(data2.corr()) 42 | # plt.show() 43 | 44 | # split the data into training and test sets 45 | X = data2.iloc[:, 1:] 46 | y = data2.iloc[:, 0] 47 | 48 | X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) 49 | 50 | print(X_train.shape) 51 | 52 | # Logistic Regression Model 53 | clf = LogisticRegression(random_state=0) 54 | clf.fit(X_train, y_train) 55 | 56 | # predicting the test results and confusion matrix 57 | y_pred = clf.predict(X_test) 58 | confusion_matrix = confusion_matrix(y_test, y_pred) 59 | print(confusion_matrix) 60 | 61 | print('Accuracy: {:.2f}'.format(clf.score(X_test, y_test))) 62 | 63 | print(classification_report(y_test, y_pred)) 64 | 65 | pca = PCA(n_components=2).fit_transform(X) 66 | X_train, X_test, y_train, y_test = train_test_split(pca, y, random_state=0) 67 | 68 | plt.figure(dpi=120) 69 | plt.scatter(pca[y.values == 0, 0], pca[y.values == 0, 1], alpha=0.5, label='YES', s=2, color='navy') 70 | plt.scatter(pca[y.values == 1, 0], pca[y.values == 1, 1], alpha=0.5, label='NO', s=2, color='darkorange') 71 | plt.legend() 72 | plt.title('Bank Marketing Data Set\nFirst Two Principal Components') 73 | plt.xlabel('PC1') 74 | plt.ylabel('PC2') 75 | plt.gca().set_aspect('equal') 76 | plt.show() 77 | 78 | 79 | 80 | -------------------------------------------------------------------------------- /Anamoly_Detection_notes.md: -------------------------------------------------------------------------------- 1 | Inspired from the following [blog post](https://iwringer.wordpress.com/2015/11/17/anomaly-detection-concepts-and-techniques/): 2 | Kudos to [Srinath Perera](https://www.linkedin.com/in/srinathperera) for writing this 👍 3 | 4 | ## Anomaly Detection 5 | 6 | ![Image](https://iwringer.files.wordpress.com/2015/11/anomelydetectionmethods.jpg?w=656) 7 | 8 | Four common classes of machine learning applications: 9 | 10 | a. classification
11 | b. predicting the next value [also known as regression]
12 | c. anomaly detection
13 | d. discovering data structure (a minimal sketch of all four classes follows below)
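To make these four classes concrete, here is a minimal, illustrative sketch using scikit-learn (already a dependency of this repo). The toy data, the particular estimators and their parameters are assumptions made for this example, not something prescribed by these notes.

```python
import numpy as np
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import IsolationForest
from sklearn.cluster import KMeans

rng = np.random.RandomState(0)
X = rng.normal(size=(100, 2))                      # toy feature matrix

# a. classification: predict a discrete label
y_class = (X[:, 0] + X[:, 1] > 0).astype(int)
clf = LogisticRegression().fit(X, y_class)

# b. predicting the next value (regression): predict a continuous target
y_reg = 3 * X[:, 0] + rng.normal(scale=0.1, size=100)
reg = LinearRegression().fit(X, y_reg)

# c. anomaly detection: flag points that do not fit the rest of the data
iso = IsolationForest(random_state=0).fit(X)
flags = iso.predict(X)                             # -1 = anomaly, +1 = normal

# d. discovering data structure: unsupervised grouping
km = KMeans(n_clusters=3, random_state=0, n_init=10).fit(X)

print(clf.score(X, y_class), reg.score(X, y_reg), int((flags == -1).sum()), km.inertia_)
```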
14 | 15 | ### Anomaly Detection 16 | As the name suggests, the core focus of anomaly detection is to identify data points that do not align with the rest of the data. In statistics, these data points are also referred to as `outliers` 17 | 18 | #### Outliers 19 | Outliers have a **significant effect on the mean and the standard deviation** of your data, so your results will be skewed if they are not dealt with properly 20 | 21 | #### Applications of Anomaly Detection 22 | Here are some examples where anomaly detection is heavily employed: 23 | a. fraud detection
24 | b. surveillance
25 | c. diagnosis
26 | d. data cleanup
27 | e. monitoring for predictive maintenance [IoT devices] 28 | 29 | ##### Since the data is categorised as anomalous and non-anomalous, can't we solve this with classification? 30 | This assumption holds as long as the following three conditions are met: 31 | 32 | a. The training data available to us is labelled
33 | b. The anomalous and non-anomalous classes are reasonably balanced (at least a 1:5 proportion)
34 | c. The present data point is not dependent on past data points [so this is not suitable for time series] 35 | 36 | #### Reality 37 | a. It is hard to obtain labelled training data all the time
38 | b. Real-life scenarios have heavily imbalanced classes; e.g. credit-card fraud detection can have a class distribution of 1:10^x, where x can range from 3 to 6
39 | c. One more caveat is the precision and recall of such a classifier: what is the cost of a false positive, and what is the cost of a missed anomaly (a false negative)?
40 | [**Precision** measures how many of the anomalies flagged by the classifier are truly anomalies]
41 | [**Recall** measures how many of the true anomalies the classifier is able to capture; a short sketch at the end of these notes illustrates both measures] 42 | 43 | ### Types of Anomalies 44 | a. **Point Anomalies**: an individual data instance is anomalous with respect to the rest of the data (e.g. a purchase with an unusually large transaction value)
45 | b. **Contextual Anomalies**: a data instance is anomalous with respect to its context, but not otherwise (e.g. a large spike in a trend in the middle of the night)
46 | c. **Collective Anomalies**: unlike the previous two, here a collection of data instances together forms an anomaly with respect to the rest of the data
47 | i. Events that should be ordered but show a degree of disorder (e.g. an irregular rhythm in an ECG)
48 | ii. Unexpected value combinations (e.g. buying a large number of expensive items)
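To tie the sections above together, here is a minimal sketch of detecting point anomalies with a simple z-score rule and scoring the result with precision and recall. The synthetic data and the threshold of 3 are assumptions made for this illustration, not something taken from the original notes.

```python
import numpy as np
from sklearn.metrics import precision_score, recall_score

rng = np.random.RandomState(0)
values = np.concatenate([rng.normal(100, 5, size=995),    # ordinary transaction amounts
                         rng.normal(500, 20, size=5)])    # a few point anomalies
truth = np.array([0] * 995 + [1] * 5)                     # 1 marks a true anomaly

# flag a point anomaly when its z-score exceeds 3 (a common rule of thumb)
z_scores = np.abs((values - values.mean()) / values.std())
predicted = (z_scores > 3).astype(int)

# precision: how many of the flagged points are truly anomalies
# recall: how many of the true anomalies were captured
print("precision:", precision_score(truth, predicted))
print("recall:", recall_score(truth, predicted))
```

With the heavier imbalance described above (the 1:10^x case), plain accuracy would look excellent even for a detector that flags nothing, which is why precision and recall are the measures to watch here.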
49 | -------------------------------------------------------------------------------- /regression_intro.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import math 3 | import matplotlib.pyplot as plt 4 | import numpy as np 5 | import quandl 6 | from matplotlib import style 7 | from sklearn import preprocessing, model_selection 8 | from sklearn.linear_model import LinearRegression 9 | 10 | # Style file for plotting graph 11 | style.use('ggplot') 12 | 13 | # Retrieve dataframe from Quandl 14 | df = quandl.get('WIKI/GOOGL') 15 | 16 | df = df[['Adj. Open', 'Adj. High', 'Adj. Low', 'Adj. Close', 'Adj. Volume', ]] 17 | # High Low Change => Volatility of the stock 18 | df['HL_PCT'] = (df['Adj. High'] - df['Adj. Low']) / df['Adj. Low'] * 100.0 19 | # Percentage Change => Volatility change 20 | df['PCT_change'] = (df['Adj. Close'] - df['Adj. Open']) / df['Adj. Open'] * 100.0 21 | 22 | # Modified data frame with important features and labels 23 | df = df[['Adj. Close', 'HL_PCT', 'PCT_change', 'Adj. Volume']] 24 | forecast_col = 'Adj. Close' 25 | 26 | # In case, data is missing: replace with threshold value to make it outlier 27 | df.fillna(-99999, inplace=True) 28 | 29 | # Predicting 1% (0.01) [1 day into the future] 30 | forecast_out = int(math.ceil(0.01 * len(df))) 31 | print(forecast_out) 32 | 33 | # shifting them by 35 days timeframe 34 | df['label'] = df[forecast_col].shift(-forecast_out) 35 | 36 | # *** FEATURES & LABELS are obtained *** 37 | # X is the set of features except the label, 1 indicates the column 38 | # ref: stackoverflow.com => ambiguity-in-pandas-dataframe-numpy-array-axis-definition 39 | X = np.array(df.drop(['label'], 1)) 40 | 41 | # Scaling the features to normalize them between -1 and 1 42 | # done for efficiency and accuracy, but not required 43 | X = preprocessing.scale(X) 44 | # Prediction will be made against X_lately 45 | X_lately = X[-forecast_out:] 46 | X = X[:-forecast_out] 47 | df.dropna(inplace=True) 48 | # y is the label array 49 | y = np.array(df['label']) 50 | 51 | # *** CREATING TRAINING TESTING SETS with 20% (0.2) data *** 52 | X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.2) 53 | 54 | # using two classifiers: LinearRegression(single-threaded) and SVM(default kernel) 55 | clf = LinearRegression() 56 | # clf = svm.SVR() 57 | clf.fit(X_train, y_train) 58 | accuracy = clf.score(X_test, y_test) 59 | # forecast_set will be an array of predicted values for the next 35 days 60 | forecast_set = clf.predict(X_lately) 61 | # print(accuracy) 62 | print(forecast_set, accuracy, forecast_out) 63 | 64 | df['Forecast'] = np.nan 65 | 66 | # *** DateTime information for our dataframe is obtained *** 67 | last_date = df.iloc[-1].name 68 | last_unix = last_date.timestamp() 69 | one_day_in_secs = 86400 70 | next_unix = last_unix + one_day_in_secs 71 | 72 | for i in forecast_set: 73 | next_date = datetime.datetime.fromtimestamp(next_unix) 74 | next_unix += one_day_in_secs 75 | # loc is used for indexing 76 | df.loc[next_date] = [np.nan for _ in range(len(df.columns) - 1)] + [i] 77 | 78 | print(df.tail()) 79 | 80 | # *** VISUALISATION OF FORECAST *** 81 | df['Adj. 
Close'].plot() 82 | df['Forecast'].plot() 83 | plt.legend(loc=4) 84 | plt.xlabel('Date') 85 | plt.ylabel('Price') 86 | plt.show() 87 | -------------------------------------------------------------------------------- /hparams_grid_search_keras_nn.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from keras import Sequential 4 | from keras.layers import Dense 5 | from keras.wrappers.scikit_learn import KerasClassifier 6 | from sklearn.model_selection import GridSearchCV 7 | from sklearn.model_selection import train_test_split 8 | 9 | DATA_FILE = '' 10 | 11 | feature_cols = ['feat1', 'feat2', 'feat3', 'feat4', 'feat5', 'feat6'] 12 | labels = ['y'] 13 | 14 | 15 | def load_data(filepath): 16 | data = pd.read_csv(filepath) 17 | return data 18 | 19 | 20 | def describe_data(data, name): 21 | print('\nGetting the summary for ' + name + '\n') 22 | print('Dataset Length:', len(data)) 23 | print('Dataset Shape:', data.shape) 24 | print(data.columns) 25 | print(data.dtypes) 26 | 27 | 28 | def create_model(): 29 | model = Sequential() 30 | model.add(Dense(12, input_dim=5, kernel_initializer='uniform', activation='relu')) 31 | model.add(Dense(8, kernel_initializer='uniform', activation='relu')) 32 | model.add(Dense(1, kernel_initializer='uniform', activation='sigmoid')) 33 | 34 | model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) 35 | 36 | return model 37 | 38 | 39 | if __name__ == '__main__': 40 | 41 | data_df = load_data(DATA_FILE) 42 | 43 | data_df = data_df.dropna() 44 | print(data_df.isnull().sum(axis=0)) 45 | 46 | X_data = data_df[feature_cols] 47 | y_data = data_df[['y']] 48 | 49 | # seed for reproducibility 50 | seed = 7 51 | np.random.seed(seed=seed) 52 | 53 | # train test split 54 | X, X_test, y, y_test = train_test_split(X_data, y_data, test_size=.20, random_state=42) 55 | 56 | # train val split 57 | X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=.20, random_state=42) 58 | 59 | # summarize the datasets 60 | describe_data(X_train, name="X_train") 61 | describe_data(X_val, name="X_val") 62 | describe_data(X_test, name="X_test") 63 | describe_data(y_train, name="y_train") 64 | describe_data(y_val, name="y_val") 65 | describe_data(y_test, name="y_test") 66 | 67 | # create model 68 | model = KerasClassifier(build_fn=create_model) 69 | 70 | # hyperparamater optimization 71 | batch_size = [10, 20, 40, 60, 80, 100] 72 | epochs = [10, 50, 100] 73 | learn_rate = [0.001, 0.01, 0.1, 0.2, 0.3] 74 | momentum = [0.0, 0.2, 0.4, 0.6, 0.8, 0.9] 75 | weight_constraint = [1, 2, 3, 4, 5] 76 | dropout_rate = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9] 77 | neurons = [1, 5, 10, 15, 20, 25, 30] 78 | 79 | param_grid = dict(batch_size=batch_size, epochs=epochs) 80 | grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1) 81 | grid_result = grid.fit(X=X_train, y=y_train) 82 | 83 | # summarize results 84 | print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_)) 85 | means = grid_result.cv_results_['mean_test_score'] 86 | stds = grid_result.cv_results_['std_test_score'] 87 | params = grid_result.cv_results_['params'] 88 | for mean, stdev, param in zip(means, stds, params): 89 | print("%f (%f) with: %r" % (mean, stdev, param)) 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | -------------------------------------------------------------------------------- /k_nearest_neighbors/data.py: 
-------------------------------------------------------------------------------- 1 | cities = [(-86.75, 33.5666666666667, 'Python'), 2 | (-88.25, 30.6833333333333, 'Python'), 3 | (-112.016666666667, 33.4333333333333, 'Java'), 4 | (-110.933333333333, 32.1166666666667, 'Java'), 5 | (-92.2333333333333, 34.7333333333333, 'R'), 6 | (-121.95, 37.7, 'R'), 7 | (-118.15, 33.8166666666667, 'Python'), 8 | (-118.233333333333, 34.05, 'Java'), 9 | (-122.316666666667, 37.8166666666667, 'R'), 10 | (-117.6, 34.05, 'Python'), 11 | (-116.533333333333, 33.8166666666667, 'Python'), 12 | (-121.5, 38.5166666666667, 'R'), 13 | (-117.166666666667, 32.7333333333333, 'R'), 14 | (-122.383333333333, 37.6166666666667, 'R'), 15 | (-121.933333333333, 37.3666666666667, 'R'), 16 | (-122.016666666667, 36.9833333333333, 'Python'), 17 | (-104.716666666667, 38.8166666666667, 'Python'), 18 | (-104.866666666667, 39.75, 'Python'), 19 | (-72.65, 41.7333333333333, 'R'), 20 | (-75.6, 39.6666666666667, 'Python'), 21 | (-77.0333333333333, 38.85, 'Python'), 22 | (-80.2666666666667, 25.8, 'Java'), 23 | (-81.3833333333333, 28.55, 'Java'), 24 | (-82.5333333333333, 27.9666666666667, 'Java'), 25 | (-84.4333333333333, 33.65, 'Python'), 26 | (-116.216666666667, 43.5666666666667, 'Python'), 27 | (-87.75, 41.7833333333333, 'Java'), 28 | (-86.2833333333333, 39.7333333333333, 'Java'), 29 | (-93.65, 41.5333333333333, 'Java'), 30 | (-97.4166666666667, 37.65, 'Java'), 31 | (-85.7333333333333, 38.1833333333333, 'Python'), 32 | (-90.25, 29.9833333333333, 'Java'), 33 | (-70.3166666666667, 43.65, 'R'), 34 | (-76.6666666666667, 39.1833333333333, 'R'), 35 | (-71.0333333333333, 42.3666666666667, 'R'), 36 | (-72.5333333333333, 42.2, 'R'), 37 | (-83.0166666666667, 42.4166666666667, 'Python'), 38 | (-84.6, 42.7833333333333, 'Python'), 39 | (-93.2166666666667, 44.8833333333333, 'Python'), 40 | (-90.0833333333333, 32.3166666666667, 'Java'), 41 | (-94.5833333333333, 39.1166666666667, 'Java'), 42 | (-90.3833333333333, 38.75, 'Python'), 43 | (-108.533333333333, 45.8, 'Python'), 44 | (-95.9, 41.3, 'Python'), 45 | (-115.166666666667, 36.0833333333333, 'Java'), 46 | (-71.4333333333333, 42.9333333333333, 'R'), 47 | (-74.1666666666667, 40.7, 'R'), 48 | (-106.616666666667, 35.05, 'Python'), 49 | (-78.7333333333333, 42.9333333333333, 'R'), 50 | (-73.9666666666667, 40.7833333333333, 'R'), 51 | (-80.9333333333333, 35.2166666666667, 'Python'), 52 | (-78.7833333333333, 35.8666666666667, 'Python'), 53 | (-100.75, 46.7666666666667, 'Java'), 54 | (-84.5166666666667, 39.15, 'Java'), 55 | (-81.85, 41.4, 'Java'), 56 | (-82.8833333333333, 40, 'Java'), 57 | (-97.6, 35.4, 'Python'), 58 | (-122.666666666667, 45.5333333333333, 'Python'), 59 | (-75.25, 39.8833333333333, 'Python'), 60 | (-80.2166666666667, 40.5, 'Python'), 61 | (-71.4333333333333, 41.7333333333333, 'R'), 62 | (-81.1166666666667, 33.95, 'R'), 63 | (-96.7333333333333, 43.5666666666667, 'Python'), 64 | (-90, 35.05, 'R'), 65 | (-86.6833333333333, 36.1166666666667, 'R'), 66 | (-97.7, 30.3, 'Python'), 67 | (-96.85, 32.85, 'Java'), 68 | (-95.35, 29.9666666666667, 'Java'), 69 | (-98.4666666666667, 29.5333333333333, 'Java'), 70 | (-111.966666666667, 40.7666666666667, 'Python'), 71 | (-73.15, 44.4666666666667, 'R'), 72 | (-77.3333333333333, 37.5, 'Python'), 73 | (-122.3, 47.5333333333333, 'Python'), 74 | (-89.3333333333333, 43.1333333333333, 'R'), 75 | (-104.816666666667, 41.15, 'Java')] 76 | 77 | cities = [([longitude, latitude], language) for longitude, latitude, language in cities] 78 | 
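# Note: the comprehension above reshapes each (longitude, latitude, language) triple into a
# ([longitude, latitude], language) pair, i.e. the (point, label) format that knn_classify in
# k_nearest_neighbors/utils.py expects. The quick check below is an illustrative addition
# (an assumption for this sketch, not part of the original data file).
if __name__ == "__main__":
    point, label = cities[0]
    print(point, label)  # [-86.75, 33.5666666666667] Python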
-------------------------------------------------------------------------------- /k_means_clustering/Understanding the algorithm.md: -------------------------------------------------------------------------------- 1 | ### Introduction 2 | 3 | * K-means clustering is an unsupervised machine learning algorithm. 4 | * K-means algorithm is an iterative algorithm that tries to partition the dataset into `K` pre-defined distinct non-overlapping subgroups(clusters) where each data point belongs to only one group. 5 | * It tries to make the intra-cluster data points as similar as possible while also keeping the clusters as different (far) as possible. 6 | * It assigns data points to a cluster such that the sum of the squared distance between the data points and the cluster’s centroid (arithmetic mean of all the data points that belong to that cluster) is at the minimum. 7 | * The less variation we have within clusters, the more homogeneous (similar) the data points are within the same cluster 8 | 9 | ### Problem Statement 10 | 11 | Given some **unlabelled** data points, we have to identify subgroups such that 12 | 1. Points in the same subgroup are similar to each other. 13 | 2. Points in different subgroup are dissimilar to each other. 14 | 15 | **Example used here** 16 | 17 | We have the data for a large social networking company which is planning to host meetups for their users. We have the users' location data. Now the VP of Growth want you to `choose the` **meetup locations** `so it becomes convinient for everyone to attend` 18 | 19 | ### Intuition 20 | 21 | * In K-means, `k` is the `no. of subgroups` you want the data to be segregated into ? 22 | * Optimal value of `k` can be derived by using `elbow method` (discussed below) 23 | **Centroid Initialization** 24 | * We begin by initializing `k` random data points as the centroids (first pass) 25 | * The measure of distance between the data points and centroids can be calculated using either `Euclidean Distance` or `Manhattan Distance` 26 | 27 | **Iteration** 28 | * **Cluster assigment:** We assign a cluster to the data point that is nearest to it. 29 | * Once all the points are assigned to their nearest centroids, then for each cluster the centroid is calculated again using centroid initialization step. 30 | * With the new centroids, we repeat the step of cluster assignment. 31 | * These two steps are iterated as long as `there is no change in cluster assigment of data points` i.e. no data point is moving into a new cluster. 32 | 33 | ### Choosing K value - Elbow method 34 | * Elbow method gives us an idea on what a good k number of clusters. 35 | * This is based on the sum of squared distance (SSE) between data points and their assigned clusters’ centroids. 36 | * We pick `k` at the spot where SSE starts to flatten out and forming an elbow. 37 | 38 | Here I am increasing the k value by 1 from `1 to 10` and printing the sum of squared distance with respected `k` value. 39 | ![Code](https://miro.medium.com/max/866/1*9z8erk4kvsnxkfv-QhsHZg.png) 40 | 41 | ### Note 42 | 43 | * K-means gives more weight to the bigger clusters. 44 | * K-means assumes spherical shapes of clusters (with radius equal to the distance between the centroid and the furthest data point) and doesn’t work well when clusters are in different shapes such as elliptical clusters. 
45 | * If there is overlapping between clusters, K-means doesn’t have an intrinsic measure for uncertainty for the examples belong to the overlapping region in order to determine for which cluster to assign each data point. 46 | * K-means may still cluster the data even if it can’t be clustered such as data that comes from uniform distributions. 47 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to making participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, sex characteristics, gender identity and expression, 9 | level of experience, education, socio-economic status, nationality, personal 10 | appearance, race, religion, or sexual identity and orientation. 11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies both within project spaces and in public spaces 49 | when an individual is representing the project or its community. Examples of 50 | representing a project or community include using an official project e-mail 51 | address, posting via an official social media account, or acting as an appointed 52 | representative at an online or offline event. Representation of a project may be 53 | further defined and clarified by project maintainers. 54 | 55 | ## Enforcement 56 | 57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 58 | reported by contacting the project team at singhal.amogh1995@gmail.com. All 59 | complaints will be reviewed and investigated and will result in a response that 60 | is deemed necessary and appropriate to the circumstances. 
The project team is 61 | obligated to maintain confidentiality with regard to the reporter of an incident. 62 | Further details of specific enforcement policies may be posted separately. 63 | 64 | Project maintainers who do not follow or enforce the Code of Conduct in good 65 | faith may face temporary or permanent repercussions as determined by other 66 | members of the project's leadership. 67 | 68 | ## Attribution 69 | 70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 71 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html 72 | 73 | [homepage]: https://www.contributor-covenant.org 74 | 75 | For answers to common questions about this code of conduct, see 76 | https://www.contributor-covenant.org/faq 77 | -------------------------------------------------------------------------------- /logistic_regression/data.py: -------------------------------------------------------------------------------- 1 | data = [(0.7, 48000, 1), (1.9, 48000, 0), (2.5, 60000, 1), (4.2, 63000, 0), (6, 76000, 0), (6.5, 69000, 0), 2 | (7.5, 76000, 0), (8.1, 88000, 0), (8.7, 83000, 1), (10, 83000, 1), (0.8, 43000, 0), (1.8, 60000, 0), 3 | (10, 79000, 1), (6.1, 76000, 0), (1.4, 50000, 0), (9.1, 92000, 0), (5.8, 75000, 0), (5.2, 69000, 0), 4 | (1, 56000, 0), (6, 67000, 0), (4.9, 74000, 0), (6.4, 63000, 1), (6.2, 82000, 0), (3.3, 58000, 0), 5 | (9.3, 90000, 1), (5.5, 57000, 1), (9.1, 102000, 0), (2.4, 54000, 0), (8.2, 65000, 1), (5.3, 82000, 0), 6 | (9.8, 107000, 0), (1.8, 64000, 0), (0.6, 46000, 1), (0.8, 48000, 0), (8.6, 84000, 1), (0.6, 45000, 0), 7 | (0.5, 30000, 1), (7.3, 89000, 0), (2.5, 48000, 1), (5.6, 76000, 0), (7.4, 77000, 0), (2.7, 56000, 0), 8 | (0.7, 48000, 0), (1.2, 42000, 0), (0.2, 32000, 1), (4.7, 56000, 1), (2.8, 44000, 1), (7.6, 78000, 0), 9 | (1.1, 63000, 0), (8, 79000, 1), (2.7, 56000, 0), (6, 52000, 1), (4.6, 56000, 0), (2.5, 51000, 0), 10 | (5.7, 71000, 0), (2.9, 65000, 0), (1.1, 33000, 1), (3, 62000, 0), (4, 71000, 0), (2.4, 61000, 0), 11 | (7.5, 75000, 0), (9.7, 81000, 1), (3.2, 62000, 0), (7.9, 88000, 0), (4.7, 44000, 1), (2.5, 55000, 0), 12 | (1.6, 41000, 0), (6.7, 64000, 1), (6.9, 66000, 1), (7.9, 78000, 1), (8.1, 102000, 0), (5.3, 48000, 1), 13 | (8.5, 66000, 1), (0.2, 56000, 0), (6, 69000, 0), (7.5, 77000, 0), (8, 86000, 0), (4.4, 68000, 0), 14 | (4.9, 75000, 0), (1.5, 60000, 0), (2.2, 50000, 0), (3.4, 49000, 1), (4.2, 70000, 0), (7.7, 98000, 0), 15 | (8.2, 85000, 0), (5.4, 88000, 0), (0.1, 46000, 0), (1.5, 37000, 0), (6.3, 86000, 0), (3.7, 57000, 0), 16 | (8.4, 85000, 0), (2, 42000, 0), (5.8, 69000, 1), (2.7, 64000, 0), (3.1, 63000, 0), (1.9, 48000, 0), 17 | (10, 72000, 1), (0.2, 45000, 0), (8.6, 95000, 0), (1.5, 64000, 0), (9.8, 95000, 0), (5.3, 65000, 0), 18 | (7.5, 80000, 0), (9.9, 91000, 0), (9.7, 50000, 1), (2.8, 68000, 0), (3.6, 58000, 0), (3.9, 74000, 0), 19 | (4.4, 76000, 0), (2.5, 49000, 0), (7.2, 81000, 0), (5.2, 60000, 1), (2.4, 62000, 0), (8.9, 94000, 0), 20 | (2.4, 63000, 0), (6.8, 69000, 1), (6.5, 77000, 0), (7, 86000, 0), (9.4, 94000, 0), (7.8, 72000, 1), 21 | (0.2, 53000, 0), (10, 97000, 0), (5.5, 65000, 0), (7.7, 71000, 1), (8.1, 66000, 1), (9.8, 91000, 0), 22 | (8, 84000, 0), (2.7, 55000, 0), (2.8, 62000, 0), (9.4, 79000, 0), (2.5, 57000, 0), (7.4, 70000, 1), 23 | (2.1, 47000, 0), (5.3, 62000, 1), (6.3, 79000, 0), (6.8, 58000, 1), (5.7, 80000, 0), (2.2, 61000, 0), 24 | (4.8, 62000, 0), (3.7, 64000, 0), (4.1, 85000, 0), (2.3, 51000, 0), (3.5, 58000, 0), (0.9, 43000, 0), 25 | (0.9, 54000, 0), (4.5, 
74000, 0), (6.5, 55000, 1), (4.1, 41000, 1), (7.1, 73000, 0), (1.1, 66000, 0), 26 | (9.1, 81000, 1), (8, 69000, 1), (7.3, 72000, 1), (3.3, 50000, 0), (3.9, 58000, 0), (2.6, 49000, 0), 27 | (1.6, 78000, 0), (0.7, 56000, 0), (2.1, 36000, 1), (7.5, 90000, 0), (4.8, 59000, 1), (8.9, 95000, 0), 28 | (6.2, 72000, 0), (6.3, 63000, 0), (9.1, 100000, 0), (7.3, 61000, 1), (5.6, 74000, 0), (0.5, 66000, 0), 29 | (1.1, 59000, 0), (5.1, 61000, 0), (6.2, 70000, 0), (6.6, 56000, 1), (6.3, 76000, 0), (6.5, 78000, 0), 30 | (5.1, 59000, 0), (9.5, 74000, 1), (4.5, 64000, 0), (2, 54000, 0), (1, 52000, 0), (4, 69000, 0), (6.5, 76000, 0), 31 | (3, 60000, 0), (4.5, 63000, 0), (7.8, 70000, 0), (3.9, 60000, 1), (0.8, 51000, 0), (4.2, 78000, 0), 32 | (1.1, 54000, 0), (6.2, 60000, 0), (2.9, 59000, 0), (2.1, 52000, 0), (8.2, 87000, 0), (4.8, 73000, 0), 33 | (2.2, 42000, 1), (9.1, 98000, 0), (6.5, 84000, 0), (6.9, 73000, 0), (5.1, 72000, 0), (9.1, 69000, 1), 34 | (9.8, 79000, 1), ] 35 | data = list(map(list, data)) # change tuples to lists 36 | -------------------------------------------------------------------------------- /use_cases_insurnace.md: -------------------------------------------------------------------------------- 1 | #### Reference:- https://activewizards.com/blog/top-10-data-science-use-cases-in-insurance/ 2 | 3 | ## Other use cases 4 | 5 | ### Lapse management: 6 | ##### Identifies policies that are likely to lapse, and how to approach the insured about maintaining the policy. Calculate the probability to lapse 7 | 8 | ### Recommendation engine: 9 | ##### Given similar customers, discovers where individual insureds may have too much, or too little, insurance. Then, proactively help them get the right insurance for their current situation. 10 | 11 | ### Assessor assistant: 12 | ##### Once a car has been towed to a body shop, use computer vision to help the assessor identify issues which need to be fixed. This helps accuracy, speeds an assessment, and keeps the customer informed with any repairs. Car damage detection 13 | 14 | ### Property analysis: 15 | ##### Given images of a property, identifies structures on the property and any condition issues. Insurers can proactively help customers schedule repairs by identifying issues in their roofs, or suggest other coverage when new structures, like a swimming pool, are installed. 16 | 17 | ### Fraud detection: 18 | ##### Identifies claims which are potentially fraudulent. Rare events problem. Class imbalance is a huge challenge here 19 | 20 | ### Personalized offers: 21 | ##### Improves the customer experience by offering relevant information about the coverage the insured may need based on life events, such as the birth of a child, purchase of a home or car. 22 | 23 | ### Claims processing 24 | ##### Claims processing includes multiple tasks, including review, investigation, adjustment, remittance, or denial. While performing these tasks, numerous issues might occur: 25 | 26 | * Manual/inconsistent processing: Many claims processing tasks require human interaction that is prone to errors. 27 | * Varying data formats: Customers send data in different formats to make claims. 28 | * Changing regulation: Businesses need to accord in changing regulations promptly. Thus, constant staff training and process update are required for these companies. 29 | 30 | ### Claims document processing 31 | As customers make claims when they are in an uncomfortable position, customer experience and speed are critical in these processes. 
Thanks to document capture technologies, businesses can rapidly handle large volumes of documents required for claims processing tasks, detect fraudulent claims, and check if claims fit regulations. 32 | 33 | ### Application processing 34 | Application processing requires extracting information from a high volume of documents. While performing this task manually can take too long and prone to errors, document capture technologies enable insurance companies to automatically extract relevant data from application documents and accelerate insurance application processes with fewer errors and improved customer satisfaction. 35 | 36 | ### Insurance pricing 37 | AI can assess customers’ risk profiles based on lab testing, biometric data, claims data, patient-generated health data, and identify the optimal prices to quote with the right insurance plan. This would decrease the workflow in business operations and reduce costs while improving customer satisfaction. 38 | 39 | ### Document creation 40 | Insurance companies need to generate high volumes of documents, including specific information about the insurer. While creating these documents manually consume time and prone to errors, using AI and automation technologies can generate policy statements without mistakes. 41 | 42 | ### Responding to customer queries 43 | Conversational AI technologies can support insurance companies for faster replies to customer queries. For example, a South African insurance company, Hollard, has achieved 98% automation and reduced cost per transaction by 91%, according to its solution providers, LarcAI and UiPath. 44 | 45 | 46 | 47 | 48 | -------------------------------------------------------------------------------- /multiple_regression/utils.py: -------------------------------------------------------------------------------- 1 | import random 2 | from functools import partial 3 | 4 | from helpers.gradient_descent import minimize_stochastic 5 | from helpers.linear_algebra import dot, vector_add 6 | from helpers.probabilty import normal_cdf 7 | from helpers.stats import de_mean 8 | 9 | 10 | def predict(x_i, beta): 11 | return dot(x_i, beta) 12 | 13 | 14 | def error(x_i, y_i, beta): 15 | return y_i - predict(x_i, beta) 16 | 17 | 18 | def squared_error(x_i, y_i, beta): 19 | return error(x_i, y_i, beta) ** 2 20 | 21 | 22 | def squared_error_gradient(x_i, y_i, beta): 23 | """the gradient corresponding to the ith squared error term""" 24 | return [-2 * x_ij * error(x_i, y_i, beta) 25 | for x_ij in x_i] 26 | 27 | 28 | def total_sum_of_squares(y): 29 | """The total squared variation of y_i's from their mean""" 30 | return sum(v ** 2 for v in de_mean(y)) 31 | 32 | 33 | def estimate_beta(x, y): 34 | beta_initial = [random.random() for x_i in x[0]] 35 | return minimize_stochastic(squared_error, 36 | squared_error_gradient, 37 | x, y, 38 | beta_initial, 39 | 0.001) 40 | 41 | 42 | def multiple_r_squared(x, y, beta): 43 | sum_of_squared_errors = sum(error(x_i, y_i, beta) ** 2 44 | for x_i, y_i in zip(x, y)) 45 | return 1.0 - sum_of_squared_errors / total_sum_of_squares(y) 46 | 47 | 48 | def bootstrap_sample(data): 49 | """randomly samples len(data) elements with replacement""" 50 | return [random.choice(data) for _ in data] 51 | 52 | 53 | def bootstrap_statistic(data, stats_fn, num_samples): 54 | """evaluates stats_fn on num_samples bootstrap samples from data""" 55 | return [stats_fn(bootstrap_sample(data)) 56 | for _ in range(num_samples)] 57 | 58 | 59 | def estimate_sample_beta(sample): 60 | x_sample, y_sample = 
list(zip(*sample)) # magic unzipping trick 61 | return estimate_beta(x_sample, y_sample) 62 | 63 | 64 | def p_value(beta_hat_j, sigma_hat_j): 65 | if beta_hat_j > 0: 66 | return 2 * (1 - normal_cdf(beta_hat_j / sigma_hat_j)) 67 | else: 68 | return 2 * normal_cdf(beta_hat_j / sigma_hat_j) 69 | 70 | # 71 | # REGULARIZED REGRESSION 72 | # 73 | 74 | # alpha is a *hyperparameter* controlling how harsh the penalty is 75 | # sometimes it's called "lambda" but that already means something in Python 76 | 77 | 78 | def ridge_penalty(beta, alpha): 79 | return alpha * dot(beta[1:], beta[1:]) 80 | 81 | 82 | def squared_error_ridge(x_i, y_i, beta, alpha): 83 | """estimate error plus ridge penalty on beta""" 84 | return error(x_i, y_i, beta) ** 2 + ridge_penalty(beta, alpha) 85 | 86 | 87 | def ridge_penalty_gradient(beta, alpha): 88 | """gradient of just the ridge penalty""" 89 | return [0] + [2 * alpha * beta_j for beta_j in beta[1:]] 90 | 91 | 92 | def squared_error_ridge_gradient(x_i, y_i, beta, alpha): 93 | """the gradient corresponding to the ith squared error term 94 | including the ridge penalty""" 95 | return vector_add(squared_error_gradient(x_i, y_i, beta), 96 | ridge_penalty_gradient(beta, alpha)) 97 | 98 | 99 | def estimate_beta_ridge(x, y, alpha): 100 | """use gradient descent to fit a ridge regression 101 | with penalty alpha""" 102 | beta_initial = [random.random() for x_i in x[0]] 103 | return minimize_stochastic(partial(squared_error_ridge, alpha=alpha), 104 | partial(squared_error_ridge_gradient, 105 | alpha=alpha), 106 | x, y, 107 | beta_initial, 108 | 0.001) 109 | 110 | 111 | def lasso_penalty(beta, alpha): 112 | return alpha * sum(abs(beta_i) for beta_i in beta[1:]) -------------------------------------------------------------------------------- /working_with_data/model.py: -------------------------------------------------------------------------------- 1 | import csv 2 | from collections import defaultdict 3 | from functools import reduce 4 | 5 | import dateutil 6 | 7 | from helpers.stats import correlation 8 | from working_with_data.data import X 9 | from working_with_data.utils import parse_rows_with, parse_dict, day_over_day_changes, picker, group_by, random_normal, \ 10 | pluck, scale, rescale, de_mean_matrix, principal_component_analysis, transform_vector 11 | 12 | if __name__ == "__main__": 13 | 14 | xs = [random_normal() for _ in range(1000)] 15 | ys1 = [x + random_normal() / 2 for x in xs] 16 | ys2 = [-x + random_normal() / 2 for x in xs] 17 | 18 | print("correlation(xs, ys1)", correlation(xs, ys1)) 19 | print("correlation(xs, ys2)", correlation(xs, ys2)) 20 | 21 | # safe parsing 22 | 23 | data = [] 24 | 25 | with open("comma_delimited_stock_prices.csv", "r", encoding='utf8', newline='') as f: 26 | reader = csv.reader(f) 27 | for line in parse_rows_with(reader, [dateutil.parser.parse, None, float]): 28 | data.append(line) 29 | 30 | for row in data: 31 | if any(x is None for x in row): 32 | print(row) 33 | 34 | print("stocks") 35 | with open("stocks.txt", "r", encoding='utf8', newline='') as f: 36 | reader = csv.DictReader(f, delimiter="\t") 37 | data = [parse_dict(row, { 'date' : dateutil.parser.parse, 38 | 'closing_price' : float }) 39 | for row in reader] 40 | 41 | max_aapl_price = max(row["closing_price"] 42 | for row in data 43 | if row["symbol"] == "AAPL") 44 | print("max aapl price", max_aapl_price) 45 | 46 | # group rows by symbol 47 | by_symbol = defaultdict(list) 48 | 49 | for row in data: 50 | by_symbol[row["symbol"]].append(row) 51 | 52 | # use a dict 
comprehension to find the max for each symbol 53 | max_price_by_symbol = { symbol : max(row["closing_price"] 54 | for row in grouped_rows) 55 | for symbol, grouped_rows in by_symbol.items() } 56 | print("max price by symbol") 57 | print(max_price_by_symbol) 58 | 59 | # key is symbol, value is list of "change" dicts 60 | changes_by_symbol = group_by(picker("symbol"), data, day_over_day_changes) 61 | # collect all "change" dicts into one big list 62 | all_changes = [change 63 | for changes in changes_by_symbol.values() 64 | for change in changes] 65 | 66 | print("max change", max(all_changes, key=picker("change"))) 67 | print("min change", min(all_changes, key=picker("change"))) 68 | 69 | # to combine percent changes, we add 1 to each, multiply them, and subtract 1 70 | # for instance, if we combine +10% and -20%, the overall change is 71 | # (1 + 10%) * (1 - 20%) - 1 = 1.1 * .8 - 1 = -12% 72 | def combine_pct_changes(pct_change1, pct_change2): 73 | return (1 + pct_change1) * (1 + pct_change2) - 1 74 | 75 | def overall_change(changes): 76 | return reduce(combine_pct_changes, pluck("change", changes)) 77 | 78 | overall_change_by_month = group_by(lambda row: row['date'].month, 79 | all_changes, 80 | overall_change) 81 | print("overall change by month") 82 | print(overall_change_by_month) 83 | 84 | print("rescaling") 85 | 86 | data = [[1, 20, 2], 87 | [1, 30, 3], 88 | [1, 40, 4]] 89 | 90 | print("original: ", data) 91 | print("scale: ", scale(data)) 92 | print("rescaled: ", rescale(data)) 93 | print() 94 | 95 | print("PCA") 96 | 97 | Y = de_mean_matrix(X) 98 | components = principal_component_analysis(Y, 2) 99 | print("principal components", components) 100 | print("first point", Y[0]) 101 | print("first point transformed", transform_vector(Y[0], components)) -------------------------------------------------------------------------------- /k_nearest_neighbors/utils.py: -------------------------------------------------------------------------------- 1 | import random 2 | from collections import Counter 3 | 4 | import matplotlib.pyplot as plt 5 | 6 | from helpers.linear_algebra import distance 7 | from k_nearest_neighbors.data import cities 8 | 9 | 10 | def raw_majority_vote(labels): 11 | votes = Counter(labels) 12 | winner, _ = votes.most_common(1)[0] 13 | return winner 14 | 15 | 16 | def majority_vote(labels): 17 | """assumes that labels are ordered from nearest to farthest""" 18 | vote_counts = Counter(labels) 19 | winner, winner_count = vote_counts.most_common(1)[0] 20 | num_winners = len([count 21 | for count in vote_counts.values() 22 | if count == winner_count]) 23 | 24 | if num_winners == 1: 25 | return winner # unique winner, so return it 26 | else: 27 | return majority_vote(labels[:-1]) # try again without the farthest 28 | 29 | 30 | def knn_classify(k, labeled_points, new_point): 31 | """each labeled point should be a pair (point, label)""" 32 | 33 | # order the labeled points from nearest to farthest 34 | by_distance = sorted(labeled_points, 35 | key=lambda point_label: distance(point_label[0], new_point)) 36 | 37 | # find the labels for the k closest 38 | k_nearest_labels = [label for _, label in by_distance[:k]] 39 | 40 | # and let them vote 41 | return majority_vote(k_nearest_labels) 42 | 43 | 44 | def plot_state_borders(plt): 45 | pass 46 | 47 | 48 | def plot_cities(): 49 | 50 | # key is language, value is pair (longitudes, latitudes) 51 | plots = { "Java" : ([], []), "Python" : ([], []), "R" : ([], []) } 52 | 53 | # we want each language to have a different marker and color 54 | 
markers = { "Java" : "o", "Python" : "s", "R" : "^" } 55 | colors = { "Java" : "r", "Python" : "b", "R" : "g" } 56 | 57 | for (longitude, latitude), language in cities: 58 | plots[language][0].append(longitude) 59 | plots[language][1].append(latitude) 60 | 61 | # create a scatter series for each language 62 | for language, (x, y) in plots.items(): 63 | plt.scatter(x, y, color=colors[language], marker=markers[language], 64 | label=language, zorder=10) 65 | 66 | plot_state_borders(plt) # assume we have a function that does this 67 | 68 | plt.legend(loc=0) # let matplotlib choose the location 69 | plt.axis([-130,-60,20,55]) # set the axes 70 | plt.title("Favorite Programming Languages") 71 | plt.show() 72 | 73 | 74 | def classify_and_plot_grid(k=1): 75 | plots = { "Java" : ([], []), "Python" : ([], []), "R" : ([], []) } 76 | markers = { "Java" : "o", "Python" : "s", "R" : "^" } 77 | colors = { "Java" : "r", "Python" : "b", "R" : "g" } 78 | 79 | for longitude in range(-130, -60): 80 | for latitude in range(20, 55): 81 | predicted_language = knn_classify(k, cities, [longitude, latitude]) 82 | plots[predicted_language][0].append(longitude) 83 | plots[predicted_language][1].append(latitude) 84 | 85 | # create a scatter series for each language 86 | for language, (x, y) in plots.items(): 87 | plt.scatter(x, y, color=colors[language], marker=markers[language], 88 | label=language, zorder=0) 89 | 90 | plot_state_borders(plt) # assume we have a function that does this 91 | 92 | plt.legend(loc=0) # let matplotlib choose the location 93 | plt.axis([-130,-60,20,55]) # set the axes 94 | plt.title(str(k) + "-Nearest Neighbor Programming Languages") 95 | plt.show() 96 | 97 | # 98 | # the curse of dimensionality 99 | # 100 | 101 | 102 | def random_point(dim): 103 | return [random.random() for _ in range(dim)] 104 | 105 | 106 | def random_distances(dim, num_pairs): 107 | return [distance(random_point(dim), random_point(dim)) 108 | for _ in range(num_pairs)] 109 | -------------------------------------------------------------------------------- /neural_network/utils.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import matplotlib 4 | import matplotlib.pyplot as plt 5 | 6 | from helpers.linear_algebra import dot 7 | 8 | 9 | def step_function(x): 10 | return 1 if x >= 0 else 0 11 | 12 | 13 | def perceptron_output(weights, bias, x): 14 | """returns 1 if the perceptron 'fires', 0 if not""" 15 | return step_function(dot(weights, x) + bias) 16 | 17 | 18 | def sigmoid(t): 19 | return 1 / (1 + math.exp(-t)) 20 | 21 | 22 | def neuron_output(weights, inputs): 23 | return sigmoid(dot(weights, inputs)) 24 | 25 | 26 | def feed_forward(neural_network, input_vector): 27 | """takes in a neural network 28 | (represented as a list of lists of lists of weights) 29 | and returns the output from forward-propagating the input""" 30 | 31 | outputs = [] 32 | 33 | for layer in neural_network: 34 | input_with_bias = input_vector + [1] # add a bias input 35 | output = [neuron_output(neuron, input_with_bias) # compute the output 36 | for neuron in layer] # for this layer 37 | outputs.append(output) # and remember it 38 | 39 | # the input to the next layer is the output of this one 40 | input_vector = output 41 | 42 | return outputs 43 | 44 | 45 | def backpropagate(network, input_vector, target): 46 | hidden_outputs, outputs = feed_forward(network, input_vector) 47 | 48 | # the output * (1 - output) is from the derivative of sigmoid 49 | output_deltas = [output * (1 - output) * 
(output - target[i]) 50 | for i, output in enumerate(outputs)] 51 | 52 | # adjust weights for output layer (network[-1]) 53 | for i, output_neuron in enumerate(network[-1]): 54 | for j, hidden_output in enumerate(hidden_outputs + [1]): 55 | output_neuron[j] -= output_deltas[i] * hidden_output 56 | 57 | # back-propagate errors to hidden layer 58 | hidden_deltas = [hidden_output * (1 - hidden_output) * 59 | dot(output_deltas, [n[i] for n in network[-1]]) 60 | for i, hidden_output in enumerate(hidden_outputs)] 61 | 62 | # adjust weights for hidden layer (network[0]) 63 | for i, hidden_neuron in enumerate(network[0]): 64 | for j, input in enumerate(input_vector + [1]): 65 | hidden_neuron[j] -= hidden_deltas[i] * input 66 | 67 | 68 | def patch(x, y, hatch, color): 69 | """return a matplotlib 'patch' object with the specified 70 | location, crosshatch pattern, and color""" 71 | return matplotlib.patches.Rectangle((x - 0.5, y - 0.5), 1, 1, 72 | hatch=hatch, fill=False, color=color) 73 | 74 | 75 | def show_weights(network, neuron_idx): 76 | weights = network[0][neuron_idx] 77 | abs_weights = [abs(weight) for weight in weights] 78 | 79 | grid = [abs_weights[row:(row + 5)] # turn the weights into a 5x5 grid 80 | for row in range(0, 25, 5)] # [weights[0:5], ..., weights[20:25]] 81 | 82 | ax = plt.gca() # to use hatching, we'll need the axis 83 | 84 | ax.imshow(grid, # here same as plt.imshow 85 | cmap=matplotlib.cm.binary, # use white-black color scale 86 | interpolation='none') # plot blocks as blocks 87 | 88 | # cross-hatch the negative weights 89 | for i in range(5): # row 90 | for j in range(5): # column 91 | if weights[5 * i + j] < 0: # row i, column j = weights[5*i + j] 92 | # add black and white hatches, so visible whether dark or light 93 | ax.add_patch(patch(j, i, '/', "white")) 94 | ax.add_patch(patch(j, i, '\\', "black")) 95 | plt.show() 96 | 97 | 98 | if __name__ == '__main__': 99 | xor_network = [[[20, 20, -30], 100 | [20, 20, -10]], 101 | [[-60, 60, -30]]] 102 | 103 | for x in [0,1]: 104 | for y in [0,1]: 105 | print(x, y, feed_forward(xor_network, [x, y])) 106 | -------------------------------------------------------------------------------- /helpers/linear_algebra.py: -------------------------------------------------------------------------------- 1 | import math 2 | from functools import reduce 3 | 4 | 5 | # 6 | # functions for working with vectors 7 | # 8 | 9 | 10 | def vector_add(v, w): 11 | """adds two vectors componentwise""" 12 | return [v_i + w_i for v_i, w_i in zip(v, w)] 13 | 14 | 15 | def vector_subtract(v, w): 16 | """subtracts two vectors componentwise""" 17 | return [v_i - w_i for v_i, w_i in zip(v, w)] 18 | 19 | 20 | def vector_sum(vectors): 21 | return reduce(vector_add, vectors) 22 | 23 | 24 | def scalar_multiply(c, v): 25 | return [c * v_i for v_i in v] 26 | 27 | 28 | def vector_mean(vectors): 29 | """compute the vector whose i-th element is the mean of the 30 | i-th elements of the input vectors""" 31 | n = len(vectors) 32 | return scalar_multiply(1 / n, vector_sum(vectors)) 33 | 34 | 35 | def dot(v, w): 36 | """v_1 * w_1 + ... + v_n * w_n""" 37 | return sum(v_i * w_i for v_i, w_i in zip(v, w)) 38 | 39 | 40 | def sum_of_squares(v): 41 | """v_1 * v_1 + ... 
+ v_n * v_n""" 42 | return dot(v, v) 43 | 44 | 45 | def magnitude(v): 46 | return math.sqrt(sum_of_squares(v)) 47 | 48 | 49 | def squared_distance(v, w): 50 | return sum_of_squares(vector_subtract(v, w)) 51 | 52 | 53 | def distance(v, w): 54 | return math.sqrt(squared_distance(v, w)) 55 | 56 | 57 | # 58 | # functions for working with matrices 59 | # 60 | 61 | 62 | def shape(A): 63 | num_rows = len(A) 64 | num_cols = len(A[0]) if A else 0 65 | return num_rows, num_cols 66 | 67 | 68 | def get_row(A, i): 69 | return A[i] 70 | 71 | 72 | def get_column(A, j): 73 | return [A_i[j] for A_i in A] 74 | 75 | 76 | def make_matrix(num_rows, num_cols, entry_fn): 77 | """returns a num_rows x num_cols matrix 78 | whose (i,j)-th entry is entry_fn(i, j)""" 79 | return [[entry_fn(i, j) for j in range(num_cols)] 80 | for i in range(num_rows)] 81 | 82 | 83 | def is_diagonal(i, j): 84 | """1's on the 'diagonal', 0's everywhere else""" 85 | return 1 if i == j else 0 86 | 87 | 88 | identity_matrix = make_matrix(5, 5, is_diagonal) 89 | 90 | # user 0 1 2 3 4 5 6 7 8 9 91 | # 92 | friendships = [[0, 1, 1, 0, 0, 0, 0, 0, 0, 0], # user 0 93 | [1, 0, 1, 1, 0, 0, 0, 0, 0, 0], # user 1 94 | [1, 1, 0, 1, 0, 0, 0, 0, 0, 0], # user 2 95 | [0, 1, 1, 0, 1, 0, 0, 0, 0, 0], # user 3 96 | [0, 0, 0, 1, 0, 1, 0, 0, 0, 0], # user 4 97 | [0, 0, 0, 0, 1, 0, 1, 1, 0, 0], # user 5 98 | [0, 0, 0, 0, 0, 1, 0, 0, 1, 0], # user 6 99 | [0, 0, 0, 0, 0, 1, 0, 0, 1, 0], # user 7 100 | [0, 0, 0, 0, 0, 0, 1, 1, 0, 1], # user 8 101 | [0, 0, 0, 0, 0, 0, 0, 0, 1, 0]] # user 9 102 | 103 | 104 | def matrix_add(A, B): 105 | if shape(A) != shape(B): 106 | raise ArithmeticError("cannot add matrices with different shapes") 107 | 108 | num_rows, num_cols = shape(A) 109 | 110 | def entry_fn(i, j): return A[i][j] + B[i][j] 111 | 112 | return make_matrix(num_rows, num_cols, entry_fn) 113 | 114 | 115 | def make_graph_dot_product_as_vector_projection(plt): 116 | v = [2, 1] 117 | w = [math.sqrt(.25), math.sqrt(.75)] 118 | c = dot(v, w) 119 | vonw = scalar_multiply(c, w) 120 | o = [0, 0] 121 | 122 | plt.arrow(0, 0, v[0], v[1], 123 | width=0.002, head_width=.1, length_includes_head=True) 124 | plt.annotate("v", v, xytext=[v[0] + 0.1, v[1]]) 125 | plt.arrow(0, 0, w[0], w[1], 126 | width=0.002, head_width=.1, length_includes_head=True) 127 | plt.annotate("w", w, xytext=[w[0] - 0.1, w[1]]) 128 | plt.arrow(0, 0, vonw[0], vonw[1], length_includes_head=True) 129 | plt.annotate(u"(v?w)w", vonw, xytext=[vonw[0] - 0.1, vonw[1] + 0.1]) 130 | plt.arrow(v[0], v[1], vonw[0] - v[0], vonw[1] - v[1], 131 | linestyle='dotted', length_includes_head=True) 132 | plt.scatter(*zip(v, w, o), marker='.') 133 | plt.axis('equal') 134 | plt.show() 135 | -------------------------------------------------------------------------------- /decision_trees/utils.py: -------------------------------------------------------------------------------- 1 | import math 2 | from collections import Counter, defaultdict 3 | from functools import partial 4 | 5 | 6 | def entropy(class_probabilities): 7 | """given a list of class probabilities, compute the entropy""" 8 | return sum(-p * math.log(p, 2) for p in class_probabilities if p) 9 | 10 | 11 | def class_probabilities(labels): 12 | total_count = len(labels) 13 | return [count / total_count 14 | for count in Counter(labels).values()] 15 | 16 | 17 | def data_entropy(labeled_data): 18 | labels = [label for _, label in labeled_data] 19 | probabilities = class_probabilities(labels) 20 | return entropy(probabilities) 21 | 22 | 23 | def 
partition_entropy(subsets): 24 | """find the entropy from this partition of data into subsets""" 25 | total_count = sum(len(subset) for subset in subsets) 26 | 27 | return sum(data_entropy(subset) * len(subset) / total_count 28 | for subset in subsets) 29 | 30 | 31 | def group_by(items, key_fn): 32 | """returns a defaultdict(list), where each input item 33 | is in the list whose key is key_fn(item)""" 34 | groups = defaultdict(list) 35 | for item in items: 36 | key = key_fn(item) 37 | groups[key].append(item) 38 | return groups 39 | 40 | 41 | def partition_by(inputs, attribute): 42 | """returns a dict of inputs partitioned by the attribute 43 | each input is a pair (attribute_dict, label)""" 44 | return group_by(inputs, lambda x: x[0][attribute]) 45 | 46 | 47 | def partition_entropy_by(inputs, attribute): 48 | """computes the entropy corresponding to the given partition""" 49 | partitions = partition_by(inputs, attribute) 50 | return partition_entropy(partitions.values()) 51 | 52 | 53 | def classify(tree, input): 54 | """classify the input using the given decision tree""" 55 | 56 | # if this is a leaf node, return its value 57 | if tree in [True, False]: 58 | return tree 59 | 60 | # otherwise find the correct subtree 61 | attribute, subtree_dict = tree 62 | 63 | subtree_key = input.get(attribute) # None if input is missing attribute 64 | 65 | if subtree_key not in subtree_dict: # if no subtree for key, 66 | subtree_key = None # we'll use the None subtree 67 | 68 | subtree = subtree_dict[subtree_key] # choose the appropriate subtree 69 | return classify(subtree, input) # and use it to classify the input 70 | 71 | 72 | def build_tree_id3(inputs, split_candidates=None): 73 | # if this is our first pass, 74 | # all keys of the first input are split candidates 75 | if split_candidates is None: 76 | split_candidates = inputs[0][0].keys() 77 | 78 | # count Trues and Falses in the inputs 79 | num_inputs = len(inputs) 80 | num_trues = len([label for item, label in inputs if label]) 81 | num_falses = num_inputs - num_trues 82 | 83 | if num_trues == 0: # if only Falses are left 84 | return False # return a "False" leaf 85 | 86 | if num_falses == 0: # if only Trues are left 87 | return True # return a "True" leaf 88 | 89 | if not split_candidates: # if no split candidates left 90 | return num_trues >= num_falses # return the majority leaf 91 | 92 | # otherwise, split on the best attribute 93 | best_attribute = min(split_candidates, 94 | key=partial(partition_entropy_by, inputs)) 95 | 96 | partitions = partition_by(inputs, best_attribute) 97 | new_candidates = [a for a in split_candidates 98 | if a != best_attribute] 99 | 100 | # recursively build the subtrees 101 | subtrees = {attribute: build_tree_id3(subset, new_candidates) 102 | for attribute, subset in partitions.items()} 103 | 104 | subtrees[None] = num_trues > num_falses # default case 105 | 106 | return best_attribute, subtrees 107 | 108 | 109 | def forest_classify(trees, input): 110 | votes = [classify(tree, input) for tree in trees] 111 | vote_counts = Counter(votes) 112 | return vote_counts.most_common(1)[0][0] 113 | -------------------------------------------------------------------------------- /Understanding Vanishing Gradient.md: -------------------------------------------------------------------------------- 1 | # Understanding Vanishing Gradients in Neural Networks 2 | 3 | #### Credits: Thanks to [Chi-Feng Wang](https://towardsdatascience.com/@reina.wang) for writing this 
[article](https://towardsdatascience.com/the-vanishing-gradient-problem-69bf08b15484) 4 | 5 | ![Vanishing Gradient](https://i.stack.imgur.com/YUlyb.jpg) 6 | 7 | ### TL;DR 8 | The gradient used in backprop is calculated using the derivative chain rule, meaning it is a product of about as many factors as there are layers (in a vanilla feedforward net).
9 | If all those factors lie between 0 and 1 (e.g. due to the choice of 'squishing' activation functions), and some are very small (typical in the earlier layers and when activations are saturated), then the overall product (the gradient) becomes very small, near zero.
10 | The risk of this happening grows with the number of factors (the number of layers).
11 | The problem is that this may happen for a weight configuration that is nowhere near optimal, yet training will slow down or stop. 12 | 13 | ### Introduction 14 | 15 | We all know that neural networks learn through repeated cycles of a forward pass and a backward pass.
16 | This cycle goes on until we find an optimal value for the cost function that we are trying to minimize.
17 | The optimization happens with the help of gradient descent.
18 | 19 | ### What are gradients ? 20 | A gradient is the derivative of a function. It determines how much the output changes when the input to the function is changed by a very small amount.
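To make this concrete, here is a minimal sketch in plain Python: estimate a derivative with a finite difference, in the same spirit as the `difference_quotient` helper in `helpers/gradient_descent.py` elsewhere in this repository.

```python
def difference_quotient(f, x, h):
    """approximate f'(x) by the slope over a small interval h"""
    return (f(x + h) - f(x)) / h


def square(x):
    return x * x


# the true derivative of x**2 at x = 3 is 6;
# the estimate approaches it as h shrinks
print(difference_quotient(square, 3, h=0.1))      # 6.1000...
print(difference_quotient(square, 3, h=0.00001))  # ~6.00001
```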
21 | 22 | Gradients of neural networks are found using backpropagation (the backward pass mentioned above).
23 | 1. Backpropagation finds the derivatives of the network by moving layer by layer from the final layer to the initial one.
24 | 2. By the chain rule, the derivatives of each layer are multiplied down the network (from the final layer to the initial) to compute the derivatives of the initial layers. 25 | 26 | ### Why does it happen ? 27 | 28 | A very commonly used activation function is the sigmoid function. 29 | 30 | The sigmoid function squashes the input value into a range of 0 to 1.
31 | Hence, even a large change in the input produces only a small change in the sigmoid's output, which means the derivative of this function is very small.
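A quick numerical sketch makes the point (reusing the `sigmoid` helper defined in `neural_network/utils.py` in this repository): the derivative of the sigmoid is `sigmoid(x) * (1 - sigmoid(x))`, which peaks at 0.25 and collapses toward zero for large positive or negative inputs.

```python
import math


def sigmoid(t):
    return 1 / (1 + math.exp(-t))


def sigmoid_derivative(t):
    s = sigmoid(t)
    return s * (1 - s)


for x in [0, 2, 5, 10]:
    print(x, sigmoid_derivative(x))
# 0 -> 0.25, 2 -> ~0.105, 5 -> ~0.0066, 10 -> ~0.000045
```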
32 | 33 | The graph below shows the same picture: for very large or very small values of x, the derivative of the sigmoid is very small (close to zero). 34 | 35 | ![Sigmoid Function and its Derivative](https://miro.medium.com/max/1000/1*6A3A_rt4YmumHusvTvVTxw.png) 36 | 37 | ### How does it impact ? 38 | 39 | As explained above, we multiply gradients with each other in the backward pass using the chain rule.
40 | So when we multiply a lot of small (near-zero) numbers together, the gradient value decreases very sharply.
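To see how quickly this compounds, here is a small illustrative sketch; it ignores the weight terms and simply multiplies the best-case sigmoid-derivative factor (0.25) once per layer, so the numbers are an optimistic upper bound rather than a real network's gradient.

```python
# each layer contributes at most a factor of 0.25 (the peak of the
# sigmoid derivative) to the chain-rule product for the first layer
best_case_factor = 0.25

for num_layers in [2, 5, 10, 20]:
    print(num_layers, best_case_factor ** num_layers)
# 2 -> 0.0625, 5 -> ~0.00098, 10 -> ~9.5e-07, 20 -> ~9.1e-13
```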
41 | 42 | A small gradient means that the weights and biases of the initial layers will not be updated effectively during training. 43 | 44 | **Since these initial layers are often crucial to recognizing the core elements of the input data, this can lead to inaccuracy of the whole network.** 45 | 46 | ### Solutions to the vanishing gradients 47 | 48 | 1. We can use another activation function like `ReLU`, whose derivative is 1 for positive inputs: 49 | `ReLU(x) = max(x, 0)` 50 | 51 | 2. Using residual networks is also an effective solution, where we add the input value X to the output of a later layer before applying the activation.
52 | This way the overall derivative is not reduced to a small value. Refer to the diagram below. 53 | 54 | ![A Residual Block](https://miro.medium.com/max/385/1*mxJ5gBvZnYPVo0ISZE5XkA.png) 55 | 56 | 3. Batch normalization is also an effective solution. We normalize the input value x so that it does not take extremely large or small values, and hence the derivative is not very small.
57 | We limit the input function to a small range and hence the output from the sigmoid also remains normal. We can see the same behavior that the green region does not have very small derivatives. Refer the diagram below 58 | 59 | ![Sigmoig function with limited values](https://miro.medium.com/max/700/1*XCtAytGsbhRQnu-x7Ynr0Q.png) 60 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | When contributing to this repository, please first discuss the change you wish to make via issue, 4 | email, or any other method with the owners of this repository before making a change. 5 | 6 | Please note we have a code of conduct, please follow it in all your interactions with the project. 7 | 8 | ## Pull Request Process 9 | 10 | 1. Ensure any install or build dependencies are removed before the end of the layer when doing a 11 | build. 12 | 2. Update the README.md with details of changes to the interface, this includes new environment 13 | variables, exposed ports, useful file locations and container parameters. 14 | 3. Increase the version numbers in any examples files and the README.md to the new version that this 15 | Pull Request would represent. The versioning scheme we use is [SemVer](http://semver.org/). 16 | 4. You may merge the Pull Request in once you have the sign-off of two other developers, or if you 17 | do not have permission to do that, you may request the second reviewer to merge it for you. 18 | 19 | ## Code of Conduct 20 | 21 | ### Our Pledge 22 | 23 | In the interest of fostering an open and welcoming environment, we as 24 | contributors and maintainers pledge to making participation in our project and 25 | our community a harassment-free experience for everyone, regardless of age, body 26 | size, disability, ethnicity, gender identity and expression, level of experience, 27 | nationality, personal appearance, race, religion, or sexual identity and 28 | orientation. 29 | 30 | ### Our Standards 31 | 32 | Examples of behavior that contributes to creating a positive environment 33 | include: 34 | 35 | * Using welcoming and inclusive language 36 | * Being respectful of differing viewpoints and experiences 37 | * Gracefully accepting constructive criticism 38 | * Focusing on what is best for the community 39 | * Showing empathy towards other community members 40 | 41 | Examples of unacceptable behavior by participants include: 42 | 43 | * The use of sexualized language or imagery and unwelcome sexual attention or 44 | advances 45 | * Trolling, insulting/derogatory comments, and personal or political attacks 46 | * Public or private harassment 47 | * Publishing others' private information, such as a physical or electronic 48 | address, without explicit permission 49 | * Other conduct which could reasonably be considered inappropriate in a 50 | professional setting 51 | 52 | ### Our Responsibilities 53 | 54 | Project maintainers are responsible for clarifying the standards of acceptable 55 | behavior and are expected to take appropriate and fair corrective action in 56 | response to any instances of unacceptable behavior. 
57 | 58 | Project maintainers have the right and responsibility to remove, edit, or 59 | reject comments, commits, code, wiki edits, issues, and other contributions 60 | that are not aligned to this Code of Conduct, or to ban temporarily or 61 | permanently any contributor for other behaviors that they deem inappropriate, 62 | threatening, offensive, or harmful. 63 | 64 | ### Scope 65 | 66 | This Code of Conduct applies both within project spaces and in public spaces 67 | when an individual is representing the project or its community. Examples of 68 | representing a project or community include using an official project e-mail 69 | address, posting via an official social media account, or acting as an appointed 70 | representative at an online or offline event. Representation of a project may be 71 | further defined and clarified by project maintainers. 72 | 73 | ### Enforcement 74 | 75 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 76 | reported by contacting the project team at [INSERT EMAIL ADDRESS]. All 77 | complaints will be reviewed and investigated and will result in a response that 78 | is deemed necessary and appropriate to the circumstances. The project team is 79 | obligated to maintain confidentiality with regard to the reporter of an incident. 80 | Further details of specific enforcement policies may be posted separately. 81 | 82 | Project maintainers who do not follow or enforce the Code of Conduct in good 83 | faith may face temporary or permanent repercussions as determined by other 84 | members of the project's leadership. 85 | 86 | ### Attribution 87 | 88 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 89 | available at [http://contributor-covenant.org/version/1/4][version] 90 | 91 | [homepage]: http://contributor-covenant.org 92 | [version]: http://contributor-covenant.org/version/1/4/ 93 | -------------------------------------------------------------------------------- /helpers/probabilty.py: -------------------------------------------------------------------------------- 1 | from collections import Counter 2 | import math 3 | import random 4 | from matplotlib import pyplot as plt 5 | 6 | 7 | def random_kid(): 8 | return random.choice(["boy", "girl"]) 9 | 10 | 11 | def uniform_pdf(x): 12 | return 1 if 0 <= x < 1 else 0 13 | 14 | 15 | def uniform_cdf(x): 16 | """returns the probability that a uniform random variable is less than x""" 17 | if x < 0: 18 | return 0 # uniform random is never less than 0 19 | elif x < 1: 20 | return x # e.g. 
P(X < 0.4) = 0.4 21 | else: 22 | return 1 # uniform random is always less than 1 23 | 24 | 25 | def normal_pdf(x, mu=0, sigma=1.0): 26 | sqrt_two_pi = math.sqrt(2 * math.pi) 27 | return math.exp(-(x - mu) ** 2 / 2 / sigma ** 2) / (sqrt_two_pi * sigma) 28 | 29 | 30 | def plot_normal_pdfs(plt): 31 | xs = [x / 10.0 for x in range(-50, 50)] 32 | plt.plot(xs, [normal_pdf(x, sigma=1) for x in xs], '-', label='mu=0,sigma=1') 33 | plt.plot(xs, [normal_pdf(x, sigma=2) for x in xs], '--', label='mu=0,sigma=2') 34 | plt.plot(xs, [normal_pdf(x, sigma=0.5) for x in xs], ':', label='mu=0,sigma=0.5') 35 | plt.plot(xs, [normal_pdf(x, mu=-1) for x in xs], '-.', label='mu=-1,sigma=1') 36 | plt.legend() 37 | plt.show() 38 | 39 | 40 | def normal_cdf(x, mu=0, sigma=1.0): 41 | return (1 + math.erf((x - mu) / math.sqrt(2) / sigma)) / 2 42 | 43 | 44 | def plot_normal_cdfs(plt): 45 | xs = [x / 10.0 for x in range(-50, 50)] 46 | plt.plot(xs, [normal_cdf(x, sigma=1) for x in xs], '-', label='mu=0,sigma=1') 47 | plt.plot(xs, [normal_cdf(x, sigma=2) for x in xs], '--', label='mu=0,sigma=2') 48 | plt.plot(xs, [normal_cdf(x, sigma=0.5) for x in xs], ':', label='mu=0,sigma=0.5') 49 | plt.plot(xs, [normal_cdf(x, mu=-1) for x in xs], '-.', label='mu=-1,sigma=1') 50 | plt.legend(loc=4) # bottom right 51 | plt.show() 52 | 53 | 54 | def inverse_normal_cdf(p, mu=0, sigma=1, tolerance=0.00001): 55 | """find approximate inverse using binary search""" 56 | 57 | # if not standard, compute standard and rescale 58 | if mu != 0 or sigma != 1: 59 | return mu + sigma * inverse_normal_cdf(p, tolerance=tolerance) 60 | 61 | low_z, low_p = -10.0, 0 # normal_cdf(-10) is (very close to) 0 62 | hi_z, hi_p = 10.0, 1 # normal_cdf(10) is (very close to) 1 63 | mid_z = None 64 | while hi_z - low_z > tolerance: 65 | mid_z = (low_z + hi_z) / 2 # consider the midpoint 66 | mid_p = normal_cdf(mid_z) # and the cdf's value there 67 | if mid_p < p: 68 | # midpoint is still too low, search above it 69 | low_z, low_p = mid_z, mid_p 70 | elif mid_p > p: 71 | # midpoint is still too high, search below it 72 | hi_z, hi_p = mid_z, mid_p 73 | else: 74 | break 75 | 76 | return mid_z 77 | 78 | 79 | def bernoulli_trial(p): 80 | return 1 if random.random() < p else 0 81 | 82 | 83 | def binomial(p, n): 84 | return sum(bernoulli_trial(p) for _ in range(n)) 85 | 86 | 87 | def make_hist(p, n, num_points): 88 | data = [binomial(p, n) for _ in range(num_points)] 89 | 90 | # use a bar chart to show the actual binomial samples 91 | histogram = Counter(data) 92 | plt.bar([x - 0.4 for x in histogram.keys()], 93 | [v / num_points for v in histogram.values()], 94 | 0.8, 95 | color='0.75') 96 | 97 | mu = p * n 98 | sigma = math.sqrt(n * p * (1 - p)) 99 | 100 | # use a line chart to show the normal approximation 101 | xs = range(min(data), max(data) + 1) 102 | ys = [normal_cdf(i + 0.5, mu, sigma) - normal_cdf(i - 0.5, mu, sigma) 103 | for i in xs] 104 | plt.plot(xs, ys) 105 | plt.show() 106 | 107 | 108 | if __name__ == "__main__": 109 | 110 | # 111 | # CONDITIONAL PROBABILITY 112 | # 113 | 114 | both_girls = 0 115 | older_girl = 0 116 | either_girl = 0 117 | 118 | random.seed(0) 119 | for _ in range(10000): 120 | younger = random_kid() 121 | older = random_kid() 122 | if older == "girl": 123 | older_girl += 1 124 | if older == "girl" and younger == "girl": 125 | both_girls += 1 126 | if older == "girl" or younger == "girl": 127 | either_girl += 1 128 | 129 | print("P(both | older):", both_girls / older_girl) # 0.514 ~ 1/2 130 | print("P(both | either): ", both_girls / 
either_girl) # 0.342 ~ 1/3 131 | -------------------------------------------------------------------------------- /working_with_data/data.py: -------------------------------------------------------------------------------- 1 | # 2 | # DIMENSIONALITY REDUCTION 3 | # 4 | 5 | X = [ 6 | [20.9666776351559,-13.1138080189357], 7 | [22.7719907680008,-19.8890894944696], 8 | [25.6687103160153,-11.9956004517219], 9 | [18.0019794950564,-18.1989191165133], 10 | [21.3967402102156,-10.8893126308196], 11 | [0.443696899177716,-19.7221132386308], 12 | [29.9198322142127,-14.0958668502427], 13 | [19.0805843080126,-13.7888747608312], 14 | [16.4685063521314,-11.2612927034291], 15 | [21.4597664701884,-12.4740034586705], 16 | [3.87655283720532,-17.575162461771], 17 | [34.5713920556787,-10.705185165378], 18 | [13.3732115747722,-16.7270274494424], 19 | [20.7281704141919,-8.81165591556553], 20 | [24.839851437942,-12.1240962157419], 21 | [20.3019544741252,-12.8725060780898], 22 | [21.9021426929599,-17.3225432396452], 23 | [23.2285885715486,-12.2676568419045], 24 | [28.5749111681851,-13.2616470619453], 25 | [29.2957424128701,-14.6299928678996], 26 | [15.2495527798625,-18.4649714274207], 27 | [26.5567257400476,-9.19794350561966], 28 | [30.1934232346361,-12.6272709845971], 29 | [36.8267446011057,-7.25409849336718], 30 | [32.157416823084,-10.4729534347553], 31 | [5.85964365291694,-22.6573731626132], 32 | [25.7426190674693,-14.8055803854566], 33 | [16.237602636139,-16.5920595763719], 34 | [14.7408608850568,-20.0537715298403], 35 | [6.85907008242544,-18.3965586884781], 36 | [26.5918329233128,-8.92664811750842], 37 | [-11.2216019958228,-27.0519081982856], 38 | [8.93593745011035,-20.8261235122575], 39 | [24.4481258671796,-18.0324012215159], 40 | [2.82048515404903,-22.4208457598703], 41 | [30.8803004755948,-11.455358009593], 42 | [15.4586738236098,-11.1242825084309], 43 | [28.5332537090494,-14.7898744423126], 44 | [40.4830293441052,-2.41946428697183], 45 | [15.7563759125684,-13.5771266003795], 46 | [19.3635588851727,-20.6224770470434], 47 | [13.4212840786467,-19.0238227375766], 48 | [7.77570680426702,-16.6385739839089], 49 | [21.4865983854408,-15.290799330002], 50 | [12.6392705930724,-23.6433305964301], 51 | [12.4746151388128,-17.9720169566614], 52 | [23.4572410437998,-14.602080545086], 53 | [13.6878189833565,-18.9687408182414], 54 | [15.4077465943441,-14.5352487124086], 55 | [20.3356581548895,-10.0883159703702], 56 | [20.7093833689359,-12.6939091236766], 57 | [11.1032293684441,-14.1383848928755], 58 | [17.5048321498308,-9.2338593361801], 59 | [16.3303688220188,-15.1054735529158], 60 | [26.6929062710726,-13.306030567991], 61 | [34.4985678099711,-9.86199941278607], 62 | [39.1374291499406,-10.5621430853401], 63 | [21.9088956482146,-9.95198845621849], 64 | [22.2367457578087,-17.2200123442707], 65 | [10.0032784145577,-19.3557700653426], 66 | [14.045833906665,-15.871937521131], 67 | [15.5640911917607,-18.3396956121887], 68 | [24.4771926581586,-14.8715313479137], 69 | [26.533415556629,-14.693883922494], 70 | [12.8722580202544,-21.2750596021509], 71 | [24.4768291376862,-15.9592080959207], 72 | [18.2230748567433,-14.6541444069985], 73 | [4.1902148367447,-20.6144032528762], 74 | [12.4332594022086,-16.6079789231489], 75 | [20.5483758651873,-18.8512560786321], 76 | [17.8180560451358,-12.5451990696752], 77 | [11.0071081078049,-20.3938092335862], 78 | [8.30560561422449,-22.9503944138682], 79 | [33.9857852657284,-4.8371294974382], 80 | [17.4376502239652,-14.5095976075022], 81 | [29.0379635148943,-14.8461553663227], 82 | 
[29.1344666599319,-7.70862921632672], 83 | [32.9730697624544,-15.5839178785654], 84 | [13.4211493998212,-20.150199857584], 85 | [11.380538260355,-12.8619410359766], 86 | [28.672631499186,-8.51866271785711], 87 | [16.4296061111902,-23.3326051279759], 88 | [25.7168371582585,-13.8899296143829], 89 | [13.3185154732595,-17.8959160024249], 90 | [3.60832478605376,-25.4023343597712], 91 | [39.5445949652652,-11.466377647931], 92 | [25.1693484426101,-12.2752652925707], 93 | [25.2884257196471,-7.06710309184533], 94 | [6.77665715793125,-22.3947299635571], 95 | [20.1844223778907,-16.0427471125407], 96 | [25.5506805272535,-9.33856532270204], 97 | [25.1495682602477,-7.17350567090738], 98 | [15.6978431006492,-17.5979197162642], 99 | [37.42780451491,-10.843637288504], 100 | [22.974620174842,-10.6171162611686], 101 | [34.6327117468934,-9.26182440487384], 102 | [34.7042513789061,-6.9630753351114], 103 | [15.6563953929008,-17.2196961218915], 104 | [25.2049825789225,-14.1592086208169] 105 | ] -------------------------------------------------------------------------------- /multiple_regression/data.py: -------------------------------------------------------------------------------- 1 | x = [[1, 49, 4, 0], [1, 41, 9, 0], [1, 40, 8, 0], [1, 25, 6, 0], [1, 21, 1, 0], [1, 21, 0, 0], [1, 19, 3, 0], 2 | [1, 19, 0, 0], [1, 18, 9, 0], [1, 18, 8, 0], [1, 16, 4, 0], [1, 15, 3, 0], [1, 15, 0, 0], [1, 15, 2, 0], 3 | [1, 15, 7, 0], [1, 14, 0, 0], [1, 14, 1, 0], [1, 13, 1, 0], [1, 13, 7, 0], [1, 13, 4, 0], [1, 13, 2, 0], 4 | [1, 12, 5, 0], [1, 12, 0, 0], [1, 11, 9, 0], [1, 10, 9, 0], [1, 10, 1, 0], [1, 10, 1, 0], [1, 10, 7, 0], 5 | [1, 10, 9, 0], [1, 10, 1, 0], [1, 10, 6, 0], [1, 10, 6, 0], [1, 10, 8, 0], [1, 10, 10, 0], [1, 10, 6, 0], 6 | [1, 10, 0, 0], [1, 10, 5, 0], [1, 10, 3, 0], [1, 10, 4, 0], [1, 9, 9, 0], [1, 9, 9, 0], [1, 9, 0, 0], [1, 9, 0, 0], 7 | [1, 9, 6, 0], [1, 9, 10, 0], [1, 9, 8, 0], [1, 9, 5, 0], [1, 9, 2, 0], [1, 9, 9, 0], [1, 9, 10, 0], [1, 9, 7, 0], 8 | [1, 9, 2, 0], [1, 9, 0, 0], [1, 9, 4, 0], [1, 9, 6, 0], [1, 9, 4, 0], [1, 9, 7, 0], [1, 8, 3, 0], [1, 8, 2, 0], 9 | [1, 8, 4, 0], [1, 8, 9, 0], [1, 8, 2, 0], [1, 8, 3, 0], [1, 8, 5, 0], [1, 8, 8, 0], [1, 8, 0, 0], [1, 8, 9, 0], 10 | [1, 8, 10, 0], [1, 8, 5, 0], [1, 8, 5, 0], [1, 7, 5, 0], [1, 7, 5, 0], [1, 7, 0, 0], [1, 7, 2, 0], [1, 7, 8, 0], 11 | [1, 7, 10, 0], [1, 7, 5, 0], [1, 7, 3, 0], [1, 7, 3, 0], [1, 7, 6, 0], [1, 7, 7, 0], [1, 7, 7, 0], [1, 7, 9, 0], 12 | [1, 7, 3, 0], [1, 7, 8, 0], [1, 6, 4, 0], [1, 6, 6, 0], [1, 6, 4, 0], [1, 6, 9, 0], [1, 6, 0, 0], [1, 6, 1, 0], 13 | [1, 6, 4, 0], [1, 6, 1, 0], [1, 6, 0, 0], [1, 6, 7, 0], [1, 6, 0, 0], [1, 6, 8, 0], [1, 6, 4, 0], [1, 6, 2, 1], 14 | [1, 6, 1, 1], [1, 6, 3, 1], [1, 6, 6, 1], [1, 6, 4, 1], [1, 6, 4, 1], [1, 6, 1, 1], [1, 6, 3, 1], [1, 6, 4, 1], 15 | [1, 5, 1, 1], [1, 5, 9, 1], [1, 5, 4, 1], [1, 5, 6, 1], [1, 5, 4, 1], [1, 5, 4, 1], [1, 5, 10, 1], [1, 5, 5, 1], 16 | [1, 5, 2, 1], [1, 5, 4, 1], [1, 5, 4, 1], [1, 5, 9, 1], [1, 5, 3, 1], [1, 5, 10, 1], [1, 5, 2, 1], [1, 5, 2, 1], 17 | [1, 5, 9, 1], [1, 4, 8, 1], [1, 4, 6, 1], [1, 4, 0, 1], [1, 4, 10, 1], [1, 4, 5, 1], [1, 4, 10, 1], [1, 4, 9, 1], 18 | [1, 4, 1, 1], [1, 4, 4, 1], [1, 4, 4, 1], [1, 4, 0, 1], [1, 4, 3, 1], [1, 4, 1, 1], [1, 4, 3, 1], [1, 4, 2, 1], 19 | [1, 4, 4, 1], [1, 4, 4, 1], [1, 4, 8, 1], [1, 4, 2, 1], [1, 4, 4, 1], [1, 3, 2, 1], [1, 3, 6, 1], [1, 3, 4, 1], 20 | [1, 3, 7, 1], [1, 3, 4, 1], [1, 3, 1, 1], [1, 3, 10, 1], [1, 3, 3, 1], [1, 3, 4, 1], [1, 3, 7, 1], [1, 3, 5, 1], 21 | [1, 3, 6, 1], [1, 3, 1, 1], [1, 3, 6, 1], [1, 3, 10, 
1], [1, 3, 2, 1], [1, 3, 4, 1], [1, 3, 2, 1], [1, 3, 1, 1], 22 | [1, 3, 5, 1], [1, 2, 4, 1], [1, 2, 2, 1], [1, 2, 8, 1], [1, 2, 3, 1], [1, 2, 1, 1], [1, 2, 9, 1], [1, 2, 10, 1], 23 | [1, 2, 9, 1], [1, 2, 4, 1], [1, 2, 5, 1], [1, 2, 0, 1], [1, 2, 9, 1], [1, 2, 9, 1], [1, 2, 0, 1], [1, 2, 1, 1], 24 | [1, 2, 1, 1], [1, 2, 4, 1], [1, 1, 0, 1], [1, 1, 2, 1], [1, 1, 2, 1], [1, 1, 5, 1], [1, 1, 3, 1], [1, 1, 10, 1], 25 | [1, 1, 6, 1], [1, 1, 0, 1], [1, 1, 8, 1], [1, 1, 6, 1], [1, 1, 4, 1], [1, 1, 9, 1], [1, 1, 9, 1], [1, 1, 4, 1], 26 | [1, 1, 2, 1], [1, 1, 9, 1], [1, 1, 0, 1], [1, 1, 8, 1], [1, 1, 6, 1], [1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 5, 1]] 27 | 28 | daily_minutes_good = [68.77, 51.25, 52.08, 38.36, 44.54, 57.13, 51.4, 41.42, 31.22, 34.76, 54.01, 38.79, 47.59, 49.1, 29 | 27.66, 41.03, 36.73, 48.65, 28.12, 46.62, 35.57, 32.98, 35, 26.07, 23.77, 39.73, 40.57, 31.65, 30 | 31.21, 36.32, 20.45, 21.93, 26.02, 27.34, 23.49, 46.94, 30.5, 33.8, 24.23, 21.4, 27.94, 32.24, 31 | 40.57, 25.07, 19.42, 22.39, 18.42, 46.96, 23.72, 26.41, 26.97, 36.76, 40.32, 35.02, 29.47, 30.2, 32 | 31, 38.11, 38.18, 36.31, 21.03, 30.86, 36.07, 28.66, 29.08, 37.28, 15.28, 24.17, 22.31, 30.17, 33 | 25.53, 19.85, 35.37, 44.6, 17.23, 13.47, 26.33, 35.02, 32.09, 24.81, 19.33, 28.77, 24.26, 31.98, 34 | 25.73, 24.86, 16.28, 34.51, 15.23, 39.72, 40.8, 26.06, 35.76, 34.76, 16.13, 44.04, 18.03, 19.65, 35 | 32.62, 35.59, 39.43, 14.18, 35.24, 40.13, 41.82, 35.45, 36.07, 43.67, 24.61, 20.9, 21.9, 18.79, 36 | 27.61, 27.21, 26.61, 29.77, 20.59, 27.53, 13.82, 33.2, 25, 33.1, 36.65, 18.63, 14.87, 22.2, 36.81, 37 | 25.53, 24.62, 26.25, 18.21, 28.08, 19.42, 29.79, 32.8, 35.99, 28.32, 27.79, 35.88, 29.06, 36.28, 38 | 14.1, 36.63, 37.49, 26.9, 18.58, 38.48, 24.48, 18.95, 33.55, 14.24, 29.04, 32.51, 25.63, 22.22, 39 | 19, 32.73, 15.16, 13.9, 27.2, 32.01, 29.27, 33, 13.74, 20.42, 27.32, 18.23, 35.35, 28.48, 9.08, 40 | 24.62, 20.12, 35.26, 19.92, 31.02, 16.49, 12.16, 30.7, 31.22, 34.65, 13.13, 27.51, 33.2, 31.57, 41 | 14.1, 33.42, 17.44, 10.12, 24.42, 9.82, 23.39, 30.93, 15.03, 21.67, 31.09, 33.29, 22.61, 26.89, 42 | 23.48, 8.38, 27.81, 32.35, 23.84] -------------------------------------------------------------------------------- /k_means_clustering/utils.py: -------------------------------------------------------------------------------- 1 | import random 2 | import matplotlib.image as mpimg 3 | from helpers.linear_algebra import squared_distance, vector_mean, distance 4 | import matplotlib.pyplot as plt 5 | 6 | 7 | class KMeans: 8 | """perfroms k-means clustering""" 9 | 10 | def __init__(self, k): 11 | self.k = k # number of clusters 12 | self.means = None # means of clusters 13 | 14 | def classify(self, input): 15 | """return the index of cluster closest to the input""" 16 | return min(range(self.k), 17 | key=lambda i: squared_distance(input, self.means[i])) 18 | 19 | def train(self, inputs): 20 | """choose k random points as the initial means""" 21 | self.means = random.sample(inputs, self.k) 22 | assignments = None 23 | while True: 24 | 25 | # Find new assignments 26 | new_assignments = list(map(self.classify, inputs)) 27 | # if no assignments have changed, we're done 28 | if assignments == new_assignments: 29 | return 30 | # otherwise keep the new assignments 31 | assignments = new_assignments 32 | 33 | # and compute the new means based on the new assignments 34 | for i in range(self.k): 35 | i_points = [p for p, a in zip(inputs, assignments) if a == i] 36 | 37 | if i_points: 38 | self.means[i] = vector_mean(i_points) 39 | 40 | 41 | def 
squared_clustering_errors(inputs, k): 42 | """finds the total squared error from k-means clustering the inputs""" 43 | clusterer = KMeans(k) 44 | clusterer.train(inputs=inputs) 45 | means = clusterer.means 46 | assignments = list(map(clusterer.classify, inputs)) 47 | 48 | return sum(squared_distance(input, means[cluster]) for input, cluster in zip(inputs, assignments)) 49 | 50 | 51 | """Clustering Colors""" 52 | 53 | 54 | def recolor_image(input_file, k=5): 55 | img = mpimg.imread(input_file) 56 | pixels = [pixel for row in img for pixel in row] 57 | clusterer = KMeans(k) 58 | clusterer.train(pixels) # this might take a while 59 | 60 | def recolor(pixel): 61 | cluster = clusterer.classify(pixel) # index of the closest cluster 62 | return clusterer.means[cluster] # mean of the closest cluster 63 | 64 | new_img = [[recolor(pixel) for pixel in row] for row in img] 65 | plt.imshow(new_img) 66 | plt.axis('off') 67 | plt.show() 68 | 69 | 70 | """Bottom up Hierarchical Clustering""" 71 | 72 | 73 | def is_leaf(cluster): 74 | """a cluster is a leaf if it has length 1""" 75 | return len(cluster) == 1 76 | 77 | 78 | def get_children(cluster): 79 | """returns the two children of this cluster if it's a merged cluster; 80 | raises an Exception if this is a leaf cluster""" 81 | if is_leaf(cluster): 82 | raise TypeError("a leaf cluster has no children") 83 | else: 84 | return cluster[1] 85 | 86 | 87 | def get_values(cluster): 88 | """returns the value in this cluster (if it's a leaf cluster) 89 | or all the values in the leaf clusters below it (if it's not)""" 90 | if is_leaf(cluster): 91 | return cluster # is already a 1-tuple containing value 92 | else: 93 | return [value 94 | for child in get_children(cluster) 95 | for value in get_values(child)] 96 | 97 | 98 | def cluster_distance(cluster1, cluster2, distance_agg=min): 99 | """finds the aggregate distance between elements of 100 | cluster1 and elements of cluster2""" 101 | return distance_agg([distance(input1, input2) 102 | for input1 in get_values(cluster1) 103 | for input2 in get_values(cluster2)]) 104 | 105 | 106 | def get_merge_order(cluster): 107 | if is_leaf(cluster): 108 | return float('inf') 109 | else: 110 | return cluster[0] 111 | 112 | 113 | def bottom_up_cluster(inputs, distance_agg=min): 114 | # start with every input as a leaf cluster (a 1-tuple) 115 | clusters = [(input,) for input in inputs] 116 | 117 | # as long as we have more than one cluster left... 
118 | while len(clusters) > 1: 119 | # find the two closest clusters 120 | c1, c2 = min([(cluster1, cluster2) 121 | for i, cluster1 in enumerate(clusters) 122 | for cluster2 in clusters[:i]], 123 | key=lambda p: cluster_distance(p[0], p[1], distance_agg)) 124 | 125 | # remove them from the list of clusters 126 | clusters = [c for c in clusters if c != c1 and c != c2] 127 | 128 | # merge them, using merge _order = # of cluster left 129 | merged_cluster = (len(clusters), [c1, c2]) 130 | 131 | # add their merge 132 | clusters.append(merged_cluster) 133 | 134 | # when there is only one cluster left, return it 135 | return clusters[0] 136 | 137 | 138 | def generate_clusters(base_cluster, num_clusters): 139 | # start with a list of just a base cluster 140 | clusters = [base_cluster] 141 | 142 | # as long as we don't have enough clusters 143 | while len(clusters) < num_clusters: 144 | # choose the last-merged of our clusters 145 | next_cluster = min(clusters, key=get_merge_order) 146 | # remove it from the list 147 | clusters = [c for c in clusters if c != next_cluster] 148 | # and add its children to the list (i.e. unmerge it) 149 | clusters.extend(get_children(next_cluster)) 150 | 151 | return clusters 152 | -------------------------------------------------------------------------------- /prec_rec_curve.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.metrics import confusion_matrix, precision_score, recall_score 3 | import matplotlib.pyplot as plt 4 | import matplotlib.patches as ptch 5 | 6 | # Appendix A - working with single threshold 7 | pred_scores = [0.7, 0.3, 0.5, 0.6, 0.55, 0.9, 0.4, 0.2, 0.4, 0.3] 8 | y_true = ["positive", "negative", "negative", "positive", "positive", "positive", "negative", "positive", "negative", "positive"] 9 | 10 | # To convert the scores into a class label, a threshold is used. 11 | # When the score is equal to or above the threshold, the sample is classified as one class. 12 | # Otherwise, it is classified as the other class. 13 | # Suppose a sample is Positive if its score is above or equal to the threshold. Otherwise, it is Negative. 14 | # The next block of code converts the scores into class labels with a threshold of 0.5. 15 | 16 | threshold = 0.5 17 | 18 | y_pred = ["positive" if score >= threshold else "negative" for score in pred_scores] 19 | print(y_pred) 20 | 21 | r = np.flip(confusion_matrix(y_true, y_pred)) 22 | print("\n# Confusion Matrix (From Left to Right & Top to Bottom: \nTrue Positive, False Negative, \nFalse Positive, True Negative)") 23 | print(r) 24 | 25 | # Remember that the higher the precision, the more confident the model is when it classifies a sample as Positive. 26 | # Higher the recall, the more positive samples the model correctly classified as Positive. 
27 | 28 | precision = precision_score(y_true=y_true, y_pred=y_pred, pos_label="positive") 29 | print("\n# Precision = 4/(4+1)") 30 | print(precision) 31 | 32 | recall = recall_score(y_true=y_true, y_pred=y_pred, pos_label="positive") 33 | print("\n# Recall = 4/(4+2)") 34 | print(recall) 35 | 36 | # Appendix B - working with multiple thresholds 37 | y_true = ["positive", "negative", "negative", "positive", "positive", "positive", "negative", "positive", "negative", "positive", "positive", "positive", "positive", "negative", "negative", "negative"] 38 | 39 | pred_scores = [0.7, 0.3, 0.5, 0.6, 0.55, 0.9, 0.4, 0.2, 0.4, 0.3, 0.7, 0.5, 0.8, 0.2, 0.3, 0.35] 40 | 41 | thresholds = np.arange(start=0.2, stop=0.7, step=0.05) 42 | 43 | # Due to the importance of both precision and recall, there is a precision-recall curve that shows 44 | # the tradeoff between the precision and recall values for different thresholds. 45 | # This curve helps to select the best threshold to maximize both metrics 46 | 47 | def precision_recall_curve(y_true, pred_scores, thresholds): 48 | precisions = [] 49 | recalls = [] 50 | f1_scores = [] 51 | 52 | for threshold in thresholds: 53 | y_pred = ["positive" if score >= threshold else "negative" for score in pred_scores] 54 | 55 | precision = precision_score(y_true=y_true, y_pred=y_pred, pos_label="positive") 56 | recall = recall_score(y_true=y_true, y_pred=y_pred, pos_label="positive") 57 | f1_score = (2 * precision * recall) / (precision + recall) 58 | 59 | precisions.append(precision) 60 | recalls.append(recall) 61 | f1_scores.append(f1_score) 62 | 63 | return precisions, recalls, f1_scores 64 | 65 | precisions, recalls, f1_scores = precision_recall_curve(y_true=y_true, 66 | pred_scores=pred_scores, 67 | thresholds=thresholds) 68 | 69 | print("\nRecall:: Precision :: F1-Score",) 70 | for p, r, f in zip(precisions, recalls, f1_scores): 71 | print(round(r,4),"\t::\t",round(p,4),"\t::\t",round(f,4)) 72 | 73 | # np.max() returns the max. 
value in the array 74 | # np.argmax() will return the index of the value found by np.max() 75 | 76 | print('Best F1-Score: ', np.max(f1_scores)) 77 | idx_best_f1 = np.argmax(f1_scores) 78 | print('\nBest threshold: ', thresholds[idx_best_f1]) 79 | print('Index of threshold: ', idx_best_f1) 80 | 81 | # Can disable comment to display the plot 82 | 83 | # plt.plot(recalls, precisions, linewidth=4, color="red") 84 | # plt.scatter(recalls[idx_best_f1], precisions[idx_best_f1], zorder=1, linewidth=6) 85 | # plt.xlabel("Recall", fontsize=12, fontweight='bold') 86 | # plt.ylabel("Precision", fontsize=12, fontweight='bold') 87 | # plt.title("Precision-Recall Curve", fontsize=15, fontweight="bold") 88 | # plt.show() 89 | 90 | # Appendix C - average precision (AP) 91 | precisions, recalls, f1_scores = precision_recall_curve(y_true=y_true, 92 | pred_scores=pred_scores, 93 | thresholds=thresholds) 94 | 95 | precisions.append(1) 96 | recalls.append(0) 97 | 98 | precisions = np.array(precisions) 99 | recalls = np.array(recalls) 100 | 101 | print('\nRecall ::',recalls) 102 | print('Precision ::',precisions) 103 | 104 | AP = np.sum((recalls[:-1] - recalls[1:]) * precisions[:-1]) 105 | print("\nAP --", AP) 106 | 107 | # Appendix D - Intersection over Union 108 | 109 | # gt_box -- ground-truth bounding box 110 | # pred_box -- prediction bounding box 111 | def intersection_over_union(gt_box, pred_box): 112 | 113 | inter_box_top_left = [max(gt_box[0], pred_box[0]), max(gt_box[1], pred_box[1])] 114 | 115 | print("\ninter_box_top_left:", inter_box_top_left) 116 | print("gt_box:", gt_box) 117 | print("pred_box:", pred_box) 118 | inter_box_bottom_right = [min(gt_box[0]+gt_box[2], pred_box[0]+pred_box[2]), min(gt_box[1]+gt_box[3], pred_box[1]+pred_box[3])] 119 | print("inter_box_bottom_right:", inter_box_bottom_right) 120 | 121 | inter_box_w = inter_box_bottom_right[0] - inter_box_top_left[0] 122 | print("inter_box_w:", inter_box_w) 123 | inter_box_h = inter_box_bottom_right[1] - inter_box_top_left[1] 124 | print("inter_box_h:", inter_box_h) 125 | 126 | intersection = inter_box_w * inter_box_h 127 | union = gt_box[2] * gt_box[3] + pred_box[2] * pred_box[3] - intersection 128 | 129 | iou = intersection / union 130 | 131 | return iou, intersection, union 132 | 133 | gt_box1 = [320, 220, 680, 900] 134 | pred_box1 = [500, 320, 550, 700] 135 | 136 | gt_box2 = [645, 130, 310, 320] 137 | pred_box2 = [500, 60, 310, 320] 138 | 139 | iou1 = intersection_over_union(gt_box1, pred_box1) 140 | print("\nIOU1 ::", iou1) 141 | 142 | iou2 = intersection_over_union(gt_box2, pred_box2) 143 | print("\nIOU2 ::", iou2) -------------------------------------------------------------------------------- /helpers/gradient_descent.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | from helpers.linear_algebra import distance, vector_subtract, scalar_multiply 4 | 5 | 6 | def sum_of_squares(v): 7 | """computes the sum of squared elements in v""" 8 | return sum(v_i ** 2 for v_i in v) 9 | 10 | 11 | def difference_quotient(f, x, h): 12 | return (f(x + h) - f(x)) / h 13 | 14 | 15 | def plot_estimated_derivative(): 16 | def square(x): 17 | return x * x 18 | 19 | def derivative(x): 20 | return 2 * x 21 | 22 | def derivative_estimate(): 23 | difference_quotient(square, x, h=0.00001) 24 | 25 | # plot to show they're basically the same 26 | import matplotlib.pyplot as plt 27 | x = range(-10, 10) 28 | plt.plot(x, map(derivative, x), 'rx') # red x 29 | plt.plot(x, map(derivative_estimate, x), 'b+') 
# blue + 30 | plt.show() # purple *, hopefully 31 | 32 | 33 | def partial_difference_quotient(f, v, i, h): 34 | # add h to just the i-th element of v 35 | w = [v_j + (h if j == i else 0) 36 | for j, v_j in enumerate(v)] 37 | 38 | return (f(w) - f(v)) / h 39 | 40 | 41 | def estimate_gradient(f, v, h=0.00001): 42 | return [partial_difference_quotient(f, v, i, h) 43 | for i, _ in enumerate(v)] 44 | 45 | 46 | def step(v, direction, step_size): 47 | """move step_size in the direction from v""" 48 | return [v_i + step_size * direction_i 49 | for v_i, direction_i in zip(v, direction)] 50 | 51 | 52 | def sum_of_squares_gradient(v): 53 | return [2 * v_i for v_i in v] 54 | 55 | 56 | def safe(f): 57 | """define a new function that wraps f and return it""" 58 | 59 | def safe_f(*args, **kwargs): 60 | try: 61 | return f(*args, **kwargs) 62 | except: 63 | return float('inf') # this means "infinity" in Python 64 | 65 | return safe_f 66 | 67 | 68 | # 69 | # 70 | # minimize / maximize batch 71 | # 72 | # 73 | 74 | def minimize_batch(target_fn, gradient_fn, theta_0, tolerance=0.000001): 75 | """use gradient descent to find theta that minimizes target function""" 76 | 77 | step_sizes = [100, 10, 1, 0.1, 0.01, 0.001, 0.0001, 0.00001] 78 | 79 | theta = theta_0 # set theta to initial value 80 | target_fn = safe(target_fn) # safe version of target_fn 81 | value = target_fn(theta) # value we're minimizing 82 | 83 | while True: 84 | gradient = gradient_fn(theta) 85 | next_thetas = [step(theta, gradient, -step_size) 86 | for step_size in step_sizes] 87 | 88 | # choose the one that minimizes the error function 89 | next_theta = min(next_thetas, key=target_fn) 90 | next_value = target_fn(next_theta) 91 | 92 | # stop if we're "converging" 93 | if abs(value - next_value) < tolerance: 94 | return theta 95 | else: 96 | theta, value = next_theta, next_value 97 | 98 | 99 | def negate(f): 100 | """return a function that for any input x returns -f(x)""" 101 | return lambda *args, **kwargs: -f(*args, **kwargs) 102 | 103 | 104 | def negate_all(f): 105 | """the same when f returns a list of numbers""" 106 | return lambda *args, **kwargs: [-y for y in f(*args, **kwargs)] 107 | 108 | 109 | def maximize_batch(target_fn, gradient_fn, theta_0, tolerance=0.000001): 110 | return minimize_batch(negate(target_fn), 111 | negate_all(gradient_fn), 112 | theta_0, 113 | tolerance) 114 | 115 | 116 | # 117 | # minimize / maximize stochastic 118 | # 119 | 120 | 121 | def in_random_order(data): 122 | """generator that returns the elements of data in random order""" 123 | indexes = [i for i, _ in enumerate(data)] # create a list of indexes 124 | random.shuffle(indexes) # shuffle them 125 | for i in indexes: # return the data in that order 126 | yield data[i] 127 | 128 | 129 | def minimize_stochastic(target_fn, gradient_fn, x, y, theta_0, alpha_0=0.01): 130 | data = list(zip(x, y)) 131 | theta = theta_0 # initial guess 132 | alpha = alpha_0 # initial step size 133 | min_theta, min_value = None, float("inf") # the minimum so far 134 | iterations_with_no_improvement = 0 135 | 136 | # if we ever go 100 iterations with no improvement, stop 137 | while iterations_with_no_improvement < 100: 138 | value = sum(target_fn(x_i, y_i, theta) for x_i, y_i in data) 139 | 140 | if value < min_value: 141 | # if we've found a new minimum, remember it 142 | # and go back to the original step size 143 | min_theta, min_value = theta, value 144 | iterations_with_no_improvement = 0 145 | alpha = alpha_0 146 | else: 147 | # otherwise we're not improving, so try shrinking 
the step size 148 | iterations_with_no_improvement += 1 149 | alpha *= 0.9 150 | 151 | # and take a gradient step for each of the data points 152 | for x_i, y_i in in_random_order(data): 153 | gradient_i = gradient_fn(x_i, y_i, theta) 154 | theta = vector_subtract(theta, scalar_multiply(alpha, gradient_i)) 155 | 156 | return min_theta 157 | 158 | 159 | def maximize_stochastic(target_fn, gradient_fn, x, y, theta_0, alpha_0=0.01): 160 | return minimize_stochastic(negate(target_fn), 161 | negate_all(gradient_fn), 162 | x, y, theta_0, alpha_0) 163 | 164 | 165 | if __name__ == "__main__": 166 | 167 | print("using the gradient") 168 | 169 | v = [random.randint(-10, 10) for i in range(3)] 170 | 171 | tolerance = 0.0000001 172 | 173 | while True: 174 | # print v, sum_of_squares(v) 175 | gradient = sum_of_squares_gradient(v) # compute the gradient at v 176 | next_v = step(v, gradient, -0.01) # take a negative gradient step 177 | if distance(next_v, v) < tolerance: # stop if we're converging 178 | break 179 | v = next_v # continue if we're not 180 | 181 | print("minimum v", v) 182 | print("minimum value", sum_of_squares(v)) 183 | print() 184 | print("using minimize_batch") 185 | 186 | v = [random.randint(-10, 10) for i in range(3)] 187 | 188 | v = minimize_batch(sum_of_squares, sum_of_squares_gradient, v) 189 | 190 | print("minimum v", v) 191 | print("minimum value", sum_of_squares(v)) 192 | -------------------------------------------------------------------------------- /Understanding SQL Queries.md: -------------------------------------------------------------------------------- 1 | ### Three SQL Concepts you Must Know to Pass the Data Science Interview 2 | 3 | #### Credits: Thanks to Jay Feng for writing this [article](https://www.interviewquery.com/blog-three-sql-questions-you-must-know-to-pass/) 4 | 5 | #### 1. Getting the first or last value for each user in a `transactions` table. 6 | 7 | `transactions` 8 | 9 | | column_name | data_type | 10 | --- | --- | 11 | | user_id | int | 12 | | created_at | datetime| 13 | | product | varchar | 14 | 15 | ##### Question: Given the user transactions table above, write a query to get the first purchase for each user. 16 | 17 | #### Solution: 18 | 19 | We want to take a table that looks like this: 20 | 21 | user_id | created_at | product 22 | --- | --- | --- 23 | 123 | 2019-01-01 | apple 24 | 456 | 2019-01-02 | banana 25 | 123 | 2019-01-05 | pear 26 | 456 | 2019-01-10 | apple 27 | 789 | 2019-01-11 | banana 28 | 29 | and turn it into this 30 | 31 | user_id | created_at | product 32 | --- | --- | --- 33 | 123 | 2019-01-01 | apple 34 | 456 | 2019-01-02 | banana 35 | 789 | 2019-01-11 | banana 36 | 37 | The solution can be broken into two parts: 38 | - First make a table of `user_id` and the first purchase (i.e. minimum create date). We can get this by the following query 39 | 40 | ``` 41 | SELECT 42 | user_id, MIN(created_at) AS min_created_at 43 | FROM 44 | transactions 45 | GROUP BY 1 46 | ``` 47 | 48 | - Now all we have to do is join this table back to the original on two columns: `user_id` and `created_at`.
49 | The self join will effectively filter for the first purchase.
50 | Then all we have to do is grab all of the columns on the left side table. 51 | 52 | ``` 53 | SELECT 54 | t.user_id, t.created_at, t.product 55 | FROM 56 | transactions AS t 57 | INNER JOIN ( 58 | SELECT user_id, MIN(created_at) AS min_created_at 59 | FROM transactions 60 | GROUP BY 1 61 | ) AS t1 ON (t.user_id = t1.user_id AND t.created_at = t1.min_created_at) 62 | ``` 63 | 64 | #### 2. Knowing the difference between a LEFT JOIN and INNER JOIN in practice. 65 | 66 | `users` 67 | 68 | 69 | | column_name | data_type | 70 | --- | --- | 71 | | id | int | 72 | | name | varchar | 73 | | city_id | int | 74 | 75 | `city_id` is `id` in the `cities` table 76 | 77 | `cities` 78 | | column_name | data_type | 79 | --- | --- | 80 | | id | int | 81 | | name | varchar | 82 | 83 | 84 | ##### Question: Given the `users` and `cities` tables above, write a query to return the list of cities without any users. 85 | 86 | This question aims to test the candidate's understanding of the LEFT JOIN and INNER JOIN 87 | 88 | ##### What is the actual difference between a LEFT JOIN and INNER JOIN? 89 | 90 | **INNER JOIN**: returns rows when there is a match in __both tables__.
91 | **LEFT JOIN**: returns all rows from the left table, __even if there are no matches in the right table__. 92 | 93 | #### Solution: 94 | 95 | We know that each user in the users table must live in a city given the city_id field.
96 | However, the `cities` table doesn’t have a `user_id` field.
97 | So if we run an INNER JOIN between these two tables on the city_id, we’ll get all of the cities that have users, and __all of the cities without users will be filtered out.__ 98 | 99 | But what if we run a LEFT JOIN between cities and users? 100 | 101 | cities.name | users.id 102 | --- | --- | 103 | seattle | 123 104 | seattle | 124 105 | portland | null 106 | san diego | 534 107 | san diego | 564 108 | 109 | Here we see that because we keep all of the values on the LEFT side of the join, and there’s no match between the city of Portland and any user in the database, that city shows up with a NULL user.
110 | Therefore, all we have to do is add a __WHERE filter for rows where the value from the users table is NULL.__ 111 | 112 | ``` 113 | SELECT 114 | cities.name, users.id 115 | FROM 116 | cities 117 | LEFT JOIN users ON users.city_id = cities.id 118 | WHERE 119 | users.id IS NULL 120 | ``` 121 | 122 | #### 3. Aggregations with a conditional statement 123 | 124 | `transactions` 125 | | column_name | data_type | 126 | --- | --- | 127 | | user_id | int | 128 | | created_at | datetime| 129 | | product | varchar | 130 | 131 | ##### Question: Given the same user transactions table as before, write a query to get the total purchases made in the morning versus afternoon/evening (AM vs PM) by day. 132 | 133 | We are comparing two groups. Every time we have to compare two groups we must use a GROUP BY. 134 | 135 | In this case, we need to create a separate column to run our GROUP BY on: whether the `created_at` field falls in the AM or the PM. 136 | 137 | ``` 138 | CASE 139 | WHEN HOUR(created_at) > 11 THEN 'PM' 140 | ELSE 'AM' 141 | END AS time_of_day 142 | ``` 143 | 144 | We can cast the created_at column to the hour and set the new column value time_of_day to AM or PM based on this condition. 145 | 146 | Now we just have to run a GROUP BY on the original `created_at` field truncated to the day AND the new column we created that differentiates each row value.
147 | The last aggregation will then be the output variable we want which is total purchases by running the COUNT function. 148 | 149 | ``` 150 | SELECT 151 | DATE_TRUNC('day', created_at) AS date 152 | ,CASE 153 | WHEN HOUR(created_at) > 11 THEN 'PM' 154 | ELSE 'AM' 155 | END AS time_of_day 156 | ,COUNT(*) 157 | FROM 158 | transactions 159 | GROUP BY 1,2 160 | ``` 161 | ### Bonus Questions 162 | 163 | #### 4.Write an SQL query that makes recommendations using the pages that your friends liked. Assume you have two tables: 164 | 165 | `usersAndFriends` 166 | | column_name | data_type | 167 | --- | --- | 168 | | user_id | int | 169 | | friend | int| 170 | 171 | `usersLikedPages` 172 | | column_name | data_type | 173 | --- | --- | 174 | | user_id | int | 175 | | page_id | int| 176 | 177 | #### It should not recommend pages you already like. 178 | 179 | #### 5.Write an SQL query that shows percentage change month over month in daily active users. Assume you have a table: 180 | 181 | `logins` 182 | | column_name | data_type | 183 | --- | --- | 184 | | user_id | int | 185 | | date | date| 186 | -------------------------------------------------------------------------------- /network_analysis/utils.py: -------------------------------------------------------------------------------- 1 | import random 2 | from collections import deque 3 | from functools import partial 4 | 5 | from helpers.linear_algebra import dot, get_row, get_column, shape, make_matrix, magnitude, scalar_multiply, distance 6 | from network_analysis.data import users, friendships, endorsements 7 | 8 | for user in users: 9 | user["friends"] = [] 10 | 11 | # and populate it 12 | for i, j in friendships: 13 | # this works because users[i] is the user whose id is i 14 | users[i]["friends"].append(users[j]) # add i as a friend of j 15 | users[j]["friends"].append(users[i]) # add j as a friend of i 16 | 17 | 18 | def shortest_paths_from(from_user): 19 | 20 | # a dictionary from "user_id" to *all* shortest paths to that user 21 | shortest_paths_to = {from_user["id"]: [[]]} 22 | 23 | # a queue of (previous_user, next user) that we need to check 24 | # starts out with all the pairs (from_user, friend_of_from_user) 25 | frontier = deque((from_user, friend) 26 | for friend in from_user["friends"]) 27 | 28 | # keep going until we empty the deque 29 | while frontier: 30 | 31 | prev_user, user = frontier.popleft() # remove the user who is first in the queue 32 | user_id = user["id"] 33 | 34 | # because of the way we are adding to the queue, 35 | # necessarily we already know some shortest paths to prev_user 36 | paths_to_prev_user = shortest_paths_to[prev_user["id"]] 37 | new_paths_to_user = [path + [user_id] for path in paths_to_prev_user] 38 | 39 | # it is possible we already know a shortest path 40 | old_paths_to_user = shortest_paths_to.get(user_id, []) 41 | 42 | # what is the shortest path tot here that we have seen so far ? 
43 | if old_paths_to_user: 44 | min_path_length = len(old_paths_to_user[0]) 45 | else: 46 | min_path_length = float('inf') 47 | 48 | # only keep paths that are not too long and are actually new 49 | new_paths_to_user = [path 50 | for path in new_paths_to_user 51 | if len(path) <= min_path_length 52 | and path not in old_paths_to_user] 53 | 54 | shortest_paths_to[user_id] = old_paths_to_user + new_paths_to_user 55 | 56 | # add never-seen neighbors to the frontier 57 | frontier.extend((user, friend) 58 | for friend in user["friends"] 59 | if friend["id"] not in shortest_paths_to) 60 | 61 | return shortest_paths_to 62 | 63 | 64 | for user in users: 65 | user["shortest_paths"] = shortest_paths_from(user) 66 | 67 | for user in users: 68 | user["betweenness_centrality"] = 0.0 69 | 70 | for source in users: 71 | source_id = source["id"] 72 | for target_id, paths in source["shortest_paths"].items(): 73 | if source_id < target_id: # don't double count 74 | num_paths = len(paths) # how many shortest paths? 75 | contrib = 1 / num_paths # contribution to centrality 76 | 77 | for path in paths: 78 | for id in path: 79 | if id not in [source_id, target_id]: 80 | users[id]["betweenness_centrality"] += contrib 81 | 82 | 83 | def farness(user): 84 | """the sum of the lengths of the shortest paths to each other user""" 85 | return sum(len(paths[0]) 86 | for paths in user["shortest_paths"].values()) 87 | 88 | 89 | for user in users: 90 | user["closeness_centrality"] = 1 / farness(user) 91 | 92 | """Eigenvector Centrality""" 93 | 94 | 95 | def matrix_product_entry(A, B, i, j): 96 | return dot(get_row(A, i), get_column(B, j)) 97 | 98 | 99 | def matrix_multiply(A, B): 100 | n1, k1 = shape(A) 101 | n2, k2 = shape(B) 102 | 103 | if k1 != n2: 104 | raise ArithmeticError("incompatible shapes!") 105 | 106 | return make_matrix(n1, k2, partial(matrix_product_entry, A, B)) 107 | 108 | 109 | def vector_as_matrix(v): 110 | """returns the vector v (represented as a list) as a n x 1 matrix""" 111 | return [[v_i] for v_i in v] 112 | 113 | 114 | def vector_from_matrix(v_as_matrix): 115 | """returns the n x 1 matrix as a list of values""" 116 | return [row[0] for row in v_as_matrix] 117 | 118 | 119 | def matrix_operation(A, v): 120 | v_as_matrix = vector_as_matrix(v) 121 | product = matrix_multiply(A, v_as_matrix) 122 | return vector_from_matrix(product) 123 | 124 | 125 | def find_eigenvector(A, tolerance=0.00001): 126 | guess = [random.random() for _ in A] 127 | 128 | while True: 129 | result = matrix_operation(A, guess) 130 | length = magnitude(result) 131 | next_guess = scalar_multiply(1/length, result) 132 | 133 | if distance(guess, next_guess) < tolerance: 134 | return next_guess, length # eigenvector, eigenvalue 135 | guess = next_guess 136 | 137 | 138 | def entry_fn(i, j): 139 | return 1 if (i, j) in friendships or (j, i) in friendships else 0 140 | 141 | 142 | n = len(users) 143 | adjacency_matrix = make_matrix(n, n, entry_fn) 144 | eigenvector_centralities, _ = find_eigenvector(adjacency_matrix) 145 | 146 | """Directed Graphs and PageRank""" 147 | for user in users: 148 | user["endorses"] = [] # add one list to track outgoing endorsements 149 | user["endorsed_by"] = [] # and another to track endorsements 150 | 151 | for source_id, target_id in endorsements: 152 | users[source_id]["endorses"].append(users[target_id]) 153 | users[target_id]["endorsed_by"].append(users[source_id]) 154 | 155 | endorsements_by_id = [(user["id"], len(user["endorsed_by"])) 156 | for user in users] 157 | 158 | sorted(endorsements_by_id, 159 
| key=lambda pair: pair[1], 160 | reverse=True) 161 | 162 | 163 | def page_rank(users, damping=0.85, num_iters=100): 164 | 165 | # initially distribute PageRank evenly 166 | num_users = len(users) 167 | pr = {user["id"]: 1 / num_users for user in users} 168 | 169 | # this is the small fraction of PageRank 170 | # that each node gets each iteration 171 | base_pr = (1 - damping) / num_users 172 | 173 | for _ in range(num_iters): 174 | next_pr = {user["id"]: base_pr for user in users} 175 | for user in users: 176 | # distribute PageRank to outgoing links 177 | links_pr = pr[user["id"]] * damping 178 | for endorsee in user["endorses"]: 179 | next_pr[endorsee["id"]] += links_pr / len(user["endorses"]) 180 | 181 | pr = next_pr 182 | 183 | return pr -------------------------------------------------------------------------------- /hypothesis_inference.py: -------------------------------------------------------------------------------- 1 | from helpers.probability import normal_cdf, inverse_normal_cdf 2 | import math, random 3 | 4 | 5 | def normal_approximation_to_binomial(n, p): 6 | """finds mu and sigma corresponding to a Binomial(n, p)""" 7 | mu = p * n 8 | sigma = math.sqrt(p * (1 - p) * n) 9 | return mu, sigma 10 | 11 | 12 | ##### 13 | # 14 | # probabilities a normal lies in an interval 15 | # 16 | ###### 17 | 18 | # the normal cdf _is_ the probability the variable is below a threshold 19 | normal_probability_below = normal_cdf 20 | 21 | 22 | # it's above the threshold if it's not below the threshold 23 | def normal_probability_above(lo, mu=0, sigma=1): 24 | return 1 - normal_cdf(lo, mu, sigma) 25 | 26 | 27 | # it's between if it's less than hi, but not less than lo 28 | def normal_probability_between(lo, hi, mu=0, sigma=1): 29 | return normal_cdf(hi, mu, sigma) - normal_cdf(lo, mu, sigma) 30 | 31 | 32 | # it's outside if it's not between 33 | def normal_probability_outside(lo, hi, mu=0, sigma=1): 34 | return 1 - normal_probability_between(lo, hi, mu, sigma) 35 | 36 | 37 | ###### 38 | # 39 | # normal bounds 40 | # 41 | ###### 42 | 43 | 44 | def normal_upper_bound(probability, mu=0, sigma=1): 45 | """returns the z for which P(Z <= z) = probability""" 46 | return inverse_normal_cdf(probability, mu, sigma) 47 | 48 | 49 | def normal_lower_bound(probability, mu=0, sigma=1): 50 | """returns the z for which P(Z >= z) = probability""" 51 | return inverse_normal_cdf(1 - probability, mu, sigma) 52 | 53 | 54 | def normal_two_sided_bounds(probability, mu=0, sigma=1): 55 | """returns the symmetric (about the mean) bounds 56 | that contain the specified probability""" 57 | tail_probability = (1 - probability) / 2 58 | 59 | # upper bound should have tail_probability above it 60 | upper_bound = normal_lower_bound(tail_probability, mu, sigma) 61 | 62 | # lower bound should have tail_probability below it 63 | lower_bound = normal_upper_bound(tail_probability, mu, sigma) 64 | 65 | return lower_bound, upper_bound 66 | 67 | 68 | def two_sided_p_value(x, mu=0, sigma=1): 69 | if x >= mu: 70 | # if x is greater than the mean, the tail is above x 71 | return 2 * normal_probability_above(x, mu, sigma) 72 | else: 73 | # if x is less than the mean, the tail is below x 74 | return 2 * normal_probability_below(x, mu, sigma) 75 | 76 | 77 | def count_extreme_values(): 78 | extreme_value_count = 0 79 | for _ in range(100000): 80 | num_heads = sum(1 if random.random() < 0.5 else 0 # count # of heads 81 | for _ in range(1000)) # in 1000 flips 82 | if num_heads >= 530 or num_heads <= 470: # and count how often 83 | 
extreme_value_count += 1 # the # is 'extreme' 84 | 85 | return extreme_value_count / 100000 86 | 87 | 88 | upper_p_value = normal_probability_above 89 | lower_p_value = normal_probability_below 90 | 91 | 92 | ## 93 | # 94 | # P-hacking 95 | # 96 | ## 97 | 98 | def run_experiment(): 99 | """flip a fair coin 1000 times, True = heads, False = tails""" 100 | return [random.random() < 0.5 for _ in range(1000)] 101 | 102 | 103 | def reject_fairness(experiment): 104 | """using the 5% significance levels""" 105 | num_heads = len([flip for flip in experiment if flip]) 106 | return num_heads < 469 or num_heads > 531 107 | 108 | 109 | ## 110 | # 111 | # running an A/B test 112 | # 113 | ## 114 | 115 | def estimated_parameters(N, n): 116 | p = n / N 117 | sigma = math.sqrt(p * (1 - p) / N) 118 | return p, sigma 119 | 120 | 121 | def a_b_test_statistic(N_A, n_A, N_B, n_B): 122 | p_A, sigma_A = estimated_parameters(N_A, n_A) 123 | p_B, sigma_B = estimated_parameters(N_B, n_B) 124 | return (p_B - p_A) / math.sqrt(sigma_A ** 2 + sigma_B ** 2) 125 | 126 | 127 | ## 128 | # 129 | # Bayesian Inference 130 | # 131 | ## 132 | 133 | def B(alpha, beta): 134 | """a normalizing constant so that the total probability is 1""" 135 | return math.gamma(alpha) * math.gamma(beta) / math.gamma(alpha + beta) 136 | 137 | 138 | def beta_pdf(x, alpha, beta): 139 | if x < 0 or x > 1: # no weight outside of [0, 1] 140 | return 0 141 | return x ** (alpha - 1) * (1 - x) ** (beta - 1) / B(alpha, beta) 142 | 143 | 144 | if __name__ == "__main__": 145 | mu_0, sigma_0 = normal_approximation_to_binomial(1000, 0.5) 146 | print("mu_0", mu_0) 147 | print("sigma_0", sigma_0) 148 | print("normal_two_sided_bounds(0.95, mu_0, sigma_0)", normal_two_sided_bounds(0.95, mu_0, sigma_0)) 149 | print() 150 | print("power of a test") 151 | 152 | print("95% bounds based on assumption p is 0.5") 153 | 154 | lo, hi = normal_two_sided_bounds(0.95, mu_0, sigma_0) 155 | print("lo", lo) 156 | print("hi", hi) 157 | 158 | print("actual mu and sigma based on p = 0.55") 159 | mu_1, sigma_1 = normal_approximation_to_binomial(1000, 0.55) 160 | print("mu_1", mu_1) 161 | print("sigma_1", sigma_1) 162 | 163 | # a type 2 error means we fail to reject the null hypothesis 164 | # which will happen when X is still in our original interval 165 | type_2_probability = normal_probability_between(lo, hi, mu_1, sigma_1) 166 | power = 1 - type_2_probability # 0.887 167 | 168 | print("type 2 probability", type_2_probability) 169 | print("power", power) 170 | print() 171 | print("one-sided test") 172 | hi = normal_upper_bound(0.95, mu_0, sigma_0) 173 | print("hi", hi) # is 526 (< 531, since we need more probability in the upper tail) 174 | type_2_probability = normal_probability_below(hi, mu_1, sigma_1) 175 | power = 1 - type_2_probability # = 0.936 176 | print("type 2 probability", type_2_probability) 177 | print("power", power) 178 | print() 179 | 180 | print("two_sided_p_value(529.5, mu_0, sigma_0)", two_sided_p_value(529.5, mu_0, sigma_0)) 181 | 182 | print("two_sided_p_value(531.5, mu_0, sigma_0)", two_sided_p_value(531.5, mu_0, sigma_0)) 183 | 184 | print("upper_p_value(525, mu_0, sigma_0)", upper_p_value(525, mu_0, sigma_0)) 185 | print("upper_p_value(527, mu_0, sigma_0)", upper_p_value(527, mu_0, sigma_0)) 186 | print() 187 | 188 | print("P-hacking") 189 | 190 | random.seed(0) 191 | experiments = [run_experiment() for _ in range(1000)] 192 | num_rejections = len([experiment 193 | for experiment in experiments 194 | if reject_fairness(experiment)]) 195 | 196 | 
print(num_rejections, "rejections out of 1000") 197 | print() 198 | 199 | print("A/B testing") 200 | z = a_b_test_statistic(1000, 200, 1000, 180) 201 | print("a_b_test_statistic(1000, 200, 1000, 180)", z) 202 | print("p-value", two_sided_p_value(z)) 203 | z = a_b_test_statistic(1000, 200, 1000, 150) 204 | print("a_b_test_statistic(1000, 200, 1000, 150)", z) 205 | print("p-value", two_sided_p_value(z)) 206 | -------------------------------------------------------------------------------- /friendster_network.py: -------------------------------------------------------------------------------- 1 | ########################## 2 | # Finding Key Connectors # 3 | ########################## 4 | 5 | # dictionary of each user and their id 6 | users = [ 7 | {"id": 0, "name": "Hero"}, 8 | {"id": 1, "name": "Dunn"}, 9 | {"id": 2, "name": "Sue"}, 10 | {"id": 3, "name": "Chi"}, 11 | {"id": 4, "name": "Thor"}, 12 | {"id": 5, "name": "Clive"}, 13 | {"id": 6, "name": "Hicks"}, 14 | {"id": 7, "name": "Devin"}, 15 | {"id": 8, "name": "Kate"}, 16 | {"id": 9, "name": "Klein"} 17 | ] 18 | 19 | # friendship data as a list of tuples 20 | friendships = [(0, 1), (0, 2), (1, 2), (1, 3), (2, 3), (3, 4), 21 | (4, 5), (5, 6), (5, 7), (6, 8), (7, 8), (8, 9)] 22 | 23 | # assign empty list to each user 24 | for user in users: 25 | user["friends"] = [] 26 | 27 | for i, j in friendships: 28 | users[i]["friends"].append(users[j]) # add i as a friend of j 29 | users[j]["friends"].append(users[i]) # add j as a friend of i 30 | 31 | 32 | def number_of_friends(user): 33 | return len(user["friends"]) 34 | 35 | 36 | total_connections = sum(number_of_friends(user) for user in users) 37 | print(total_connections) 38 | 39 | num_users = len(users) 40 | avg_connections = total_connections / num_users 41 | print(avg_connections) 42 | 43 | num_friends_by_id = [(user["id"], number_of_friends(user)) for user in users] 44 | print(num_friends_by_id) 45 | 46 | 47 | ############################### 48 | # Data Scientist You May Know # 49 | ############################### 50 | 51 | 52 | def friends_of_friend_ids_bad(user): 53 | return [foaf["id"] 54 | for friend in user["friends"] # for each of user's friend 55 | for foaf in friend["friends"]] # for each of their friends 56 | 57 | 58 | print(friends_of_friend_ids_bad(users[0])) # Data Scientists Hero may know 59 | 60 | from collections import Counter 61 | 62 | 63 | def not_the_same(user, other_user): 64 | # Two users are not same if they have different ids 65 | return user["id"] != other_user["id"] 66 | 67 | 68 | def not_friends(user, other_user): 69 | # other_user is not a friend if he is not in user["friends"] 70 | return all(not_the_same(friend, other_user) for friend in user["friends"]) 71 | 72 | 73 | def friends_of_friend_ids(user): 74 | return Counter(foaf["id"] 75 | for friend in user["friends"] # for each of my friends 76 | for foaf in friend["friends"] # count *their* friends 77 | if not_the_same(user, foaf) # who aren't me 78 | and not_friends(user, foaf)) # and aren't my friends 79 | 80 | 81 | print(friends_of_friend_ids(users[3])) # Data Scientists Chi may know 82 | 83 | interests = [ 84 | (0, "Hadoop"), (0, "Big Data"), (0, "HBase"), (0, "Java"), 85 | (0, "Spark"), (0, "Storm"), (0, "Cassandra"), 86 | (1, "NoSQL"), (1, "MongoDB"), (1, "Cassandra"), (1, "HBase"), 87 | (1, "Postgres"), (2, "Python"), (2, "scikit-learn"), (2, "scipy"), 88 | (2, "numpy"), (2, "statsmodels"), (2, "pandas"), (3, "R"), (3, "Python"), 89 | (3, "statistics"), (3, "regression"), (3, "probability"), 90 | (4, 
"machine learning"), (4, "regression"), (4, "decision trees"), 91 | (4, "libsvm"), (5, "Python"), (5, "R"), (5, "Java"), (5, "C++"), 92 | (5, "Haskell"), (5, "programming languages"), (6, "statistics"), 93 | (6, "probability"), (6, "mathematics"), (6, "theory"), 94 | (7, "machine learning"), (7, "scikit-learn"), (7, "Mahout"), 95 | (7, "neural networks"), (8, "neural networks"), (8, "deep learning"), 96 | (8, "Big Data"), (8, "artificial intelligence"), (9, "Hadoop"), 97 | (9, "Java"), (9, "MapReduce"), (9, "Big Data") 98 | ] 99 | 100 | 101 | def data_scientists_who_like(target_interest): 102 | return [user_id 103 | for user_id, user_interest in interests 104 | if user_interest == target_interest] 105 | 106 | 107 | from collections import defaultdict 108 | 109 | user_ids_by_interest = defaultdict(list) 110 | for user_id, interest in interests: 111 | user_ids_by_interest[interest].append(user_id) 112 | 113 | print(user_ids_by_interest) 114 | 115 | interests_by_user_ids = defaultdict(list) 116 | for user_id, interest in interests: 117 | interests_by_user_ids[user_id].append(interest) 118 | 119 | print(interests_by_user_ids) 120 | 121 | 122 | def most_common_interests_with(user): 123 | return Counter(interested_user_id 124 | for interest in interests_by_user_ids[user["id"]] 125 | for interested_user_id in user_ids_by_interest[interest] 126 | if interested_user_id != user["id"]) 127 | 128 | 129 | print(most_common_interests_with(users[6])) 130 | 131 | ########################### 132 | # Salaries and Experience # 133 | ########################### 134 | salaries_and_tenures = [(83000, 8.7), (88000, 8.1), 135 | (48000, 0.7), (76000, 6), 136 | (69000, 6.5), (76000, 7.5), 137 | (60000, 2.5), (83000, 10), 138 | (48000, 1.9), (63000, 4.2)] 139 | 140 | from matplotlib import pyplot as plt 141 | 142 | 143 | def make_chart_salaries_by_tenure(): 144 | tenures = [tenure for salary, tenure in salaries_and_tenures] 145 | salaries = [salary for salary, tenure in salaries_and_tenures] 146 | plt.scatter(tenures, salaries) 147 | plt.xlabel("Years Experience") 148 | plt.ylabel("Salary") 149 | plt.show() 150 | 151 | 152 | salary_by_tenure = defaultdict(list) 153 | 154 | for salary, tenure in salaries_and_tenures: 155 | salary_by_tenure[tenure].append(salary) 156 | 157 | average_salary_by_tenure = { 158 | tenure: sum(salaries) / len(salaries) 159 | for tenure, salaries in salary_by_tenure.items() 160 | } 161 | 162 | print(average_salary_by_tenure) 163 | 164 | 165 | def tenure_bucket(tenure): 166 | if tenure < 2: 167 | return "less than two" 168 | elif tenure < 5: 169 | return "between two and five" 170 | else: 171 | return "more than five" 172 | 173 | 174 | salary_by_tenure_bucket = defaultdict(list) 175 | for salary, tenure in salaries_and_tenures: 176 | bucket = tenure_bucket(tenure) 177 | salary_by_tenure_bucket[bucket].append(salary) 178 | 179 | average_salary_by_bucket = { 180 | tenure_bucket: sum(salaries) / len(salaries) 181 | for tenure_bucket, salaries in salary_by_tenure_bucket.items() 182 | } 183 | 184 | print(average_salary_by_bucket) 185 | 186 | 187 | ################# 188 | # Paid Accounts # 189 | ################# 190 | 191 | 192 | def predict_paid_or_unpaid(years_experience): 193 | if years_experience < 3.0: 194 | return "paid" 195 | elif years_experience < 8.5: 196 | return "unpaid" 197 | else: 198 | return "paid" 199 | 200 | 201 | ####################### 202 | # Topics of Interests # 203 | ####################### 204 | 205 | words_and_counts = Counter(word 206 | for user, interest in interests 
207 | for word in str(interest).lower().split()) 208 | 209 | for word, count in words_and_counts.most_common(): 210 | if count > 1: 211 | print(word, count) 212 | -------------------------------------------------------------------------------- /helpers/stats.py: -------------------------------------------------------------------------------- 1 | from collections import Counter 2 | from helpers.linear_algebra import sum_of_squares, dot 3 | import math 4 | 5 | num_friends = [100, 49, 41, 40, 25, 21, 21, 19, 19, 18, 18, 16, 15, 15, 15, 15, 14, 14, 13, 13, 13, 13, 12, 12, 11, 10, 6 | 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 7 | 9, 9, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 8 | 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 9 | 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 10 | 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 11 | 1, 1, 1, 1, 1, 1, 1, 1] 12 | 13 | 14 | def make_friend_counts_histogram(plt): 15 | friend_counts = Counter(num_friends) 16 | xs = range(101) 17 | ys = [friend_counts[x] for x in xs] 18 | plt.bar(xs, ys) 19 | plt.axis([0, 101, 0, 25]) 20 | plt.title("Histogram of Friend Counts") 21 | plt.xlabel("# of friends") 22 | plt.ylabel("# of people") 23 | plt.show() 24 | 25 | 26 | num_points = len(num_friends) # 204 27 | 28 | largest_value = max(num_friends) # 100 29 | smallest_value = min(num_friends) # 1 30 | 31 | sorted_values = sorted(num_friends) 32 | # smallest_value = sorted_values[0] # 1 33 | second_smallest_value = sorted_values[1] # 1 34 | second_largest_value = sorted_values[-2] # 49 35 | 36 | 37 | # this isn't right if you don't from __future__ import division 38 | 39 | 40 | def mean(x): 41 | return sum(x) / len(x) 42 | 43 | 44 | def median(v): 45 | """finds the 'middle-most' value of v""" 46 | n = len(v) 47 | sorted_v = sorted(v) 48 | midpoint = n // 2 49 | 50 | if n % 2 == 1: 51 | # if odd, return the middle value 52 | return sorted_v[midpoint] 53 | else: 54 | # if even, return the average of the middle values 55 | lo = midpoint - 1 56 | hi = midpoint 57 | return (sorted_v[lo] + sorted_v[hi]) / 2 58 | 59 | 60 | def quantile(x, p): 61 | """returns the pth-percentile value in x""" 62 | p_index = int(p * len(x)) 63 | return sorted(x)[p_index] 64 | 65 | 66 | def mode(x): 67 | """returns a list, might be more than one mode""" 68 | counts = Counter(x) 69 | max_count = max(counts.values()) 70 | return [x_i for x_i, count in counts.items() 71 | if count == max_count] 72 | 73 | 74 | # "range" already means something in Python, so we'll use a different name 75 | 76 | 77 | def data_range(x): 78 | return max(x) - min(x) 79 | 80 | 81 | def de_mean(x): 82 | """translate x by subtracting its mean (so the result has mean 0)""" 83 | x_bar = mean(x) 84 | return [x_i - x_bar for x_i in x] 85 | 86 | 87 | def variance(x): 88 | """assumes x has at least two elements""" 89 | n = len(x) 90 | deviations = de_mean(x) 91 | return sum_of_squares(deviations) / (n - 1) 92 | 93 | 94 | def standard_deviation(x): 95 | return math.sqrt(variance(x)) 96 | 97 | 98 | def interquartile_range(x): 99 | return quantile(x, 0.75) - quantile(x, 0.25) 100 | 101 | 102 | #### 103 | # 104 | # CORRELATION 105 | # 106 | ##### 107 | 108 | 109 | daily_minutes = [1, 68.77, 51.25, 52.08, 38.36, 44.54, 57.13, 51.4, 41.42, 31.22, 
34.76, 54.01, 38.79, 47.59, 49.1, 110 | 27.66, 41.03, 36.73, 48.65, 28.12, 46.62, 35.57, 32.98, 35, 26.07, 23.77, 39.73, 40.57, 31.65, 31.21, 111 | 36.32, 20.45, 21.93, 26.02, 27.34, 23.49, 46.94, 30.5, 33.8, 24.23, 21.4, 27.94, 32.24, 40.57, 25.07, 112 | 19.42, 22.39, 18.42, 46.96, 23.72, 26.41, 26.97, 36.76, 40.32, 35.02, 29.47, 30.2, 31, 38.11, 38.18, 113 | 36.31, 21.03, 30.86, 36.07, 28.66, 29.08, 37.28, 15.28, 24.17, 22.31, 30.17, 25.53, 19.85, 35.37, 44.6, 114 | 17.23, 13.47, 26.33, 35.02, 32.09, 24.81, 19.33, 28.77, 24.26, 31.98, 25.73, 24.86, 16.28, 34.51, 115 | 15.23, 39.72, 40.8, 26.06, 35.76, 34.76, 16.13, 44.04, 18.03, 19.65, 32.62, 35.59, 39.43, 14.18, 35.24, 116 | 40.13, 41.82, 35.45, 36.07, 43.67, 24.61, 20.9, 21.9, 18.79, 27.61, 27.21, 26.61, 29.77, 20.59, 27.53, 117 | 13.82, 33.2, 25, 33.1, 36.65, 18.63, 14.87, 22.2, 36.81, 25.53, 24.62, 26.25, 18.21, 28.08, 19.42, 118 | 29.79, 32.8, 35.99, 28.32, 27.79, 35.88, 29.06, 36.28, 14.1, 36.63, 37.49, 26.9, 18.58, 38.48, 24.48, 119 | 18.95, 33.55, 14.24, 29.04, 32.51, 25.63, 22.22, 19, 32.73, 15.16, 13.9, 27.2, 32.01, 29.27, 33, 13.74, 120 | 20.42, 27.32, 18.23, 35.35, 28.48, 9.08, 24.62, 20.12, 35.26, 19.92, 31.02, 16.49, 12.16, 30.7, 31.22, 121 | 34.65, 13.13, 27.51, 33.2, 31.57, 14.1, 33.42, 17.44, 10.12, 24.42, 9.82, 23.39, 30.93, 15.03, 21.67, 122 | 31.09, 33.29, 22.61, 26.89, 23.48, 8.38, 27.81, 32.35, 23.84] 123 | 124 | 125 | def covariance(x, y): 126 | n = len(x) 127 | return dot(de_mean(x), de_mean(y)) / (n - 1) 128 | 129 | 130 | def correlation(x, y): 131 | stdev_x = standard_deviation(x) 132 | stdev_y = standard_deviation(y) 133 | if stdev_x > 0 and stdev_y > 0: 134 | return covariance(x, y) / stdev_x / stdev_y 135 | else: 136 | return 0 # if no variation, correlation is zero 137 | 138 | 139 | outlier = num_friends.index(100) # index of outlier 140 | 141 | num_friends_good = [x 142 | for i, x in enumerate(num_friends) 143 | if i != outlier] 144 | 145 | daily_minutes_good = [x 146 | for i, x in enumerate(daily_minutes) 147 | if i != outlier] 148 | 149 | # alpha, beta = least_squares_fit(num_friends_good, daily_minutes_good) 150 | 151 | if __name__ == "__main__": 152 | print("num_points", len(num_friends)) 153 | print("largest value", max(num_friends)) 154 | print("smallest value", min(num_friends)) 155 | 156 | print("second_smallest_value", sorted_values[1]) 157 | print("second_largest_value", sorted_values[-2]) 158 | 159 | print("mean(num_friends)", mean(num_friends)) 160 | print("median(num_friends)", median(num_friends)) 161 | 162 | print("quantile(num_friends, 0.10)", quantile(num_friends, 0.10)) 163 | print("quantile(num_friends, 0.25)", quantile(num_friends, 0.25)) 164 | print("quantile(num_friends, 0.75)", quantile(num_friends, 0.75)) 165 | print("quantile(num_friends, 0.90)", quantile(num_friends, 0.90)) 166 | 167 | print("mode(num_friends)", mode(num_friends)) 168 | print("data_range(num_friends)", data_range(num_friends)) 169 | print("variance(num_friends)", variance(num_friends)) 170 | print("standard_deviation(num_friends)", standard_deviation(num_friends)) 171 | print("interquartile_range(num_friends)", interquartile_range(num_friends)) 172 | 173 | print("covariance(num_friends, daily_minutes)", covariance(num_friends, daily_minutes)) 174 | print("correlation(num_friends, daily_minutes)", correlation(num_friends, daily_minutes)) 175 | print("correlation(num_friends_good, daily_minutes_good)", correlation(num_friends_good, daily_minutes_good)) 176 | # print("R-squared value", r_squared(alpha, beta, 
num_friends_good, daily_minutes_good)) 177 | -------------------------------------------------------------------------------- /natural_language_processing/utils.py: -------------------------------------------------------------------------------- 1 | import random 2 | import re 3 | from collections import defaultdict, Counter 4 | 5 | from natural_language_processing.data import data, documents 6 | import matplotlib.pyplot as plt 7 | from bs4 import BeautifulSoup 8 | import requests 9 | 10 | 11 | def plot_resumes(): 12 | """Word Clouds""" 13 | 14 | def text_size(total): 15 | return 8 + total / 200 * 20 16 | 17 | for word, job_popularity, resume_popularity in data: 18 | plt.text(job_popularity, resume_popularity, word, 19 | ha='center', 20 | va='center', 21 | size=text_size(job_popularity + resume_popularity)) 22 | 23 | plt.xlabel("Popularity on Job Postings") 24 | plt.ylabel("Popularity on Resumes") 25 | plt.axis([0, 100, 0, 100]) 26 | plt.xticks([]) 27 | plt.yticks([]) 28 | plt.show() 29 | 30 | 31 | """n-grams Model""" 32 | 33 | 34 | def fix_unicode(text): 35 | return text.replace(u"\u2019", "'") 36 | 37 | 38 | def get_document(): 39 | url = "http://radar.oreilly.com/2010/06/what-is-data-science.html" 40 | html = requests.get(url).text 41 | soup = BeautifulSoup(html, 'html5lib') 42 | 43 | content = soup.find("div", "article-body") # find article-body div 44 | regex = r"[\w']+|[\.]" # matches a word or a period 45 | 46 | document = [] 47 | 48 | for paragraph in content("p"): 49 | words = re.findall(regex, fix_unicode(paragraph.text)) 50 | document.extend(words) 51 | 52 | return document 53 | 54 | 55 | def generate_using_bigrams(transitions): 56 | current = "." # this means the next word will start with a sentence 57 | result = [] 58 | while True: 59 | next_word_candidates = transitions[current] # bigrams (current, _) 60 | current = random.choice(next_word_candidates) # choose one at random 61 | result.append(current) # append it to results 62 | if current == ".": 63 | return " ".join(result) # if "." we're done 64 | 65 | 66 | def generate_using_trigrams(starts, transitions): 67 | current = random.choice(starts) # choose a random starting word 68 | prev = "." 69 | result = [current] 70 | while True: 71 | next_word_candidates = transitions[(prev, current)] 72 | next = random.choice(next_word_candidates) 73 | 74 | prev, current = current, next 75 | result.append(current) # append it to results 76 | if current == ".": 77 | return " ".join(result) # if "." 
we're done 78 | 79 | 80 | """Grammars""" 81 | 82 | 83 | def is_terminal(token): 84 | return token[0] != "_" 85 | 86 | 87 | def expand(grammar, tokens): 88 | for i, token in enumerate(tokens): 89 | 90 | # skip over terminals 91 | if is_terminal(token): continue 92 | 93 | # if we get here, we found a non-terminal token 94 | # so we need to choose a replacement at random 95 | replacement = random.choice(grammar[token]) 96 | 97 | if is_terminal(replacement): 98 | tokens[i] = replacement 99 | else: 100 | tokens = tokens[:i] + replacement.split() + tokens[(i + 1):] 101 | 102 | # now call expand on the new list of tokens 103 | return expand(grammar, tokens) 104 | 105 | # if we get here we had all terminals and are done 106 | return tokens 107 | 108 | 109 | def generate_sentence(grammar): 110 | return expand(grammar, ["_S"]) 111 | 112 | 113 | """Gibbs Sampling""" 114 | 115 | 116 | def roll_a_die(): 117 | return random.choice([1, 2, 3, 4, 5, 6]) 118 | 119 | 120 | def direct_sample(): 121 | d1 = roll_a_die() 122 | d2 = roll_a_die() 123 | return d1, d1 + d2 124 | 125 | 126 | def random_y_given_x(x): 127 | return x + roll_a_die() 128 | 129 | 130 | def random_x_given_y(y): 131 | if y <= 7: 132 | return random.randrange(1, y) 133 | else: 134 | return random.randrange(y - 6, 7) 135 | 136 | 137 | def gibbs_sampling(num_iters=100): 138 | x, y = 1, 2 139 | for _ in range(num_iters): 140 | x = random_x_given_y(y) 141 | y = random_y_given_x(x) 142 | return x, y 143 | 144 | 145 | def compare_distributions(num_samples=1000): 146 | counts = defaultdict(lambda: [0, 0]) 147 | for _ in range(num_samples): 148 | counts[gibbs_sampling()][0] += 1 149 | counts[direct_sample()][1] += 1 150 | return counts 151 | 152 | 153 | """Topic Modelling""" 154 | 155 | 156 | def sample_from(weights): 157 | """returns i with probability weights[i] / sum(weights)""" 158 | total = sum(weights) 159 | rnd = total * random.random() # uniform between 0 and total 160 | for i, w in enumerate(weights): 161 | rnd -= w # return the smallest i such 162 | if rnd <= 0: # weights[0] + ... 
 + weights[i] >=rnd 163 | return i 164 | 165 | 166 | K = 4 167 | 168 | document_topic_counts = [Counter() for _ in documents] 169 | # print(document_topic_counts) 170 | topic_word_counts = [Counter() for _ in range(K)] 171 | topic_counts = [0 for _ in range(K)] 172 | document_lengths = [len(d) for d in documents] 173 | 174 | distinct_words = set(word 175 | for document in documents 176 | for word in document) 177 | 178 | W = len(distinct_words) 179 | D = len(documents) 180 | 181 | 182 | def p_topic_given_document(topic, d, alpha=0.1): 183 | """the fraction of words in document 'd' 184 | that are assigned to 'topic' (plus some smoothing)""" 185 | return ((document_topic_counts[d][topic] + alpha) / (document_lengths[d] + K * alpha)) 186 | 187 | 188 | def p_word_given_topic(word, topic, beta=0.1): 189 | """the fraction of words assigned to 'topic' 190 | that equal 'word' (plus some smoothing)""" 191 | return ((topic_word_counts[topic][word] + beta) / (topic_counts[topic] + W * beta)) 192 | 193 | 194 | def topic_weight(d, word, k): 195 | """given a document and a word in that document, 196 | return the weight for the k-th topic""" 197 | return p_word_given_topic(word, k) * p_topic_given_document(k, d) 198 | 199 | 200 | def choose_new_topic(d, word): 201 | return sample_from([topic_weight(d, word, k) 202 | for k in range(K)]) 203 | 204 | 205 | random.seed(0) 206 | document_topics = [[random.randrange(K) for word in document] 207 | for document in documents] 208 | 209 | for d in range(D): 210 | for word, topic in zip(documents[d], document_topics[d]): 211 | document_topic_counts[d][topic] += 1 212 | topic_word_counts[topic][word] += 1 213 | topic_counts[topic] += 1 214 | 215 | for _ in range(1000): 216 | for d in range(D): 217 | for i, (word, topic) in enumerate(zip(documents[d], document_topics[d])): 218 | # remove this word/topic from the counts 219 | # so that it doesn't influence the weights 220 | document_topic_counts[d][topic] -= 1 221 | topic_word_counts[topic][word] -= 1 222 | topic_counts[topic] -= 1 223 | document_lengths[d] -= 1 224 | 225 | # choose a new topic based on the weights 226 | new_topic = choose_new_topic(d, word) 227 | document_topics[d][i] = new_topic 228 | 229 | # and now add it back to the counts, using the newly chosen topic 230 | document_topic_counts[d][new_topic] += 1 231 | topic_word_counts[new_topic][word] += 1 232 | topic_counts[new_topic] += 1 233 | document_lengths[d] += 1 234 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Machine-Learning-with-Python ![GitHub stars](https://img.shields.io/github/stars/devAmoghS/Machine-Learning-with-Python?style=for-the-badge) ![GitHub forks](https://img.shields.io/github/forks/devAmoghS/Machine-Learning-with-Python?label=Forks&style=for-the-badge) 2 | 3 | ## Star History 4 | 5 | [![Star History Chart](https://api.star-history.com/svg?repos=devAmoghS/Machine-Learning-with-Python&type=Date)](https://star-history.com/#devAmoghS/Machine-Learning-with-Python&Date) 6 | 7 | 8 | ![alt text](https://media.istockphoto.com/vectors/machine-learning-3-step-infographic-artificial-intelligence-machine-vector-id962219860?k=6&m=962219860&s=612x612&w=0&h=yricYyUqZbILMHp3IvtenS3xbRDhu1w1u5kk2az5tbo=) 9 | 10 | ## Small-scale machine learning projects to understand the core concepts (order: oldest to newest) 11 | * Topic Modelling using **Latent Dirichlet Allocation** with the newsgroups20 dataset, implemented with Python and Scikit-Learn 12 | * 
Implemented a simple **neural network** built with Keras on the MNIST dataset 13 | * Stock Price Forecasting on Google using **Linear Regression** 14 | * Implemented a simple **social network** to learn the basics of Python 15 | * Implemented a **Naive Bayes Classifier** to filter spam messages on the SpamAssassin Public Corpus 16 | * **Churn Prediction Model** for a banking dataset using Keras and Scikit-Learn 17 | * Implemented **Random Forest** from scratch and built a classifier on the Sonar dataset from the UCI repository 18 | * Simple Linear Regression in Python on a sample dataset 19 | * **Multiple Regression** in Python on a sample dataset 20 | * **PCA and scaling** sample stock data in Python [working_with_data] 21 | * **Decision Trees** in Python on a sample dataset 22 | * **Logistic Regression** in Python on a sample dataset 23 | * Built a neural network in Python to defeat a captcha system 24 | * Helper methods include common operations used in **Statistics, Probability, Linear Algebra and Data Analysis** 25 | * **K-means clustering** with example data; **clustering colors** with k-means; **Bottom-up Hierarchical Clustering** 26 | * Generating Word Clouds 27 | * Sentence generation using n-grams 28 | * Sentence generation using **Grammars and Automata Theory; Gibbs Sampling** 29 | * Topic Modelling using Latent Dirichlet Allocation (LDA) 30 | * Wrapper for using Scikit-Learn's **GridSearchCV** for a **Keras Neural Network** 31 | * **Recommender system** using **cosine similarity**, recommending new interests to users as well as matching users as per common interests 32 | * Implementing different methods for **network analysis** such as **PageRank, Betweenness Centrality, Closeness Centrality, Eigenvector Centrality** 33 | * Implementing methods used for **Hypothesis Inference** such as **P-hacking, A/B Testing, Bayesian Inference** 34 | * Implemented **K-nearest neighbors** for the next presidential election, predicting voting behavior based on nearest neighbors. 35 | 36 | ## Installation notes 37 | MLwP is built using Python 3.5. The easiest way to set up a compatible 38 | environment is to use [Conda](https://conda.io/). This will set up a virtual 39 | environment with the exact version of Python used for development along with all the 40 | dependencies needed to run MLwP. 41 | 42 | 1. [Download and install Conda](https://conda.io/docs/download.html). 43 | 2. Create a Conda environment with Python 3. 44 | 45 | (**Note**: enter ```cd ~``` to go to **$HOME**, then perform these commands) 46 | 47 | ``` 48 | conda create --name *your env name* python=3.5 49 | ``` 50 | 51 | You will get the following; `mlwp-test` is the env name used in this example: 52 | 53 | ``` 54 | Solving environment: done 55 | 56 | ## Package Plan ## 57 | 58 | environment location: /home/user/anaconda3/envs/mlwp-test 59 | 60 | added / updated specs: 61 | - python=3.5 62 | 63 | 64 | The following NEW packages will be INSTALLED: 65 | 66 | ca-certificates: 2018.12.5-0 67 | certifi: 2018.8.24-py35_1 68 | libedit: 3.1.20181209-hc058e9b_0 69 | libffi: 3.2.1-hd88cf55_4 70 | libgcc-ng: 8.2.0-hdf63c60_1 71 | libstdcxx-ng: 8.2.0-hdf63c60_1 72 | ncurses: 6.1-he6710b0_1 73 | openssl: 1.0.2p-h14c3975_0 74 | pip: 10.0.1-py35_0 75 | python: 3.5.6-hc3d631a_0 76 | readline: 7.0-h7b6447c_5 77 | setuptools: 40.2.0-py35_0 78 | sqlite: 3.26.0-h7b6447c_0 79 | tk: 8.6.8-hbc83047_0 80 | wheel: 0.31.1-py35_0 81 | xz: 5.2.4-h14c3975_4 82 | zlib: 1.2.11-h7b6447c_3 83 | 84 | Proceed ([y]/n)? 
*Press y* 85 | 86 | Preparing transaction: done 87 | Verifying transaction: done 88 | Executing transaction: done 89 | # 90 | # To activate this environment, use: 91 | # > source activate mlwp-test 92 | # 93 | # To deactivate an active environment, use: 94 | # > source deactivate 95 | # 96 | 97 | ``` 98 | The environment is successfully created. 99 | 100 | 3. Now activate the Conda environment. 101 | 102 | ``` 103 | source activate *your env name* 104 | ``` 105 | You will get the following 106 | 107 | ``` 108 | (mlwp-test) amogh@hp15X34:~$ 109 | ``` 110 | Enter `conda list` to get the list of available packages 111 | 112 | ``` 113 | (mlwp-test) amogh@hp15X34:~$ conda list 114 | # packages in environment at /home/amogh/anaconda3/envs/mlwp-test: 115 | # 116 | # Name Version Build Channel 117 | ca-certificates 2018.12.5 0 118 | certifi 2018.8.24 py35_1 119 | libedit 3.1.20181209 hc058e9b_0 120 | libffi 3.2.1 hd88cf55_4 121 | libgcc-ng 8.2.0 hdf63c60_1 122 | libstdcxx-ng 8.2.0 hdf63c60_1 123 | ncurses 6.1 he6710b0_1 124 | openssl 1.0.2p h14c3975_0 125 | pip 10.0.1 py35_0 126 | python 3.5.6 hc3d631a_0 127 | readline 7.0 h7b6447c_5 128 | setuptools 40.2.0 py35_0 129 | sqlite 3.26.0 h7b6447c_0 130 | tk 8.6.8 hbc83047_0 131 | wheel 0.31.1 py35_0 132 | xz 5.2.4 h14c3975_4 133 | zlib 1.2.11 h7b6447c_3 134 | ``` 135 | 136 | 4. Install the required dependencies. 137 | 138 | ``` 139 | (mlwp-test) amogh@hp15X34:~$ conda install --yes --file *path to requirements.txt* 140 | ``` 141 | 142 | 5. In case you are not able to install the packages or getting `PackagesNotFoundError` 143 | Use the following command ` conda install -c conda-forge *list of packages separated by space*`. For more info, refer issue [#3](https://github.com/devAmoghS/Machine-Learning-with-Python/issues/3) **Unable to install requirements** 144 | 145 | 146 | ## How good is the code ? 147 | * It is well tested 148 | * It passes style checks (PEP8 compliant) 149 | * It can compile in its current state (and there are relatively no issues) 150 | 151 | ## How much support is available? 152 | * FAQs (coming soon) 153 | * Documentation (coming soon) 154 | 155 | ## Issues 156 | Feel free to submit issues and enhancement requests. 157 | 158 | ## Contributing 159 | Please refer to each project's style guidelines and guidelines for submitting patches and additions. In general, we follow the "fork-and-pull" Git workflow. 160 | 161 | 1. **Fork** the repo on GitHub 162 | 2. **Clone** the project to your own machine 163 | 3. **Commit** changes to your own branch 164 | 4. **Push** your work back up to your fork 165 | 5. Submit a **Pull request** so that we can review your changes 166 | 167 | NOTE: Be sure to merge the latest from "upstream" before making a pull request! 
168 | -------------------------------------------------------------------------------- /sonar_clf_rf.py: -------------------------------------------------------------------------------- 1 | from csv import reader 2 | from math import sqrt 3 | from random import randrange, seed 4 | 5 | 6 | def load_csv(filename): 7 | """This method loads a csv file""" 8 | dataset = list() 9 | with open(filename, 'r') as file: 10 | csv_reader = reader(file) 11 | for row in csv_reader: 12 | if not row: 13 | continue 14 | dataset.append(row) 15 | 16 | return dataset 17 | 18 | 19 | def str_column_to_float(dataset, column): 20 | """This method converts a string column to float""" 21 | for row in dataset: 22 | row[column] = float(row[column].strip()) 23 | 24 | 25 | def str_columm_to_int(dataset, column): 26 | """This method converts a string column to int""" 27 | class_values = [row[column] for row in dataset] 28 | unique = set(class_values) 29 | lookup = dict() 30 | 31 | for i, value in enumerate(unique): 32 | lookup[value] = i 33 | 34 | for row in dataset: 35 | row[column] = lookup[row[column]] 36 | 37 | return lookup 38 | 39 | 40 | def cross_validation_split(dataset, k_folds): 41 | """This method splits a dataset into k folds""" 42 | dataset_split = list() 43 | dataset_copy = list(dataset) 44 | fold_size = int(len(dataset) / k_folds) 45 | 46 | for i in range(k_folds): 47 | fold = list() 48 | while(len(fold) < fold_size): 49 | index = randrange(len(dataset_copy)) 50 | fold.append(dataset_copy.pop(index)) 51 | dataset_split.append(fold) 52 | 53 | return dataset_split 54 | 55 | 56 | def accuracy_score(actual, predicted): 57 | """This method predicts the accuracy percentage""" 58 | correct = 0 59 | for i in range(len(actual)): 60 | if actual[i] == predicted[i]: 61 | correct += 1 62 | 63 | return correct / float(len(actual)) * 100.0 64 | 65 | 66 | def evaluate_algorithm(dataset, algorithm, k_folds, *args): 67 | """This method evaluates the algorithm using a cross validation split""" 68 | folds = cross_validation_split(dataset, k_folds) 69 | scores = list() 70 | 71 | for fold in folds: 72 | train_set = list(folds) 73 | train_set.remove(fold) 74 | train_set = sum(train_set, []) 75 | 76 | test_set = list() 77 | 78 | for row in fold: 79 | row_copy = list(row) 80 | test_set.append(row_copy) 81 | row_copy[-1] = None 82 | 83 | predicted = algorithm(train_set, test_set, *args) 84 | actual = [row[-1] for row in fold] 85 | 86 | accuracy = accuracy_score(actual, predicted) 87 | scores.append(accuracy) 88 | 89 | return scores 90 | 91 | 92 | def test_split(index, value, dataset): 93 | """This method split a dataset based on an attribute and an attribute value""" 94 | left, right = list(), list() 95 | 96 | for row in dataset: 97 | if row[index] < value: 98 | left.append(row) 99 | else: 100 | right.append(row) 101 | 102 | return left, right 103 | 104 | 105 | def gini_index(groups, classes): 106 | """This method calculates the gini index for a split dataset""" 107 | # count all samples at split point 108 | n_instances = float(sum([len(group) for group in groups])) 109 | # sum weighted gini index for each group 110 | gini = 0.0 111 | for group in groups: 112 | size = float(len(group)) 113 | # avoid divide ny zero 114 | if size == 0: 115 | continue 116 | score = 0.0 117 | # score tje group based on the score for each class 118 | for class_val in classes: 119 | p = [row[-1] for row in group].count(class_val) / size 120 | score += p * p 121 | # weight the group score by its relative size 122 | gini += (1.0 - score) * (size / 
n_instances) 123 | 124 | return gini 125 | 126 | 127 | def get_split(dataset, n_features): 128 | """This method selects the best split for the dataset""" 129 | class_values = list(set(row[-1] for row in dataset)) 130 | b_index, b_value, b_score, b_groups = 999, 999, 999, None 131 | features = list() 132 | 133 | while len(features) < n_features: 134 | index = randrange(len(dataset[0]) - 1) 135 | if index not in features: 136 | features.append(index) 137 | 138 | for index in features: 139 | for row in dataset: 140 | groups = test_split(index, row[index], dataset) 141 | gini = gini_index(groups, class_values) 142 | 143 | if gini < b_score: 144 | b_index, b_value, b_score, b_groups = index, row[index], gini, groups 145 | 146 | return {'index':b_index, 'value':b_value, 'groups':b_groups} 147 | 148 | 149 | def to_terminal(group): 150 | """Create a terminal node value""" 151 | outcomes = [row[-1] for row in group] 152 | return max(set(outcomes), key=outcomes.count) 153 | 154 | 155 | def split(node, max_depth, min_size, n_features, depth): 156 | left, right = node['groups'] 157 | del node['groups'] 158 | 159 | # check for a no split 160 | if not left or not right: 161 | node['left'] = node['right'] = to_terminal(left + right) 162 | return 163 | # check for max_depth 164 | if depth >= max_depth: 165 | node['left'], node['right'] = to_terminal(left), to_terminal(right) 166 | return 167 | 168 | # process left child 169 | if len(left) <= min_size: 170 | node['left'] = to_terminal(left) 171 | else: 172 | node['left'] = get_split(left, n_features) 173 | split(node['left'], max_depth, min_size, n_features, depth+1) 174 | 175 | # process right child 176 | if len(right) <= min_size: 177 | node['right'] = to_terminal(right) 178 | else: 179 | node['right'] = get_split(right, n_features) 180 | split(node['right'], max_depth, min_size, n_features, depth+1) 181 | 182 | 183 | def build_tree(train, max_depth, min_size, n_features): 184 | """This method builds a decision tree""" 185 | root = get_split(train, n_features) 186 | split(root, max_depth, min_size, n_features, 1) 187 | return root 188 | 189 | 190 | def predict(node, row): 191 | """This method makes a prediction with a decision tree""" 192 | if row[node['index']] < node['value']: 193 | if isinstance(node['left'], dict): 194 | return predict(node['left'], row) 195 | else: 196 | return node['left'] 197 | else: 198 | if isinstance(node['right'], dict): 199 | return predict(node['right'], row) 200 | else: 201 | return node['right'] 202 | 203 | 204 | def subsample(dataset, ratio): 205 | """This method creates a random subsample from the dataset with replacement""" 206 | sample = list() 207 | n_sample = round(len(dataset) * ratio) 208 | while len(sample) < n_sample: 209 | index = randrange(len(dataset)) 210 | sample.append(dataset[index]) 211 | return sample 212 | 213 | 214 | def bagging_predict(trees, row): 215 | """This method makes a prediction with a list of bagged trees""" 216 | predictions = [predict(tree, row) for tree in trees] 217 | return max(set(predictions), key=predictions.count) 218 | 219 | 220 | def random_forest(train, test, max_depth, min_size, sample_size, n_trees, n_features): 221 | """Random Forest Algorithm""" 222 | trees = list() 223 | for i in range(n_trees): 224 | sample = subsample(train, sample_size) 225 | tree = build_tree(sample, max_depth, min_size, n_features) 226 | trees.append(tree) 227 | predictions = [bagging_predict(trees, row) for row in test] 228 | return predictions 229 | 230 | 231 | """Test run the algorithm""" 232 | seed(2) 233 | # 
load and prepare the data 234 | filename = "/home/amogh/PycharmProjects/deeplearning/indie_projects/sonar_data.csv" 235 | dataset = load_csv(filename) 236 | # convert string attributes to floats 237 | for i in range(0, len(dataset[0]) - 1): 238 | str_column_to_float(dataset, i) 239 | # convert class column to integers 240 | str_columm_to_int(dataset, len(dataset[0]) - 1) 241 | 242 | # evaluate algorithm 243 | k_folds = 5 244 | max_depth = 10 245 | min_size = 1 246 | sample_size = 1.0 247 | n_features = int(sqrt(len(dataset[0]) - 1)) 248 | 249 | for n_trees in [1, 5, 10]: 250 | scores = evaluate_algorithm(dataset, random_forest, k_folds, max_depth, min_size, sample_size, n_trees, n_features) 251 | print("Trees: %d" % n_trees) 252 | print("Scores: %s" % scores) 253 | print("Mean Accuracy: %.3f%%" % (sum(scores) / float(len(scores)))) 254 | --------------------------------------------------------------------------------
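A quick way to sanity-check the from-scratch random forest in `sonar_clf_rf.py` is to run a comparable cross-validated model with scikit-learn, which this repository already uses elsewhere. The sketch below is an illustrative editorial addition, not part of the repo: it assumes a local `sonar_data.csv` with 60 numeric feature columns followed by the class label (the same layout the script above expects), so adjust the path to your copy. `max_features="sqrt"` mirrors the `n_features = int(sqrt(...))` heuristic used above.

```
# Illustrative sanity check (editorial addition): evaluate scikit-learn's
# random forest on the same Sonar data for comparison with the script above.
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

df = pd.read_csv("sonar_data.csv", header=None)  # adjust path as needed
X = df.iloc[:, :-1].values   # 60 numeric features
y = df.iloc[:, -1].values    # class labels ('R' / 'M')

for n_trees in [1, 5, 10]:
    clf = RandomForestClassifier(n_estimators=n_trees, max_depth=10,
                                 max_features="sqrt", random_state=2)
    scores = cross_val_score(clf, X, y, cv=5)  # 5-fold CV, like k_folds=5 above
    print("Trees: %d" % n_trees)
    print("Mean Accuracy: %.3f%%" % (100 * scores.mean()))
```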