├── .github
│   └── workflows
│       └── ci.yaml
├── .gitignore
├── Dockerfile.txt
├── LICENSE
├── MANIFEST.in
├── README.md
├── cover.jpg
├── egeaML
│   ├── __init__.py
│   ├── constants.py
│   ├── datareader.py
│   ├── egeaML.py
│   └── preprocessing.py
├── requirements.txt
├── setup.cfg
├── setup.py
└── tests
    ├── __init__.py
    ├── data_ingestion
    │   ├── __init__.py
    │   ├── fixture.py
    │   ├── test_base.py
    │   └── test_financial_datareader.py
    └── preprocessing
        ├── __init__.py
        └── test_imputation.py

/.github/workflows/ci.yaml:
--------------------------------------------------------------------------------
1 | name: CI
2 | 
3 | on: [push]
4 | 
5 | jobs:
6 |   build:
7 |     runs-on: ubuntu-latest
8 |     strategy:
9 |       matrix:
10 |         python-version: ["3.10"]
11 | 
12 |     steps:
13 |       - uses: actions/checkout@v3
14 |       - name: Set up Python ${{ matrix.python-version }}
15 |         uses: actions/setup-python@v4
16 |         with:
17 |           python-version: ${{ matrix.python-version }}
18 |       - name: Install dependencies
19 |         run: |
20 |           python -m pip install --upgrade pip
21 |           if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
22 |       - name: Lint with Ruff
23 |         run: |
24 |           pip install ruff
25 |           ruff --format=github --target-version=py310 .
26 |         continue-on-error: true
27 |       - name: Test with pytest
28 |         run: |
29 |           coverage run -m pytest -v -s
30 |       - name: Generate Coverage Report
31 |         run: |
32 |           coverage report -m
33 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Compiled source #
2 | ###################
3 | *.com
4 | *.class
5 | *.dll
6 | *.exe
7 | *.o
8 | *.so
9 | 
10 | # Packages #
11 | ############
12 | # it's better to unpack these files and commit the raw source
13 | # git has its own built in compression methods
14 | *.7z
15 | *.dmg
16 | *.gz
17 | *.iso
18 | *.jar
19 | *.rar
20 | *.tar
21 | *.zip
22 | 
23 | # Logs and databases #
24 | ######################
25 | *.log
26 | *.sql
27 | *.sqlite
28 | 
29 | # OS generated files #
30 | ######################
31 | .DS_Store
32 | .DS_Store?
33 | ._*
34 | .Spotlight-V100
35 | .Trashes
36 | ehthumbs.db
37 | Thumbs.db
--------------------------------------------------------------------------------
/Dockerfile.txt:
--------------------------------------------------------------------------------
1 | # The pinned requirements (e.g. scikit-learn 1.2.2, pandas 1.5.3) need Python >= 3.8,
2 | # so the base image matches the Python version used in CI.
3 | FROM python:3.10
4 | 
5 | WORKDIR /Applied_Machine_Learning_with_Python
6 | COPY requirements.txt /Applied_Machine_Learning_with_Python/
7 | RUN pip install -r requirements.txt
8 | 
9 | CMD ["/bin/bash"]
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2019 andreagiussani
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include README.md
2 | recursive-include egeaML/data *
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## Helper functions for the book "Applied Machine Learning with Python"
2 | 
3 | [![PyPi](https://img.shields.io/pypi/v/egeaML.svg)](https://pypi.python.org/pypi/egeaML)
4 | [![Downloads](https://static.pepy.tech/badge/egeaML)](https://pypi.python.org/pypi/egeaML)
5 | [![CI status](https://github.com/andreagiussani/Applied_Machine_Learning_with_Python/workflows/CI/badge.svg)](https://github.com/andreagiussani/Applied_Machine_Learning_with_Python/actions?query=workflow%3ACI+event%3Apush+branch%3Amain)
6 | 
7 | 
10 | 
11 | 
12 | This repository contains the Supplementary Material for the book "Applied Machine Learning with Python", written by Andrea Giussani.
13 | You can find details about the book on the [BUP](https://bup.egeaonline.it) website.
14 | The book was written with the following specific versions of some popular libraries:
15 | - [scikit-learn](https://scikit-learn.org/stable/) version 1.2.2
16 | - [pandas](https://pandas.pydata.org) version 1.5.3
17 | - [xgboost](https://xgboost.readthedocs.io/en/latest/#) version 1.7.4
18 | - [gensim](https://radimrehurek.com/gensim/) version 3.8.1
19 | - [matplotlib](https://matplotlib.org) version 3.7.1
20 | - [seaborn](https://seaborn.pydata.org) version 0.9.0
21 | 
22 | ## How to use the EgeaML Library
23 | The book provides a book-specific module, called **egeaML**.
24 | Be sure you have created a virtualenv, and then run
25 | ```bash
26 | pip install egeaML
27 | ```
28 | Once installed, you can load a structured, labelled dataset - such as the well-known Boston dataset -
29 | as a `pandas.DataFrame`, as follows:
30 | 
31 | ```python
32 | from egeaML.datareader import DataReader
33 | 
34 | raw_data = DataReader(
35 |     filename='https://raw.githubusercontent.com/andreagiussani/datasets/master/egeaML/boston.csv',
36 |     col_target='MEDV'
37 | )
38 | ```
39 | 
40 | Please note that the base code is evolving over time; in case you want to stick to the print version of the book,
41 | be sure to install the `egeaML==0.2.3` version.
42 | 
43 | ## How to develop on the egeaML library
44 | Please clone this repo on your local machine, as follows:
45 | ```bash
46 | git clone https://github.com/andreagiussani/Applied_Machine_Learning_with_Python.git
47 | ```
48 | To install it into your local environment, I recommend creating a virtualenv where you add the necessary requirements, running these commands from your favourite terminal emulator:
49 | ```bash
50 | pip install -r requirements.txt
51 | pip install git+https://github.com/andreagiussani/Applied_Machine_Learning_with_Python.git
52 | ```
53 | 
54 | If, instead, you use the Anaconda system (note that `conda install` cannot fetch a git URL, so the package itself is still installed with pip):
55 | ```bash
56 | conda install --file requirements.txt
57 | pip install git+https://github.com/andreagiussani/Applied_Machine_Learning_with_Python.git
58 | ```
59 | If you have Python3 already installed in your local environment, you can run:
60 | ```bash
61 | python3 -m pip install --upgrade pip
62 | python3 -m pip install git+https://github.com/andreagiussani/Applied_Machine_Learning_with_Python.git
63 | ```
64 | 
65 | ### Unittest each method
66 | As a developer, you should unittest your contribution.
67 | To do so, you simply need to create a dedicated folder inside the `tests` subfolder (or possibly extend an existing one),
68 | and test that your method does exactly what you expect. Please look at the following example to take inspiration:
69 | 
70 | ```python
71 | import unittest
72 | import os
73 | import pandas as pd
74 | 
75 | from egeaML.datareader import DataReader
76 | 
77 | 
78 | class DataIngestionTestCase(unittest.TestCase):
79 |     URL_STRING_NAME = 'https://raw.githubusercontent.com/andreagiussani/datasets/master/egeaML'
80 |     FILENAME_STRING_NAME = 'boston.csv'
81 | 
82 |     def setUp(self):
83 |         self.col_target = 'MEDV'
84 |         self.filename = os.path.join(self.URL_STRING_NAME, self.FILENAME_STRING_NAME)
85 |         self.columns = [
86 |             'CRIM', 'ZN', 'INDUS', 'CHAS', 'NX', 'RM', 'AGE',
87 |             'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV'
88 |         ]
89 |         self.raw_data = DataReader(filename=self.filename, col_target=self.col_target)
90 | 
91 |     def test__load_dataframe(self):
92 |         df = self.raw_data()
93 |         self.assertIsInstance(df, pd.DataFrame)
94 |         self.assertEqual(df.shape[0], 506)
95 |         self.assertEqual(df.shape[1], 14)
96 | ```
97 | The above unittest checks that the output is of type `pandas.DataFrame` and
98 | verifies that the expected output satisfies some characteristics.
99 | 
100 | ## Extra Stuff
101 | If you wish to use the `egeaML` library in a Jupyter notebook, you first need to install the jupyter library,
102 | and then run the following commands:
103 | ```bash
104 | pip install jupyter
105 | python3 -m ipykernel install --user --name=<your_env_name>
106 | ```
107 | where `<your_env_name>` is the name you have assigned to your local environment.
108 | You are now ready to use all the features of this helper!
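
## A Minimal End-to-End Example
As a quick reference, here is a minimal sketch that loads the Boston dataset from the earlier snippet, separates the features from the target, and produces a train-test split; the URL and the `MEDV` column are the same as in the example above:

```python
from egeaML.datareader import DataReader

raw_data = DataReader(
    filename='https://raw.githubusercontent.com/andreagiussani/datasets/master/egeaML/boston.csv',
    col_target='MEDV'
)
# Calling the reader with split_features_target=True returns X and y separately
X, y = raw_data(split_features_target=True)
# 70/30 split with a fixed seed, as implemented by DataReader.split_train_test
X_train, X_test, y_train, y_test = raw_data.split_train_test(test_size=0.3, random_seed=42)
```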
109 | 
110 | ## Submitting Errata
111 | If you have errata for the book, please submit them via the [BUP](https://bup.egeaonline.it) website. In case of possible mistakes within the book-specific module, you can submit a fix as a pull request to this repository.
112 | 
113 | ## How to Cite this Book
114 | 
115 | ```tex
116 | @book{giussani2020,
117 |     TITLE="Applied Machine Learning with Python",
118 |     AUTHOR="Andrea Giussani",
119 |     YEAR="2020",
120 |     PUBLISHER="Bocconi University Press"
121 | }
122 | ```
123 | 
--------------------------------------------------------------------------------
/cover.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andreagiussani/Applied_Machine_Learning_with_Python/89966d54faf344cf90df55532bb1541f64461686/cover.jpg
--------------------------------------------------------------------------------
/egeaML/__init__.py:
--------------------------------------------------------------------------------
1 | 
2 | 
--------------------------------------------------------------------------------
/egeaML/constants.py:
--------------------------------------------------------------------------------
1 | UNNAMED_COLNAME = 'Unnamed'
2 | FILENAME_CONSTANT = 'filename'
3 | COL_TO_DROP_CONSTANT = 'col_to_drop'
4 | COL_TARGET_COLNAME = 'col_target'
--------------------------------------------------------------------------------
/egeaML/datareader.py:
--------------------------------------------------------------------------------
1 | import datetime
2 | from calendar import monthrange
3 | from typing import Union
4 | 
5 | import yfinance as yf
6 | import matplotlib.pyplot as plt
7 | import pandas as pd
8 | import seaborn as sns
9 | 
10 | import requests
11 | from io import BytesIO
12 | from zipfile import ZipFile
13 | 
14 | from concurrent.futures import ThreadPoolExecutor, as_completed
15 | 
16 | from sklearn.model_selection import train_test_split
17 | 
18 | from egeaML.constants import (
19 |     UNNAMED_COLNAME,
20 |     FILENAME_CONSTANT,
21 |     COL_TARGET_COLNAME,
22 |     COL_TO_DROP_CONSTANT,
23 | )
24 | 
25 | import logging
26 | logging.basicConfig(level=logging.INFO)
27 | 
28 | 
29 | class DataReader:
30 |     """
31 |     This class is used to ingest data into the system before preprocessing.
32 |     """
33 | 
34 |     def __init__(self, **args):
35 |         """
36 |         Accepted keyword arguments: filename (path or URL of the .csv file), col_to_drop (optional column to exclude), col_target (name of the target column).
37 |         """
38 |         self.filename = args.get(FILENAME_CONSTANT)
39 |         self.col_to_drop = args.get(COL_TO_DROP_CONSTANT)
40 |         self.col_target = args.get(COL_TARGET_COLNAME)
41 |         self.X = None
42 |         self.y = None
43 | 
44 |     def __call__(self, split_features_target: bool = False):
45 |         """
46 |         This function reads the .csv file and cleans it of unwanted columns.
47 |         If split_features_target is set to True, the function returns the set of features (explanatory variables)
48 |         and the target variable.
49 |         Parameters
50 |         ----------
51 |         split_features_target: bool
52 |             Default value is False; if True, return the set of features and the target variable.
53 |         """
54 |         df = pd.read_csv(self.filename, index_col=False)
55 |         self.df = df = df.loc[:, ~df.columns.str.match(UNNAMED_COLNAME)]  # keep a reference for the plotting utility below
56 |         if split_features_target:
57 |             self.y = df[self.col_target]  # This returns a vector containing the target variable
58 |             self.X = df.drop(self.col_target, axis=1) if self.col_to_drop is None else \
59 |                 df.drop([self.col_to_drop, self.col_target], axis=1)
60 |             return self.X, self.y
61 |         return df
62 | 
63 |     def split_train_test(self, test_size=0.3, random_seed=42):
64 |         """
65 |         This function splits the data into train and test sets; call the reader with split_features_target=True first.
66 |         """
67 |         X_train, X_test, y_train, y_test = train_test_split(
68 |             self.X, self.y, test_size=test_size, random_state=random_seed
69 |         )
70 |         return X_train, X_test, y_train, y_test
71 | 
72 |     def plot_column_distribution(self, variable_name, title_plot, yticklabels):
73 |         """
74 |         This is a graphical utility that plots the distribution of a variable; it requires the reader to have been called first.
75 |         """
76 |         plt.figure(figsize=(8, 5))
77 |         sns.set(font_scale=1.4)
78 |         sns.heatmap(
79 |             pd.DataFrame(self.df[variable_name].value_counts()),
80 |             annot=True,
81 |             fmt='g', cbar=False, cmap='Blues',
82 |             annot_kws={"size": 20},
83 |             yticklabels=yticklabels
84 |         )
85 |         plt.title(title_plot)
86 | 
87 | 
88 | class FinancialDataReader:
89 |     # TODO: to be improved
90 | 
91 |     def __init__(self, stock_name, start_date, end_date):
92 |         self.stock_name = stock_name
93 |         self.start_date = start_date
94 |         self.end_date = end_date
95 |         self._validation_input()
96 | 
97 |     def _validation_input(self):
98 |         if not isinstance(self.stock_name, str):
99 |             raise ValueError('The stock name must be a string')
100 |         if self.start_date > self.end_date:
101 |             raise ValueError('The end date must be greater than the start date.')
102 | 
103 |     def __call__(self):
104 |         df = yf.download(self.stock_name, start=self.start_date, end=self.end_date)
105 |         return df
106 | 
107 | 
108 | class CryptoDataReader:
109 |     """
110 |     Parameters
111 |     ----------
112 |     crypto_name : string
113 |         Cryptocurrency to download
114 |     start_date: datetime, str
115 |         Download start date string (YYYY-MM-DD) or datetime.
116 |     end_date: datetime, str
117 |         Download end date string (YYYY-MM-DD) or datetime.
118 |     timeframe : str
119 |         Valid timeframes: 1s,1m,3m,5m,15m,30m,1h,2h,4h,6h,8h,12h,1d
120 | 
121 |     Examples
122 |     --------
123 |     Using datetime objects:
124 | 
125 |         start_date = datetime.date(2022, 1, 1)
126 |         end_date = datetime.date(2022, 12, 31)
127 | 
128 |         crypto = CryptoDataReader('BTCUSDT', start_date, end_date, '1d')
129 |         data = crypto.get_data()
130 | 
131 |     Using dates as strings:
132 | 
133 |         start_date = '2022-06-30'
134 |         end_date = '2023-03-31'
135 | 
136 |         crypto = CryptoDataReader('ADAUSDT', start_date, end_date, '1h')
137 |         data = crypto.get_data()
138 |     """
139 | 
140 |     def __init__(self, crypto_name, start_date, end_date, timeframe):
141 |         self.crypto_name = crypto_name.upper()
142 |         self.start_date = start_date
143 |         self.end_date = end_date
144 |         self.timeframe = timeframe
145 |         self._validation_input()
146 | 
147 |     def _validation_input(self):
148 | 
149 |         valid_timeframe = ['1s', '1m', '3m', '5m', '15m', '30m', '1h', '2h', '4h', '6h', '8h', '12h', '1d']
150 |         if self.timeframe not in valid_timeframe:
151 |             raise ValueError(f'Timeframe := {self.timeframe} must be in ({", ".join(valid_timeframe)})')
152 | 
153 |         if isinstance(self.start_date, str) and isinstance(self.end_date, str):
154 |             self.start_date = datetime.datetime.strptime(self.start_date, '%Y-%m-%d').date()
155 |             self.end_date = datetime.datetime.strptime(self.end_date, '%Y-%m-%d').date()
156 | 
157 |         if self.start_date > self.end_date:
158 |             raise ValueError('The end date must be greater than the start date.')
159 | 
160 |         filepath = 'https://raw.githubusercontent.com/binance/binance-public-data/master/data/symbols.txt'
161 |         tickers = pd.read_csv(filepath, header=None)
162 |         if self.crypto_name not in tickers.values:
163 |             raise ValueError(f'{self.crypto_name} is not a valid ticker. Check the available tickers at {filepath}.')
164 | 
165 |     @staticmethod
166 |     def _check_connection() -> bool:
167 |         try:
168 |             with requests.head('http://www.google.com', timeout=5) as response:
169 |                 return response.ok
170 |         except requests.exceptions.RequestException:
171 |             return False
172 | 
173 |     def _get_url(self, date: datetime.date, type: str) -> str:
174 |         """ Create the url from which to download data """
175 |         year, month, day = date.year, date.strftime('%m'), date.strftime('%d')
176 | 
177 |         URL = f"https://data.binance.vision/data/spot/" \
178 |               f"{type}/klines/{self.crypto_name}/{self.timeframe}/{self.crypto_name}-{self.timeframe}-{year}-{month}"
179 |         return URL + ".zip" if type == 'monthly' else URL + f"-{day}.zip"
180 | 
181 |     def _download_data(self, date: datetime.date, type: str) -> Union[pd.DataFrame, bool]:
182 | 
183 |         url = self._get_url(date, type=type)
184 |         with requests.get(url) as response:
185 | 
186 |             if response.status_code == 404:
187 |                 return False
188 | 
189 |             else:
190 |                 zipfile = ZipFile(BytesIO(response.content))
191 |                 with zipfile.open(zipfile.namelist()[0]) as file_in:
192 |                     download = pd.read_csv(file_in,
193 |                                            usecols=[0, 1, 2, 3, 4, 5, 8, 9],
194 |                                            header=None,
195 |                                            names=['Time', 'Open', 'High', 'Low', 'Close', 'Volume', 'Trades', 'Buy_volume'])
196 |                 return download
197 | 
198 |     @staticmethod
199 |     def last_day_of_month(date: datetime.date) -> datetime.date:
200 |         return date.replace(day=monthrange(date.year, date.month)[1])
201 | 
202 |     def _get_dates_to_download(self) -> list:
203 |         adjusted_end_date = self.last_day_of_month(self.end_date)
204 |         dates_monthly = [(date, 'monthly') for date in pd.date_range(self.start_date, adjusted_end_date, freq='M')]
205 |         dates_daily = [(date, 'daily') for date in pd.date_range(datetime.date.today().replace(day=1), datetime.date.today(), freq='D')
206 |                        if adjusted_end_date >= datetime.date.today()]
207 | 
208 |         return dates_monthly + dates_daily
209 | 
210 |     def get_data(self) -> pd.DataFrame:
211 | 
212 |         if not self._check_connection():
213 |             raise OSError('No connection available')
214 | 
215 |         data = pd.DataFrame(columns=['Time', 'Open', 'High', 'Low', 'Close', 'Volume', 'Trades', 'Buy_volume'])
216 | 
217 |         with ThreadPoolExecutor(max_workers=10) as exe:
218 | 
219 |             dates_to_download = self._get_dates_to_download()
220 |             futures = [exe.submit(self._download_data, date, type) for date, type in dates_to_download]
221 | 
222 |             for future in as_completed(futures):
223 |                 output = future.result()
224 |                 if isinstance(output, pd.DataFrame):
225 |                     data = pd.concat([data, output], axis=0, join='inner')
226 | 
227 |         data.drop_duplicates(subset=['Time'], inplace=True)
228 |         data.Time = pd.to_datetime(data.Time, unit='ms')
229 |         data.set_index(keys='Time', inplace=True)
230 |         data.sort_index(inplace=True)
231 |         data = data.loc[self.start_date:datetime.datetime.combine(self.end_date, datetime.time(23, 59, 59))]
232 | 
233 |         if data.index.min().date() != self.start_date or data.index.max().date() != self.end_date:
234 |             logging.info(f'Data for {self.crypto_name} is only available '
235 |                          f'from {data.index.min().date()} to {data.index.max().date()}')
236 | 
237 |         return data
--------------------------------------------------------------------------------
/egeaML/egeaML.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import print_function
3 | import re
4 | import os
5 | import wget
6 | import pandas as pd
7 | import numpy as np
8 | import xgboost as xgb
9 | import gensim
10 | 
11 | import matplotlib.pyplot as plt
12 | import seaborn as sns
13 | from matplotlib.colors import ListedColormap
14 | plt.style.use('ggplot')
15 | 
16 | 
17 | from sklearn.linear_model import LogisticRegression
18 | from sklearn.svm import SVC
19 | from sklearn.decomposition import PCA
20 | from sklearn.tree import DecisionTreeClassifier
21 | from sklearn.model_selection import GridSearchCV, train_test_split
22 | from sklearn.neighbors import KNeighborsClassifier
23 | from sklearn.metrics import confusion_matrix, accuracy_score, precision_recall_curve
24 | 
25 | 
26 | from gensim.parsing.preprocessing import strip_punctuation
27 | from gensim.parsing.preprocessing import remove_stopwords
28 | from gensim.parsing.preprocessing import strip_multiple_whitespaces
29 | from gensim import models
30 | from gensim.models.doc2vec import TaggedDocument
31 | 
32 | 
33 | from nltk.tokenize import word_tokenize
34 | from nltk.stem.wordnet import WordNetLemmatizer
35 | from nltk.stem.snowball import SnowballStemmer
36 | 
37 | 
38 | class ModelFitting:
39 | 
40 |     def __init__(self, n):
41 |         self.n = n
42 | 
43 |         self.lr = LogisticRegression()
44 |         self.dt = DecisionTreeClassifier()
45 |         self.svc = SVC()
46 | 
47 |         self.my_dict = dict()
48 |         self.abb_list = list()
49 |         self.names_list = list()
50 |         self.raw_scikit_models = list()
51 |         self.clfs = None
52 | 
53 |     def models_def(self, **kwargs):
54 |         self.my_dict[kwargs['model_one']] = kwargs['abb1']
55 |         self.my_dict[kwargs['model_two']] = kwargs['abb2']
56 |         self.my_dict[kwargs['model_three']] = kwargs['abb3']
57 |         return self.my_dict
58 | 
59 |     def get_models(self, **kwargs):
60 |         my_dict = kwargs['models_dict']
61 |         my_list = [kwargs['model_one'], kwargs['model_two'],
62 |                    kwargs['model_three']]
63 |         for name, abb in my_dict.items():
64 |             self.abb_list.append(abb)
65 |             self.names_list.append(name)
66 |         for i in my_list:
67 |             self.raw_scikit_models.append(i + '()')
68 |         scikit_models_list = [self.lr, self.dt, self.svc]
69 |         self.clfs = list(zip(self.names_list, scikit_models_list))
70 |         return self.clfs
71 | 
72 |     def fitting_models(self, models, X_train, y_train, X_test, y_test):
73 |         for name, clf in models:
74 |             clf_ = clf
75 |             clf_.fit(X_train, y_train)
76 |             y_pred = clf_.predict(X_test)
77 |             score = format(accuracy_score(y_test, y_pred), '.4f')
78 |             print("{} : {}".format(name, score))
79 | 
80 | 
81 | class DataUtils:
82 | 
83 |     def __init__(self):
84 |         pass
85 | 
86 |     def download_data(self, foldername, urls, directory='./data'):
87 |         """
88 |         This function downloads an online available dataset.
89 |         The dataset is going to be saved in a folder in the current working directory.
90 |         Parameters
91 |         ----------
92 |         foldername: str
93 |             Folder name where the downloaded data is going to be saved.
94 |             Example: 'IMDb'
95 | 
96 |         urls: list
97 |             List of urls
98 | 
99 |         directory: str
100 |             Directory where the dataset will be stored
101 | 
102 |         Examples
103 |         -------
104 |         > foldername = 'IMDb'
105 |         > download_data(foldername=foldername, urls=[url_imdb], directory='./data_folder')
106 | 
107 |         """
108 |         print("\nDownloading data...")
109 |         directory = os.path.expanduser(directory)
110 |         try:
111 |             if not os.path.exists(directory):
112 |                 os.makedirs(directory)
113 |         except OSError:
114 |             print("The " + directory + " folder already exists!")
115 | 
116 |         dir_data = os.path.join(directory, foldername)
117 |         try:
118 |             if not os.path.exists(dir_data):
119 |                 os.makedirs(dir_data)
120 |         except OSError:
121 |             print("The " + foldername + " folder already exists!")
122 | 
123 |         for dataurl in urls:
124 |             file = dataurl.split("/")[-1]
125 |             if os.path.exists(os.path.join(dir_data, file)):
126 |                 print(file, "already downloaded")
127 |             else:
128 |                 wget.download(url=dataurl, out=dir_data)
129 |         print("\nDownload Finished")
130 | 
131 | 
132 | class LossUtils:
133 | 
134 |     def __init__(self, data):
135 |         self.data = data
136 | 
137 |     def huber_loss(self, c=3):
138 |         return ((abs(self.data) < c) * 0.5 * self.data ** 2 + (abs(self.data) >= c) * -c * (0.5 * c - abs(self.data)))
139 | 
140 |     def logistic_loss(self):
141 |         return np.log(1 + np.exp(-self.data))
142 | 
143 |     def hinge_loss(self):
144 |         return np.maximum(1 - self.data, 0)
145 | 
146 | 
147 | class PlottingUtils:
148 | 
149 |     def plot_pca(X):
150 | 
151 |         def draw_vector(v0, v1, ax=None):
152 |             ax = ax or plt.gca()
153 |             arrowprops = dict(facecolor='black',
154 |                               arrowstyle='->',
155 |                               linewidth=2,
156 |                               shrinkA=0, shrinkB=0)
157 |             ax.annotate('', v1, v0, arrowprops=arrowprops)
158 | 
159 |         pca = PCA(n_components=2, whiten=True)
160 |         pca.fit(X)
161 |         fig, ax = plt.subplots(1, 2, figsize=(16, 6))
162 |         fig.subplots_adjust(left=0.0625, right=0.95, wspace=0.1)
163 |         # plot data
164 |         ax[0].scatter(X[:, 0], X[:, 1], alpha=0.2)
165 |         for length, vector in zip(pca.explained_variance_,
166 |                                    pca.components_):
167 |             v = vector * 3 * np.sqrt(length)
168 |             draw_vector(pca.mean_, pca.mean_ + v, ax=ax[0])
169 |         ax[0].axis('equal')
170 |         ax[0].set(xlabel='x', ylabel='y', title='input')
171 | 
172 |         # plot principal components
173 |         X_pca = pca.transform(X)
174 |         ax[1].scatter(X_pca[:, 0], X_pca[:, 1], alpha=0.2)
175 |         draw_vector([0, 0], [0, 3], ax=ax[1])
176 |         draw_vector([0, 0], [3, 0], ax=ax[1])
177 |         ax[1].axis('equal')
178 |         ax[1].set(xlabel='component 1', ylabel='component 2',
179 |                   title='principal components',
180 |                   xlim=(-5, 5), ylim=(-3, 3.1))
181 | 
182 | 
183 |     def plot_loss(data, model1, model2, model1_name, model2_name, model1_abb, model2_abb, xlim=None, ylim=None):
184 |         plt.plot(data, model1, label=model1_name, linestyle='-')
185 |         plt.plot(data, model2, label=model2_name, linestyle=':')
186 |         plt.ylabel("Loss")
187 |         plt.xlabel("Raw Model Output: $y - f(x)$")
188 |         plt.legend([model1_abb, model2_abb])
189 |         plt.xlim(xlim)
190 |         plt.ylim(ylim)
191 |         plt.show()
192 | 
193 | 
194 | class classification_plots:
195 | 
196 |     @staticmethod
197 |     def training_class(X, y, test_size=0.3):
198 |         """
199 |         This function plots a 2-dim training set,
200 |         and each point is labelled by the class it belongs to.
201 |         The arguments are as follows:
202 |         - X: 2-dim set of features;
203 |         - y: 1-dim target label;
204 |         - test_size: equal to 0.3 by default.
205 |           It can take any number between 0 and 1.
206 |         """
207 |         X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size,
208 |                                                             random_state=42)
209 |         df = pd.DataFrame(dict(height=X_train.iloc[:, 1], weight=X_train.iloc[:, 0],
210 |                                label=y_train))
211 |         colors = {0: 'red', 1: 'blue'}
212 |         fig, ax = plt.subplots()
213 |         grouped = df.groupby('label')
214 |         for key, group in grouped:
215 |             group.plot(ax=ax, kind='scatter', x='height', y='weight', label=key,
216 |                        color=colors[key], figsize=(8, 5))
217 |         plt.legend(["Training Class Female", "Training Class Male"], fontsize=10)
218 |         plt.show()
219 | 
220 |     @staticmethod
221 |     def knn_class(X, y, test_size=0.3):
222 |         """
223 |         This function fits a k-nearest-neighbours classifier and provides the
224 |         visualization of the prediction results with respect to the target.
225 |         """
226 |         X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)
227 |         df = pd.DataFrame(dict(height=X_train.iloc[:, 1], weight=X_train.iloc[:, 0],
228 |                                label=y_train))
229 |         colors = {0: 'red', 1: 'blue'}
230 |         fig, ax = plt.subplots()
231 |         grouped = df.groupby('label')
232 |         for key, group in grouped:
233 |             group.plot(ax=ax, kind='scatter', x='weight', y='height', label=key,
234 |                        color=colors[key], figsize=(5, 5))
235 |         clf = KNeighborsClassifier(n_neighbors=1).fit(X_train, y_train)
236 |         y_pred = clf.predict(X_test)
237 |         df_ = pd.DataFrame(dict(height=X_test.iloc[:, 1], weight=X_test.iloc[:, 0],
238 |                                 label=y_pred))
239 |         colors = {0: 'orange', 1: 'green'}
240 |         grouped_ = df_.groupby('label')
241 |         for key, group in grouped_:
242 |             group.plot(ax=ax, kind='scatter', x='weight', y='height', label=key,
243 |                        color=colors[key], figsize=(5, 5))
244 |         plt.xlabel('Weight', fontsize=14)
245 |         plt.ylabel('Height', fontsize=14)
246 |         plt.legend(["Training Female", "Training Male", "Test Pred Female", "Test Pred Male"],
247 |                    fontsize=10)
248 |         plt.show()
249 | 
250 |     @staticmethod
251 |     def plotting_prediction(X_train, X_test, y_train, y_test, nn):
252 |         """
253 |         This function plots the test set points labelled with the predicted value.
254 |         The parameter nn stands for the Number of Neighbors.
255 |         """
256 |         plt.style.use('ggplot')
257 |         plt.figure(figsize=(5, 5))
258 | 
259 |         clf = KNeighborsClassifier(n_neighbors=nn).fit(X_train, y_train)
260 |         y_pred = clf.predict(X_test)
261 | 
262 |         colors = ['lime' if i else 'yellow' for i in y_test]
263 |         ps = clf.predict_proba(X_test)[:, 1]
264 |         errs = ((ps < 0.5) & y_test) | ((ps >= 0.5) & (1 - y_test))
265 |         plt.scatter(X_test.weight[errs], X_test.height[errs], facecolors='red', s=150)
266 |         plt.scatter(X_test.weight, X_test.height,
267 |                     facecolors=colors, edgecolors='k', s=50, alpha=1)
268 |         plt.xlabel('Weight', fontsize=14)
269 |         plt.ylabel('Height', fontsize=14)
270 |         plt.tight_layout()
271 | 
272 | 
273 |     @staticmethod
274 |     def confusion_matrix(y_test, y_pred, cmap=None, xticklabels=None, yticklabels=None):
275 |         """
276 |         This function generates a confusion matrix, which is used as a
277 |         summary to evaluate a classification predictor.
278 |         The arguments are:
279 |         - y_test: the true labels;
280 |         - y_pred: the predicted labels;
281 |         - cmap: the palette used to color the confusion matrix.
282 |           The available options are:
283 |             - cmap="YlGnBu"
284 |             - cmap="Blues"
285 |             - cmap="BuPu"
286 |             - cmap="Greens"
287 |           Please refer to the notebook available on the book repo,
288 |           Miscellaneous/setting_CMAP_argument_matplotlib.ipynb,
289 |           for further details.
290 |         - xticklabels: list
291 |             description of the x-axis labels;
292 |         - yticklabels: list
293 |             description of the y-axis labels.
294 |         """
295 |         mat = confusion_matrix(y_test, y_pred)
296 |         if not xticklabels:
297 |             sns.heatmap(mat.T, square=True, annot=True, fmt='d', cbar=False, cmap=cmap, annot_kws={"size": 12})
298 |             plt.xlabel('True label')
299 |             plt.ylabel('Predicted label')
300 |             plt.show()
301 |         else:
302 |             sns.heatmap(mat.T, square=True, annot=True, fmt='d', cbar=False, cmap=cmap, annot_kws={"size": 12},
303 |                         xticklabels=xticklabels, yticklabels=yticklabels)
304 |             plt.xlabel('True label')
305 |             plt.ylabel('Predicted label')
306 |             plt.show()
307 | 
308 |     @staticmethod
309 |     def plot_precision_recall(y_test, y_pred):
310 |         """
311 |         Precision/Recall Curve
312 |         Parameters:
313 |         - y_test: the true test labels
314 |         - y_pred: the predicted scores (e.g. probabilities of the positive class)
315 |         """
316 |         precision, recall, thresholds = precision_recall_curve(y_test, y_pred)
317 |         plt.figure(figsize=(8, 6))
318 |         plt.plot(recall, precision, lw=2, color='navy')
319 |         plt.title('Precision-Recall curve on the Diabetes dataset')
320 |         plt.ylabel('Precision')
321 |         plt.xlabel('Recall')
322 |         plt.xlim([0.0, 1.05])
323 |         plt.ylim([0.0, 1.05])
324 |         plt.show()
325 | 
326 |     @staticmethod
327 |     def knn_boundaries(X_train, X_test, y_train, y_test, n_neighbors):
328 |         """
329 |         This function plots the decision boundaries of a k-nearest-neighbours classifier
330 |         """
331 |         # Create color maps
332 |         cmap_bold = ListedColormap(['#FF3333', '#3333FF'])
333 |         cmap_light = ListedColormap(['#e6eff0', '#096b76'])  # (['#FF9999', '#9999FF'])
334 |         clf = KNeighborsClassifier(n_neighbors=n_neighbors)
335 |         clf.fit(X_train, y_train)
336 | 
337 |         # Plot the decision boundary. For that, we will assign a color to each
338 |         # point in the mesh [x_min, x_max]x[y_min, y_max].
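        # Each point of the 0.02-step grid built below is classified with the
        # fitted model; the predictions are then reshaped back onto the grid so
        # that pcolormesh can shade the two decision regions.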
339 |         x_min, x_max = X_test.iloc[:, 0].min() - 1, X_test.iloc[:, 0].max() + 1
340 |         y_min, y_max = X_test.iloc[:, 1].min() - 1, X_test.iloc[:, 1].max() + 1
341 |         xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.02),
342 |                              np.arange(y_min, y_max, 0.02))
343 |         Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
344 | 
345 |         # Put the result into a color plot
346 |         Z = Z.reshape(xx.shape)
347 |         plt.figure(figsize=(8, 6))
348 |         plt.pcolormesh(xx, yy, Z, cmap=cmap_light, linewidths=40)
349 | 
350 | 
351 |         # Plot also the test points
352 |         plt.scatter(X_test.iloc[:, 0], X_test.iloc[:, 1], c=y_test,
353 |                     cmap=cmap_bold, linewidths=3)
354 |         plt.xlim(xx.min(), xx.max())
355 |         plt.ylim(yy.min(), yy.max())
356 |         plt.title("Binary classification (k = %i)"
357 |                   % (n_neighbors))
358 |         plt.show()
359 | 
360 |     # def scaling_plot():
361 |     #     import mglearn
362 |     #     mglearn.plots.plot_scaling()
363 | 
364 |     @staticmethod
365 |     def plot_hist(data, features_name, target_name):
366 |         data = pd.DataFrame(data, columns=features_name)
367 |         plt.figure(figsize=(20, 16))
368 |         features = list(data)
369 |         for i, col in enumerate(features):
370 |             plt.subplot(3, len(features) // 2 + 1, i + 1)  # grid dimensions must be integers
371 |             x = data[col]
372 |             plt.hist(x, 50, density=True, facecolor='g', alpha=0.75)
373 |             plt.title(col)
374 |             plt.xlabel(col)
375 |             plt.ylabel(target_name)
376 | 
377 |     @staticmethod
378 |     def plot_svc_decision_function(model, ax=None, plot_support=True):
379 |         """Plot the decision function for a 2D SVC"""
380 |         if ax is None:
381 |             ax = plt.gca()
382 |         xlim = ax.get_xlim()
383 |         ylim = ax.get_ylim()
384 | 
385 |         # create grid to evaluate model
386 |         x = np.linspace(xlim[0], xlim[1], 30)
387 |         y = np.linspace(ylim[0], ylim[1], 30)
388 |         Y, X = np.meshgrid(y, x)
389 |         xy = np.vstack([X.ravel(), Y.ravel()]).T
390 |         P = model.decision_function(xy).reshape(X.shape)
391 | 
392 |         # plot decision boundary and margins
393 |         ax.contour(X, Y, P, colors='k',
394 |                    levels=[-1, 0, 1], alpha=0.5,
395 |                    linestyles=['--', '-', '--'])
396 | 
397 |         # plot support vectors
398 |         if plot_support:
399 |             ax.scatter(model.support_vectors_[:, 0],
400 |                        model.support_vectors_[:, 1],
401 |                        s=300, linewidth=1, facecolors='none')
402 |         ax.set_xlim(xlim)
403 |         ax.set_ylim(ylim)
404 | 
405 |     @staticmethod
406 |     def plot_svc_regularization_effect(X, y, kernel, cmap):
407 |         fig, ax = plt.subplots(1, 2, figsize=(16, 6))
408 |         fig.subplots_adjust(left=0.0625, right=0.95, wspace=0.1)
409 |         for ax, C in zip(ax, [100.0, 0.1]):
410 |             model = SVC(kernel=kernel, C=C).fit(X, y)
411 |             ax.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap=cmap)
412 |             classification_plots.plot_svc_decision_function(model, ax)
413 |             ax.scatter(model.support_vectors_[:, 0], model.support_vectors_[:, 1],
414 |                        s=300, lw=1, facecolors='none')
415 |             ax.set_title('C = {0:.1f}'.format(C), size=14)
416 |         plt.show()
417 | 
418 | 
419 | class EgeaMLXGBoost:
420 |     def fitting(X, y, param_grid, n_jobs, cv):
421 |         clf_xgb = xgb.XGBClassifier(n_jobs=n_jobs, objective="binary:logistic")
422 |         clf = GridSearchCV(clf_xgb, param_grid=param_grid, verbose=1, cv=cv)
423 |         model = clf.fit(X, y)
424 |         return model
425 | 
426 |     def checking_overfitting(X_train, y_train, learning_rate, n_estimators):
427 |         model__ = xgb.XGBClassifier()
428 |         param_grid_ = dict(learning_rate=learning_rate,
429 |                            n_estimators=n_estimators)
430 |         grid_search = GridSearchCV(model__, param_grid_,
431 |                                    scoring="neg_log_loss",
432 |                                    n_jobs=-1,
433 |                                    cv=10)
434 |         grid_result = grid_search.fit(X_train, y_train)
435 |         print("Best Log Score: %f using %s" % (grid_result.best_score_,
436 |                                              grid_result.best_params_))
437 |         means = grid_result.cv_results_['mean_test_score']
438 |         stds = grid_result.cv_results_['std_test_score']
439 |         params = grid_result.cv_results_['params']
440 |         scores = np.array(means).reshape(len(learning_rate),
441 |                                          len(n_estimators))
442 |         for i, value in enumerate(learning_rate):
443 |             plt.plot(n_estimators, scores[i], label='learning_rate: ' + str(value))
444 |         plt.legend()
445 |         plt.xlabel('n_estimators')
446 |         plt.ylabel('Log Loss')
447 |         plt.savefig('n_estimators_vs_learning_rate.png')
448 | 
449 | 
450 | class EgeaNLP:
451 | 
452 |     @staticmethod
453 |     def clean_text(text):
454 |         stem_ = SnowballStemmer('english')  # build the stemmer and lemmatizer once, not once per word
455 |         lemma = WordNetLemmatizer()
456 |         new_string = []
457 |         for word in gensim.utils.simple_preprocess(text):
458 |             if word not in gensim.parsing.preprocessing.STOPWORDS and len(word) > 2:
459 |                 new = stem_.stem(lemma.lemmatize(word, pos='v'))
460 |                 new_string.append(new)
461 |         return new_string
462 | 
463 |     @staticmethod
464 |     def simple_tokenization(doc):
465 |         """This function performs simple tokenization"""
466 |         tok = re.findall('(\\w+)', doc.lower())
467 |         docs = ' '.join(tok)
468 |         return word_tokenize(docs)
469 | 
470 |     @staticmethod
471 |     def parsing_text(doc):
472 |         """This function removes stopwords and punctuation"""
473 |         return strip_multiple_whitespaces(strip_punctuation(remove_stopwords(doc.lower())))
474 | 
475 | 
476 |     @staticmethod
477 |     def top_words(corpus, dictionary, doc, n_words=5):
478 |         """
479 |         This function returns the n most important words in a particular document
480 |         ----------
481 |         Params:
482 |         corpus: obj
483 |             The corpus on which the tf-idf model is fitted
484 |         dictionary: obj
485 |             This is the gensim dictionary
486 |         doc: obj
487 |             The specific document from which we wish to extract the top words
488 |         n_words: int
489 |             Number of top words to be shown
490 |         """
491 |         tfidf = models.TfidfModel(corpus)
492 |         tf_obj = tfidf[doc]
493 |         sorted_obj = sorted(tf_obj, key=lambda x: x[1], reverse=True)
494 |         top_words = list()
495 |         for obj in sorted_obj[:n_words]:
496 |             top_words.append("{0:s} ({1:01.03f})".format(dictionary[obj[0]], obj[1]))
497 |         return top_words
498 | 
499 |     @staticmethod
500 |     def analogy(model, x1, x2, y1):
501 |         result = model.most_similar(positive=[y1, x2], negative=[x1])
502 |         return result[0][0]
503 | 
504 |     @staticmethod
505 |     def display_similarity(model, words=None, sample=0):
506 |         """
507 |         This method plots the desired list of words in a 2-dim Cartesian plane
508 |         based on their similarity.
509 | 
510 |         """
511 |         if words is None:
512 |             if sample > 0:
513 |                 words = np.random.choice(list(model.key_to_index.keys()), sample)  # gensim >= 4 replaced .vocab with .key_to_index
514 |             else:
515 |                 words = [word for word in model.key_to_index]
516 | 
517 |         word_vectors = np.array([model[w] for w in words])
518 |         pca = PCA().fit_transform(word_vectors)[:, :2]
519 |         shift = 0.1
520 |         plt.figure(figsize=(8, 6))
521 |         plt.scatter(pca[:, 0], pca[:, 1], edgecolors='k', c='b')
522 |         for w, (x1, x2) in zip(words, pca):
523 |             plt.text(x1 + shift, x2 + shift, w)
524 |         plt.xlabel('Component 1')
525 |         plt.ylabel('Component 2')
526 | 
527 |     @staticmethod
528 |     def tagging_doc2vec(docs):
529 |         """
530 |         This function prepares tagged documents for the Doc2vec model
531 |         """
532 |         mylist = list()
533 |         for i, s in enumerate(docs):
534 |             mylist.append(TaggedDocument(s, [i]))
535 |         return mylist
536 | 
537 | 
538 | class EgeaNN:
539 | 
540 |     def __init__(self):
541 |         pass
542 | 
543 |     def plot_data(self, X, y):
544 |         """
545 |         This function plots the raw data
546 |         """
547 |         plt.figure(figsize=(8, 6))
548 |         plt.plot(X[y == 0, 0], X[y == 0, 1], 'or', alpha=0.5, label=0)
549 |         plt.plot(X[y == 1, 0], X[y == 1, 1], 'ob', alpha=0.5, label=1, marker="^")
550 |         plt.xlim((min(X[:, 0]) - 0.1, max(X[:, 0]) + 0.1))
551 |         plt.ylim((min(X[:, 1]) - 0.1, max(X[:, 1]) + 0.1))
552 |         plt.legend()
553 | 
554 |     def make_multiclass(self, n=500, d=2, k=3):
555 |         """
556 |         Parameters:
557 |             n: # points per class
558 |             d: # dimensionality
559 |             k: # of classes
560 |         """
561 | 
562 |         np.random.seed(0)
563 |         X = np.zeros((n*k, d))
564 |         y = np.zeros(n*k)
565 |         for j in range(k):
566 |             ix = range(n*j, n*(j+1))
567 |             r = np.linspace(0.0, 1, n)
568 |             t = np.linspace(j*4, (j+1)*4, n) + np.random.randn(n)*0.2
569 |             X[ix] = np.c_[r*np.sin(t), r*np.cos(t)]
570 |             y[ix] = j
571 |         fig = plt.figure(figsize=(6, 6))
572 |         plt.scatter(X[:, 0], X[:, 1], c=y, s=40, cmap='RdYlBu', alpha=0.8)
573 |         plt.xlim([-1, 1])
574 |         plt.ylim([-1, 1])
575 |         return X, y
576 | 
577 |     def plot_decision_boundary(self, func, X, y):
578 |         figsize = (6, 6)
579 |         amin, bmin = X.min(axis=0) - 0.1
580 |         amax, bmax = X.max(axis=0) + 0.1
581 |         hticks = np.linspace(amin, amax, 101)
582 |         vticks = np.linspace(bmin, bmax, 101)
583 | 
584 |         aa, bb = np.meshgrid(hticks, vticks)
585 |         ab = np.c_[aa.ravel(), bb.ravel()]
586 |         c = func(ab)
587 |         cc = c.reshape(aa.shape)
588 | 
589 |         cm = 'RdBu'
590 |         cm_bright = ListedColormap(['#FF0000', '#0000FF'])
591 | 
592 |         fig, ax = plt.subplots(figsize=figsize)
593 |         contour = plt.contourf(aa, bb, cc, cmap=cm, alpha=0.8)
594 | 
595 |         ax_c = fig.colorbar(contour)
596 |         ax_c.set_label("$P(y = 1)$")
597 |         ax_c.set_ticks([0, 0.25, 0.5, 0.75, 1])
598 | 
599 |         plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cm_bright)
600 |         plt.xlim(amin, amax)
601 |         plt.ylim(bmin, bmax)
602 | 
603 |     def plot_loss_accuracy(self, history):
604 |         historydf = pd.DataFrame(history.history, index=history.epoch)
605 |         plt.figure(figsize=(10, 6))
606 |         historydf.plot(ylim=(0, max(1, historydf.values.max())),
607 |                        style=['+-', '.-'])
608 |         loss = history.history['loss'][-1]
609 |         acc = history.history.get('accuracy', history.history.get('acc'))[-1]  # newer Keras logs 'accuracy', older versions 'acc'
610 |         plt.title('Loss: %.3f, Accuracy: %.3f' % (loss, acc))
611 | 
612 |     def plot_multiclass_decision_boundary(self, model, X, y):
613 |         x_min, x_max = X[:, 0].min() - 0.1, X[:, 0].max() + 0.1
614 |         y_min, y_max = X[:, 1].min() - 0.1, X[:, 1].max() + 0.1
615 |         xx, yy = np.meshgrid(np.linspace(x_min, x_max, 101),
616 |                              np.linspace(y_min, y_max, 101))
617 |         cmap = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])
618 | 
619 |         Z = np.argmax(model.predict(np.c_[xx.ravel(), yy.ravel()],
620 |                                     verbose=0), axis=-1)  # predict_classes was removed in modern Keras
621 |         Z = Z.reshape(xx.shape)
622 |         fig = plt.figure(figsize=(8, 8))
623 |         plt.contourf(xx, yy, Z, cmap=plt.cm.Spectral, alpha=0.8)
624 |         plt.scatter(X[:, 0], X[:, 1], c=y, s=40, cmap='RdYlBu')
625 |         plt.xlim(xx.min(), xx.max())
626 |         plt.ylim(yy.min(), yy.max())
627 | 
--------------------------------------------------------------------------------
/egeaML/preprocessing.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | 
3 | from typing import List
4 | 
5 | 
6 | class Preprocessing:
7 |     """
8 |     This class is aimed at facilitating preprocessing.
9 |     It is made of two main objects: one for imputing null values,
10 |     and one for encoding categorical columns.
11 |     """
12 |     def __init__(self, columns: List, X: pd.DataFrame):
13 |         self.categorical_cols = columns
14 |         self.X = X
15 |         self.df = None
16 | 
17 |     def simple_imputer(self) -> pd.DataFrame:
18 |         """
19 |         This function replaces null values in the input DataFrame with the mode for object columns
20 |         and the median for numeric columns.
21 |         """
22 |         summary_stats = self.X.select_dtypes(include=['object']).mode().to_dict(orient='records')[0]
23 |         summary_stats.update(self.X.select_dtypes(exclude=['object']).median().to_dict())
24 |         self.X.fillna(value=summary_stats, inplace=True)
25 |         return self.X
26 | 
27 |     def dummization(self):
28 |         """
29 |         This function performs dummization of the categorical columns
30 |         """
31 |         # TODO: use sklearn ColumnTransformer instead
32 | 
33 |         return pd.get_dummies(
34 |             self.simple_imputer(),
35 |             prefix_sep='_',
36 |             prefix=self.categorical_cols,
37 |             columns=self.categorical_cols,
38 |             drop_first=False
39 |         )
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | catboost==1.1.1
2 | coverage==4.5.4
3 | gensim==4.3.1
4 | imbalanced-learn==0.5.0
5 | yfinance==0.2.9
6 | matplotlib==3.7.1
7 | nltk==3.8.1
8 | pandas==1.5.3
9 | pytest==7.2.0
10 | requests==2.28.0
11 | seaborn==0.9.0
12 | shap==0.41.0
13 | scikit-learn==1.2.2
14 | tensorflow==2.11.0
15 | xgboost==1.7.4
16 | wget==3.2
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [bumpversion]
2 | current_version = 1.0.2
3 | commit = True
4 | tag = True
5 | 
6 | [bumpversion:file:setup.py]
7 | search = version='{current_version}'
8 | replace = version='{new_version}'
9 | 
10 | [bdist_wheel]
11 | universal = 1
12 | 
13 | [metadata]
14 | description_file=README.md
15 | license_files=LICENSE
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup, find_packages
2 | 
3 | setup(
4 |     name="egeaML",
5 |     version="1.0.2",
6 |     author="Andrea Giussani",
7 |     author_email="andrea.giussani@unibocconi.it",
8 |     description=("A python library used in support of the book "
9 |                  "'Applied Machine Learning with Python'"),
10 |     url="https://github.com/andreagiussani/Applied_Machine_Learning_with_Python",
11 |     license="MIT",
12 |     packages=find_packages(),
13 |     install_requires=[
14 |         'pandas==1.5.3', 'scikit-learn==1.2.2',
15 |         'shap==0.41.0', 'catboost==1.1.1',
16 |         'gensim==4.3.1', 'nltk==3.8.1',
17 |         'matplotlib==3.7.1', 'seaborn==0.9.0', 'wget==3.2',
18 |         'imbalanced-learn==0.5.0', 'tensorflow==2.11.0',
19 | 
'xgboost==1.7.4', 'yfinance==0.2.9' 20 | ], 21 | include_package_data=True, 22 | ) 23 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andreagiussani/Applied_Machine_Learning_with_Python/89966d54faf344cf90df55532bb1541f64461686/tests/__init__.py -------------------------------------------------------------------------------- /tests/data_ingestion/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andreagiussani/Applied_Machine_Learning_with_Python/89966d54faf344cf90df55532bb1541f64461686/tests/data_ingestion/__init__.py -------------------------------------------------------------------------------- /tests/data_ingestion/fixture.py: -------------------------------------------------------------------------------- 1 | from io import StringIO 2 | 3 | 4 | def get_mocked_string_csv(): 5 | return StringIO( 6 | """col1,col2,col3,y 7 | 1,4.4,99,1 8 | 2,4.5,200,1 9 | 3,4.7,65,0 10 | 4,1.5,140,0""" 11 | ) 12 | -------------------------------------------------------------------------------- /tests/data_ingestion/test_base.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import pandas as pd 3 | 4 | from egeaML.datareader import DataReader 5 | from tests.data_ingestion.fixture import get_mocked_string_csv 6 | 7 | 8 | class DataIngestionTestCase(unittest.TestCase): 9 | 10 | def setUp(self): 11 | self.col_target = 'y' 12 | self.filename = get_mocked_string_csv() 13 | self.columns = ['col1', 'col2', 'col3', 'y'] 14 | self.raw_data = DataReader(filename=self.filename, col_target=self.col_target) 15 | 16 | def test__load_dataframe(self): 17 | df = self.raw_data() 18 | self.assertIsInstance(df, pd.DataFrame) 19 | self.assertEqual(df.shape[0], 4) 20 | self.assertEqual(df.shape[1], 4) 21 | self.assertListEqual(list(df), self.columns) 22 | -------------------------------------------------------------------------------- /tests/data_ingestion/test_financial_datareader.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import pandas as pd 3 | 4 | from unittest.mock import MagicMock 5 | from egeaML.datareader import FinancialDataReader 6 | 7 | 8 | class FinancialDataReaderTestCase(unittest.TestCase): 9 | 10 | def setUp(self): 11 | self.columns = ['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume'] 12 | self.raw_data = FinancialDataReader("AAPL", start_date="2023-01-02", end_date="2023-01-04") 13 | self.raw_data = MagicMock(return_value=pd.DataFrame( 14 | [ 15 | [130.279999, 130.899994, 124.169998, 125.070000, 124.879326, 112117500], 16 | [127.279999, 128, 122.879326, 124.970000, 124.5, 102115400], 17 | ], 18 | columns=self.columns 19 | )) 20 | 21 | def test__load_dataframe(self): 22 | df = self.raw_data() 23 | self.assertIsInstance(df, pd.DataFrame) 24 | self.assertEqual(df.shape[0], 2) 25 | self.assertEqual(df.shape[1], 6) 26 | self.assertListEqual(list(df), self.columns) 27 | -------------------------------------------------------------------------------- /tests/preprocessing/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andreagiussani/Applied_Machine_Learning_with_Python/89966d54faf344cf90df55532bb1541f64461686/tests/preprocessing/__init__.py 
--------------------------------------------------------------------------------
/tests/preprocessing/test_imputation.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | import pandas as pd
3 | import numpy as np
4 | 
5 | from egeaML.preprocessing import Preprocessing
6 | 
7 | 
8 | class PreprocessingTestCase(unittest.TestCase):
9 | 
10 |     def setUp(self):
11 |         self.X = pd.DataFrame(
12 |             {
13 |                 'col1': [1, 6, np.nan, 5],
14 |                 'col2': [100, np.nan, np.nan, 30],
15 |                 'col3': ['iphone', 'iphone', np.nan, 'pixel']
16 |             }
17 |         )
18 |         self.transformer = Preprocessing(columns=None, X=self.X)
19 | 
20 |     def test__impute_null_values(self):
21 |         df = self.transformer.simple_imputer()
22 |         self.assertIsInstance(df, pd.DataFrame)
23 |         self.assertEqual(df[~df['col2'].isna()].shape[0], 4)
24 |         self.assertEqual(df.loc[2, 'col3'], 'iphone')
25 |         self.assertEqual(df.loc[2, 'col1'], 5)
26 | 
--------------------------------------------------------------------------------
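
As a closing reference, here is a minimal, self-contained sketch of the `Preprocessing` helper exercised by the test above; the frame and its column names are illustrative only:

```python
import numpy as np
import pandas as pd

from egeaML.preprocessing import Preprocessing

# A toy frame with one numeric and one categorical column, both containing nulls.
X = pd.DataFrame({
    'price': [100.0, np.nan, 30.0],          # numeric nulls are filled with the median
    'device': ['iphone', 'iphone', np.nan],  # object nulls are filled with the mode
})

transformer = Preprocessing(columns=['device'], X=X)
encoded = transformer.dummization()  # imputes nulls first, then one-hot encodes 'device'
print(encoded)
```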