├── .gitignore ├── .pre-commit-config.yaml ├── CODE_OF_CONDUCT.md ├── MIT-LICENSE.TXT ├── Makefile ├── README.md ├── funpymodeling ├── __init__.py ├── data_prep.py ├── exploratory.py ├── model_validation.py └── test │ ├── __init__.py │ └── test_funpymodeling.py ├── notebooks └── quick-start_eng_v1.ipynb ├── poetry.lock └── pyproject.toml /.gitignore: -------------------------------------------------------------------------------- 1 | # Custom 2 | my_env/ 3 | 4 | # General 5 | syntax: glob 6 | .python-version 7 | .venv 8 | env/* 9 | venv/* 10 | ENV/* 11 | .idea/* 12 | .DS_Store 13 | dython.egg*/* 14 | *run_stuff.py* 15 | build/* 16 | dist/* 17 | build_deploy.sh 18 | site/* 19 | debug.py 20 | AUX/ 21 | __pycache__/ 22 | *.py[cod] 23 | *$py.class 24 | *.pyc 25 | *.ipynb_checkpoints/ 26 | funPyModeling.egg-info/ 27 | .ipynb_checkpoints/* 28 | funpymodeling/.ipynb_checkpoints/* 29 | # Distribution / packaging 30 | .Python 31 | build/ 32 | develop-eggs/ 33 | .pytest_cache/ 34 | dist/ 35 | downloads/ 36 | eggs/ 37 | .eggs/ 38 | lib/ 39 | lib64/ 40 | parts/ 41 | sdist/ 42 | var/ 43 | wheels/ 44 | *.egg-info/ 45 | .installed.cfg 46 | *.egg 47 | 48 | # PyInstaller 49 | # Usually these files are written by a python script from a template 50 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
51 | *.manifest 52 | *.spec 53 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://gitlab.com/pycqa/flake8 3 | rev: 3.7.9 4 | hooks: 5 | - id: flake8 6 | name: flake8 except __init__.py 7 | args: [--exclude=__init__.py] 8 | - id: flake8 9 | name: flake8 only __init__.py 10 | args: [--ignore=F401] # ignore imported unused in __init__.py 11 | files: __init__.py 12 | - repo: local 13 | hooks: 14 | - id: pytest 15 | name: Check pytest unit tests pass 16 | entry: make test 17 | language: system -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to making participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, sex characteristics, gender identity and expression, 9 | level of experience, education, socio-economic status, nationality, personal 10 | appearance, race, religion, or sexual identity and orientation. 
11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies both within project spaces and in public spaces 49 | when an individual is representing the project or its community. Examples of 50 | representing a project or community include using an official project e-mail 51 | address, posting via an official social media account, or acting as an appointed 52 | representative at an online or offline event. 
Representation of a project may be 53 | further defined and clarified by project maintainers. 54 | 55 | ## Enforcement 56 | 57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 58 | reported by contacting the project team at {pcasas.biz@gmail.com} . All 59 | complaints will be reviewed and investigated and will result in a response that 60 | is deemed necessary and appropriate to the circumstances. The project team is 61 | obligated to maintain confidentiality with regard to the reporter of an incident. 62 | Further details of specific enforcement policies may be posted separately. 63 | 64 | Project maintainers who do not follow or enforce the Code of Conduct in good 65 | faith may face temporary or permanent repercussions as determined by other 66 | members of the project's leadership. 67 | 68 | ## Attribution 69 | 70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 71 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html 72 | 73 | [homepage]: https://www.contributor-covenant.org 74 | 75 | For answers to common questions about this code of conduct, see 76 | https://www.contributor-covenant.org/faq 77 | -------------------------------------------------------------------------------- /MIT-LICENSE.TXT: -------------------------------------------------------------------------------- 1 | Copyright 2020 Pablo Casas 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining 4 | a copy of this software and associated documentation files (the 5 | "Software"), to deal in the Software without restriction, including 6 | without limitation the rights to use, copy, modify, merge, publish, 7 | distribute, sublicense, and/or sell copies of the Software, and to 8 | permit persons to whom the Software is furnished to do so, subject to 9 | the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be 12 | included in all copies or 
def todf(data):
    """
    Convert a list, numpy array or pandas Series into a pandas DataFrame.

    Supported inputs: 1D/2D lists, 1D/2D numpy arrays, pandas Series and
    pandas DataFrames. Objects with more than 2 dimensions are rejected.

    Parameters:
    -----------
    data: list, numpy array, pandas Series or pandas DataFrame.

    Returns:
    --------
    A pandas dataframe.
    - Series: one column named after the series ('var' when it has no name).
    - 1D list/array: one column named 'var', dtypes inferred by `convert_dtypes`.
    - 2D list/array: one column per array column, dtypes inferred by
      `convert_dtypes`.
    - Anything else (e.g. a DataFrame) is returned unchanged.

    Raises:
    -------
    ValueError (a subclass of Exception) when the object has more than
    2 dimensions.

    Example:
    --------
    >> from numpy import array

    # Different case study:
    >> list_1d = [11, 12, 5, 2]
    >> todf(list_1d)
    >> list_2d = [[11, 12, 5, 2], [15,24, 6,10], [10, 8, 12, 5], [12,15,8,6]]
    >> todf(list_2d)
    >> list_3d = [[[11, 12, 5, 2], [15,24, 6,10], [10, 8, 12, 5], [12,15,8,6]]]
    >> todf(list_3d)  # raises: more than 2 dimensions
    >> array_1d = array(list_1d)
    >> todf(array_1d)
    >> array_2d = array(list_2d)
    >> todf(array_2d)
    >> pd_df=pd.DataFrame({'v1':[11, 12, 5, 2], 'v2':[15,24, 6,10]}) # ok
    >> todf(pd_df)
    >> pd_series=pd_df.v1
    >> todf(pd_series)
    """
    if isinstance(data, list):
        data = np.array(data)

    if len(data.shape) > 2:
        raise ValueError("I live in flattland! (can't handle objects with more than 2 dimensions)")

    if isinstance(data, pd.Series):
        # An unnamed series would otherwise yield a column labelled None
        col_name = 'var' if data.name is None else data.name
        return pd.DataFrame({col_name: data})

    if isinstance(data, np.ndarray):
        # `shape` is a tuple, so the dimensionality must be read from `ndim`
        if data.ndim == 1:
            return pd.DataFrame({'var': data}).convert_dtypes()
        return pd.DataFrame(data).convert_dtypes()

    return data
def status(data):
    """
    Health report of a dataset: for each variable it returns the quantity and
    percentage of NA values (q_nan / p_nan), the quantity and percentage of
    zeros (q_zeros / p_zeros), the number of unique values and the data type.
    `status` can be used for EDA or in a data flow to spot errors or take
    actions based on the result.

    Parameters:
    -----------
    data: It can be a dataframe or a single column, 1D or 2D numpy array. It uses the todf() function.

    Returns:
    --------
    A pandas dataframe with one row per input variable and the columns:
    variable, q_nan, p_nan, q_zeros, p_zeros, unique, type.

    Example:
    --------
    >> import seaborn as sns
    >> iris = sns.load_dataset('iris')
    >> # dataframe as input
    >> status(iris)
    >> # single columns:
    >> status(iris['species'])
    """
    frame = todf(data)

    # total number of rows, denominator of every percentage
    n_rows = len(frame)

    # quantity of NA values per variable
    report = frame.isnull().sum().reset_index()
    report.columns = ['variable', 'q_nan']

    # percentage of NA values
    report['p_nan'] = report['q_nan'] / n_rows

    # quantity and percentage of zeros
    report['q_zeros'] = (frame == 0).sum().values
    report['p_zeros'] = report['q_zeros'] / n_rows

    # number of unique values
    report['unique'] = frame.nunique().values

    # data type of each column, as text
    report['type'] = [str(dtype) for dtype in frame.dtypes.values]

    return report
def corr_pair(data, method='pearson'):
    """
    Calculate the correlations among all numeric features. Non-numeric
    variables are excluded before calling the pandas `corr` function.
    It's useful to quickly extract those correlated input features and the
    correlation between the input and the target variable.

    Parameters:
    -----------
    data: pandas data containing the variables to calculate the correlation
    method: `pearson` as default, same as `corr` function in pandas.

    Returns:
    --------
    A pandas dataframe in long format with the columns v1, v2, R (pairwise
    correlation) and R2 (R squared). The correlation of a variable with
    itself is not included.

    Example:
    --------
    >> import seaborn as sns
    >> iris = sns.load_dataset('iris')
    >> corr_pair(iris)
    """
    data2 = todf(data)

    # Keep the numeric columns explicitly: recent pandas versions no longer
    # drop non-numeric columns silently inside `corr`.
    numeric_mask = [pd.api.types.is_numeric_dtype(dtype) for dtype in data2.dtypes]
    d_num = data2.loc[:, numeric_mask]

    d_cor = d_num.corr(method=method)

    # Clear axis names so that reset_index always produces a column called
    # 'index', whatever the input columns axis was named.
    d_cor = d_cor.rename_axis(index=None, columns=None)

    # to long format: one row per (v1, v2) pair
    d_long = d_cor.reset_index().melt(id_vars='index')
    d_long.columns = ['v1', 'v2', 'R']

    d_long['R2'] = d_long['R'] ** 2

    # the auto-correlation is not needed
    return d_long[d_long['v1'] != d_long['v2']]
def num_vars(data, exclude_var=None):
    """
    Returns the numeric variable names (`int64` and `float64` columns). Useful
    to use with pipelines or any other method in which we need to keep numeric
    variables. `exclude_var` can be a name or a list with the variable names to
    skip in the result. Useful when we want to skip the target variable (i.e.
    in a data transformation).
    It's also available for categorical variables in the function `cat_vars()`

    Parameters:
    -----------
    data: pandas dataframe
    exclude_var: name or list of variable names to exclude from the result.
        Names that are not numeric variables are ignored.

    Returns:
    --------
    A pandas Index with all the numeric variable names.

    Example:
    --------
    >> import seaborn as sns
    >> iris = sns.load_dataset('iris')
    >> num_vars(iris)
    >> num_vars(iris, exclude_var='species')
    """
    num_v = data.select_dtypes(include=['int64', 'float64']).columns
    if exclude_var is not None:
        # errors='ignore': the excluded variable (e.g. a categorical target)
        # may not be among the numeric columns at all
        num_v = num_v.drop(exclude_var, errors='ignore')
    return num_v


def cat_vars(data, exclude_var=None):
    """
    Returns the categoric variable names. Useful to use with pipelines or any
    other method in which we need to keep categorical variables. `exclude_var`
    can be a name or a list with the variable names to skip in the result.
    Useful when we want to skip the target variable (i.e. in a data
    transformation). It will include all `object`, `category` and `string`
    variables.
    It's also available for numeric variables in the function `num_vars()`

    Parameters:
    -----------
    data: pandas dataframe
    exclude_var: name or list of variable names to exclude from the result.
        Names that are not categoric variables are ignored.

    Returns:
    --------
    A pandas Index with all the categoric variable names.

    Example:
    --------
    >> import seaborn as sns
    >> iris = sns.load_dataset('iris')
    >> cat_vars(iris)
    """
    cat_v = data.select_dtypes(include=['object', 'category', 'string']).columns
    if exclude_var is not None:
        # errors='ignore': the excluded variable (e.g. a numeric target)
        # may not be among the categoric columns at all
        cat_v = cat_v.drop(exclude_var, errors='ignore')
    return cat_v
def profiling_num(data):
    """
    Get a metric table with several indicators for all numerical variables,
    automatically skipping the non-numerical variables. Current metrics are:
    mean, std_dev (standard deviation), variation_coef (the ratio
    std_dev/mean) and the percentiles p_0.01, p_0.05, p_0.25, p_0.5, p_0.75,
    p_0.95 and p_0.99.

    Parameters:
    -----------
    data: pandas series/dataframe, numpy 1D/2D array

    Returns:
    --------
    A dataframe in which each row is an input variable, and each column a statistic.

    Example:
    --------
    >> import seaborn as sns
    >> iris = sns.load_dataset('iris')
    >> profiling_num(iris)
    """
    # handling different inputs to dataframe
    frame = todf(data)

    # explicitly keep the numeric variables only
    numeric = frame[num_vars(frame)]

    # central tendency and dispersion, one row per variable
    summary = pd.DataFrame({'mean': numeric.mean(), 'std_dev': numeric.std()})
    summary['variation_coef'] = summary['std_dev'] / summary['mean']

    # percentiles: transposed so that variables are rows, columns get the 'p_' prefix
    percentiles = numeric.quantile([0.01, 0.05, 0.25, 0.5, 0.75, 0.95, 0.99])
    percentiles = percentiles.transpose().add_prefix('p_')

    table = summary.join(percentiles, how='outer')

    # move the variable names from the index to a regular column
    table['variable'] = table.index
    table = table.reset_index(drop=True)

    column_order = ['variable', 'mean', 'std_dev', 'variation_coef', 'p_0.01', 'p_0.05', 'p_0.25', 'p_0.5', 'p_0.75', 'p_0.95', 'p_0.99']
    return table[column_order]
def _freq_tbl_logic(var, name):
    """
    For internal use. Related to `freq_tbl`.

    Parameters:
    -----------
    var: pandas series
    name: column name (string)

    Returns:
    --------
    Dataframe with the columns: `name` (the category), frequency, percentage
    (frequency over the series length) and cumulative_perc (cumulative
    percentage, normalized by the sum of percentages).
    """
    # computed once and reused for both the frequency and the percentage
    counts = var.value_counts()
    df_res = pd.DataFrame({'frequency': counts, 'percentage': counts / len(var)})

    # move the categories from the index to a regular column
    df_res[name] = df_res.index
    df_res = df_res.reset_index(drop=True)

    df_res['cumulative_perc'] = df_res.percentage.cumsum() / df_res.percentage.sum()

    return df_res[[name, 'frequency', 'percentage', 'cumulative_perc']]


def freq_tbl(data):
    """
    Frequency table for categorical variables. It retrieves the frequency,
    percentage and cumulative percentage for each categorical variable
    (excluding the numerical ones).

    Parameters:
    -----------
    data: pandas series/dataframe, numpy 1D/2D array

    Returns:
    --------
    If there is a single categorical variable, it returns the table with the
    results (useful to be used in a process and take actions based on the
    result).
    If there is more than one categorical variable (based on cat_vars), it
    prints the result for each of them in the console and returns None.
    If there is no categorical variable, it returns a text message.

    Example:
    --------
    > import seaborn as sns
    > tips=sns.load_dataset('tips')
    > freq_tbl(tips)
    """
    data = todf(data)

    cat_v = cat_vars(data)
    if len(cat_v) == 0:
        return 'No categorical variables to analyze.'

    if len(cat_v) == 1:
        # if only 1 column, then return the table for that variable
        col = cat_v[0]
        return _freq_tbl_logic(data[col], name=col)

    for col in cat_v:
        print(_freq_tbl_logic(data[col], name=col))
        print('\n----------------------------------------------------------------\n')
    return None
def coord_plot(data, group_var):
    """
    Coordinate plot analysis for clustering models. Also returns the original
    and the normalized (min-max) variable table. Useful to extract the main
    features for each cluster according to the variable means.

    Parameters:
    -----------
    data : Pandas DataFrame containing the variables to analyze the mean across each cluster
    group_var : String indicating the clustering variable name

    Returns:
    --------
    A list containing two data frames. The first contains the mean of each
    numeric variable for each value of group_var. The other data set is
    similar but it is min-max normalized, range [0-1].
    It also draws the coordinate (parallel) plot.

    Example:
    --------
    >> import seaborn as sns
    >> iris = sns.load_dataset('iris')
    # If species is the cluster variable:
    >> coord_plot(iris, 'species')
    """
    # 1- group by cluster, get the means. numeric_only=True: averaging a
    # non-numeric column raises in recent pandas versions.
    x_grp = data.groupby(group_var).mean(numeric_only=True)
    x_grp[group_var] = x_grp.index
    x_grp = x_grp.reset_index(drop=True)  # data with the original variables

    # 2- normalizing the data min-max (the grouping variable is left out)
    x_grp_no_tgt = x_grp.drop(group_var, axis=1)
    x_grp_mm = MinMaxScaler().fit_transform(x_grp_no_tgt)

    # 3- convert to df and put the grouping variable back (scaled variables)
    df_grp_mm = pd.DataFrame(x_grp_mm, columns=x_grp_no_tgt.columns)
    df_grp_mm[group_var] = x_grp[group_var]

    # 4- plot
    parallel_coordinates(df_grp_mm, group_var, colormap=plt.get_cmap("Dark2"))
    plt.xticks(rotation=90)

    return [x_grp, df_grp_mm]
def test_version():
    """The package must report the release version this test suite expects."""
    expected_version = '0.1.7'
    assert __version__ == expected_version
[ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# funpymodeling > Basic usage\n", 8 | "\n", 9 | "Created by Pablo Casas [@pabloc_ds](https://twitter.com/pabloc_ds)\n", 10 | "\n", 11 | "" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 48, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "import pandas as pd\n", 21 | "import matplotlib.pyplot as plt\n", 22 | "import numpy as np\n", 23 | "import seaborn as sns" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 49, 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "# Load some dataframes for this practice:\n", 33 | "iris = sns.load_dataset('iris')\n", 34 | "tips = sns.load_dataset('tips')" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": {}, 40 | "source": [ 41 | "## 1) Exploratory Data Analysis" 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "metadata": {}, 47 | "source": [ 48 | "### 1.1) Dataset health `status`" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 4, 54 | "metadata": {}, 55 | "outputs": [], 56 | "source": [ 57 | "from funpymodeling.exploratory import status" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": {}, 63 | "source": [ 64 | "Support data frame as input:" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 5, 70 | "metadata": {}, 71 | "outputs": [ 72 | { 73 | "data": { 74 | "text/html": [ 75 | "
\n", 76 | "\n", 89 | "\n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | "
variableq_nanp_nanq_zerosp_zerosuniquetype
0sepal_length00.000.035float64
1sepal_width00.000.023float64
2petal_length00.000.043float64
3petal_width00.000.022float64
4species00.000.03object
\n", 155 | "
" 156 | ], 157 | "text/plain": [ 158 | " variable q_nan p_nan q_zeros p_zeros unique type\n", 159 | "0 sepal_length 0 0.0 0 0.0 35 float64\n", 160 | "1 sepal_width 0 0.0 0 0.0 23 float64\n", 161 | "2 petal_length 0 0.0 0 0.0 43 float64\n", 162 | "3 petal_width 0 0.0 0 0.0 22 float64\n", 163 | "4 species 0 0.0 0 0.0 3 object" 164 | ] 165 | }, 166 | "execution_count": 5, 167 | "metadata": {}, 168 | "output_type": "execute_result" 169 | } 170 | ], 171 | "source": [ 172 | "status(iris) " 173 | ] 174 | }, 175 | { 176 | "cell_type": "markdown", 177 | "metadata": {}, 178 | "source": [ 179 | "Supports Pandas series:" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": 6, 185 | "metadata": {}, 186 | "outputs": [ 187 | { 188 | "data": { 189 | "text/html": [ 190 | "
\n", 191 | "\n", 204 | "\n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | "
variableq_nanp_nanq_zerosp_zerosuniquetype
0sepal_width00.000.023float64
\n", 230 | "
" 231 | ], 232 | "text/plain": [ 233 | " variable q_nan p_nan q_zeros p_zeros unique type\n", 234 | "0 sepal_width 0 0.0 0 0.0 23 float64" 235 | ] 236 | }, 237 | "execution_count": 6, 238 | "metadata": {}, 239 | "output_type": "execute_result" 240 | } 241 | ], 242 | "source": [ 243 | "status(iris['sepal_width'])" 244 | ] 245 | }, 246 | { 247 | "cell_type": "markdown", 248 | "metadata": {}, 249 | "source": [ 250 | "Supports 2D numpy array:" 251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": 7, 256 | "metadata": {}, 257 | "outputs": [], 258 | "source": [ 259 | "tips_np=tips.to_numpy()" 260 | ] 261 | }, 262 | { 263 | "cell_type": "code", 264 | "execution_count": 8, 265 | "metadata": {}, 266 | "outputs": [ 267 | { 268 | "data": { 269 | "text/html": [ 270 | "
\n", 271 | "\n", 284 | "\n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | "
variableq_nanp_nanq_zerosp_zerosuniquetype
0000.000.0229float64
1100.000.0123float64
2200.000.02string
3300.000.02string
4400.000.04string
5500.000.02string
6600.000.06Int64
\n", 370 | "
" 371 | ], 372 | "text/plain": [ 373 | " variable q_nan p_nan q_zeros p_zeros unique type\n", 374 | "0 0 0 0.0 0 0.0 229 float64\n", 375 | "1 1 0 0.0 0 0.0 123 float64\n", 376 | "2 2 0 0.0 0 0.0 2 string\n", 377 | "3 3 0 0.0 0 0.0 2 string\n", 378 | "4 4 0 0.0 0 0.0 4 string\n", 379 | "5 5 0 0.0 0 0.0 2 string\n", 380 | "6 6 0 0.0 0 0.0 6 Int64" 381 | ] 382 | }, 383 | "execution_count": 8, 384 | "metadata": {}, 385 | "output_type": "execute_result" 386 | } 387 | ], 388 | "source": [ 389 | "status(tips_np)" 390 | ] 391 | }, 392 | { 393 | "cell_type": "markdown", 394 | "metadata": {}, 395 | "source": [ 396 | "Note: data types form numpy to pandas dataframe are infered by: pandas `convert_dtypes`" 397 | ] 398 | }, 399 | { 400 | "cell_type": "markdown", 401 | "metadata": {}, 402 | "source": [ 403 | "Supports 1D numpy array:" 404 | ] 405 | }, 406 | { 407 | "cell_type": "code", 408 | "execution_count": 9, 409 | "metadata": {}, 410 | "outputs": [ 411 | { 412 | "data": { 413 | "text/html": [ 414 | "
\n", 415 | "\n", 428 | "\n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | "
variableq_nanp_nanq_zerosp_zerosuniquetype
0000.000.04string
\n", 454 | "
" 455 | ], 456 | "text/plain": [ 457 | " variable q_nan p_nan q_zeros p_zeros unique type\n", 458 | "0 0 0 0.0 0 0.0 4 string" 459 | ] 460 | }, 461 | "execution_count": 9, 462 | "metadata": {}, 463 | "output_type": "execute_result" 464 | } 465 | ], 466 | "source": [ 467 | "status(tips_np[:,4])" 468 | ] 469 | }, 470 | { 471 | "cell_type": "code", 472 | "execution_count": null, 473 | "metadata": {}, 474 | "outputs": [], 475 | "source": [ 476 | "\n", 477 | "\n", 478 | "\n", 479 | "\n", 480 | "\n", 481 | "\n", 482 | "\n", 483 | "\n" 484 | ] 485 | }, 486 | { 487 | "cell_type": "markdown", 488 | "metadata": {}, 489 | "source": [ 490 | "### 1.2) Univariate analysis in numeric variables" 491 | ] 492 | }, 493 | { 494 | "cell_type": "code", 495 | "execution_count": 11, 496 | "metadata": {}, 497 | "outputs": [], 498 | "source": [ 499 | "from funpymodeling.exploratory import profiling_num" 500 | ] 501 | }, 502 | { 503 | "cell_type": "markdown", 504 | "metadata": {}, 505 | "source": [ 506 | "`profiling_num` retrieves several statistics for all numeric variables excluding the categorical ones." 507 | ] 508 | }, 509 | { 510 | "cell_type": "markdown", 511 | "metadata": {}, 512 | "source": [ 513 | "Supports dataframe:" 514 | ] 515 | }, 516 | { 517 | "cell_type": "code", 518 | "execution_count": 12, 519 | "metadata": {}, 520 | "outputs": [ 521 | { 522 | "data": { 523 | "text/html": [ 524 | "
\n", 525 | "\n", 538 | "\n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | "
variablemeanstd_devvariation_coefp_0.01p_0.05p_0.25p_0.5p_0.75p_0.95p_0.99
0total_bill19.7859438.9024120.4499367.259.557513.347517.79524.127538.061048.2270
1tip2.9982791.3836380.4614781.001.44002.00002.9003.56255.19557.2145
2size2.5696720.9511000.3701251.002.00002.00002.0003.00004.00006.0000
\n", 600 | "
" 601 | ], 602 | "text/plain": [ 603 | " variable mean std_dev variation_coef p_0.01 p_0.05 p_0.25 \\\n", 604 | "0 total_bill 19.785943 8.902412 0.449936 7.25 9.5575 13.3475 \n", 605 | "1 tip 2.998279 1.383638 0.461478 1.00 1.4400 2.0000 \n", 606 | "2 size 2.569672 0.951100 0.370125 1.00 2.0000 2.0000 \n", 607 | "\n", 608 | " p_0.5 p_0.75 p_0.95 p_0.99 \n", 609 | "0 17.795 24.1275 38.0610 48.2270 \n", 610 | "1 2.900 3.5625 5.1955 7.2145 \n", 611 | "2 2.000 3.0000 4.0000 6.0000 " 612 | ] 613 | }, 614 | "execution_count": 12, 615 | "metadata": {}, 616 | "output_type": "execute_result" 617 | } 618 | ], 619 | "source": [ 620 | "profiling_num(tips)" 621 | ] 622 | }, 623 | { 624 | "cell_type": "markdown", 625 | "metadata": {}, 626 | "source": [ 627 | "Also numpy as before:" 628 | ] 629 | }, 630 | { 631 | "cell_type": "code", 632 | "execution_count": 13, 633 | "metadata": {}, 634 | "outputs": [ 635 | { 636 | "data": { 637 | "text/html": [ 638 | "
\n", 639 | "\n", 652 | "\n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | " \n", 694 | " \n", 695 | " \n", 696 | " \n", 697 | " \n", 698 | " \n", 699 | " \n", 700 | " \n", 701 | " \n", 702 | " \n", 703 | " \n", 704 | " \n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | "
variablemeanstd_devvariation_coefp_0.01p_0.05p_0.25p_0.5p_0.75p_0.95p_0.99
0019.7859438.9024120.4499367.259.557513.347517.79524.127538.06148.227
112.9982791.3836380.46147811.4422.93.56255.19557.2145
262.5696720.9511000.3701251222346
\n", 714 | "
" 715 | ], 716 | "text/plain": [ 717 | " variable mean std_dev variation_coef p_0.01 p_0.05 p_0.25 \\\n", 718 | "0 0 19.785943 8.902412 0.449936 7.25 9.5575 13.3475 \n", 719 | "1 1 2.998279 1.383638 0.461478 1 1.44 2 \n", 720 | "2 6 2.569672 0.951100 0.370125 1 2 2 \n", 721 | "\n", 722 | " p_0.5 p_0.75 p_0.95 p_0.99 \n", 723 | "0 17.795 24.1275 38.061 48.227 \n", 724 | "1 2.9 3.5625 5.1955 7.2145 \n", 725 | "2 2 3 4 6 " 726 | ] 727 | }, 728 | "execution_count": 13, 729 | "metadata": {}, 730 | "output_type": "execute_result" 731 | } 732 | ], 733 | "source": [ 734 | "profiling_num(tips_np)" 735 | ] 736 | }, 737 | { 738 | "cell_type": "markdown", 739 | "metadata": {}, 740 | "source": [ 741 | "Pandas series & 1D array:" 742 | ] 743 | }, 744 | { 745 | "cell_type": "code", 746 | "execution_count": 14, 747 | "metadata": {}, 748 | "outputs": [ 749 | { 750 | "data": { 751 | "text/html": [ 752 | "
\n", 753 | "\n", 766 | "\n", 767 | " \n", 768 | " \n", 769 | " \n", 770 | " \n", 771 | " \n", 772 | " \n", 773 | " \n", 774 | " \n", 775 | " \n", 776 | " \n", 777 | " \n", 778 | " \n", 779 | " \n", 780 | " \n", 781 | " \n", 782 | " \n", 783 | " \n", 784 | " \n", 785 | " \n", 786 | " \n", 787 | " \n", 788 | " \n", 789 | " \n", 790 | " \n", 791 | " \n", 792 | " \n", 793 | " \n", 794 | " \n", 795 | " \n", 796 | " \n", 797 | " \n", 798 | " \n", 799 | "
variablemeanstd_devvariation_coefp_0.01p_0.05p_0.25p_0.5p_0.75p_0.95p_0.99
0total_bill19.7859438.9024120.4499367.259.557513.347517.79524.127538.06148.227
\n", 800 | "
" 801 | ], 802 | "text/plain": [ 803 | " variable mean std_dev variation_coef p_0.01 p_0.05 p_0.25 \\\n", 804 | "0 total_bill 19.785943 8.902412 0.449936 7.25 9.5575 13.3475 \n", 805 | "\n", 806 | " p_0.5 p_0.75 p_0.95 p_0.99 \n", 807 | "0 17.795 24.1275 38.061 48.227 " 808 | ] 809 | }, 810 | "execution_count": 14, 811 | "metadata": {}, 812 | "output_type": "execute_result" 813 | } 814 | ], 815 | "source": [ 816 | "profiling_num(tips['total_bill'])" 817 | ] 818 | }, 819 | { 820 | "cell_type": "code", 821 | "execution_count": 15, 822 | "metadata": {}, 823 | "outputs": [ 824 | { 825 | "data": { 826 | "text/html": [ 827 | "
\n", 828 | "\n", 841 | "\n", 842 | " \n", 843 | " \n", 844 | " \n", 845 | " \n", 846 | " \n", 847 | " \n", 848 | " \n", 849 | " \n", 850 | " \n", 851 | " \n", 852 | " \n", 853 | " \n", 854 | " \n", 855 | " \n", 856 | " \n", 857 | " \n", 858 | " \n", 859 | " \n", 860 | " \n", 861 | " \n", 862 | " \n", 863 | " \n", 864 | " \n", 865 | " \n", 866 | " \n", 867 | " \n", 868 | " \n", 869 | " \n", 870 | " \n", 871 | " \n", 872 | " \n", 873 | " \n", 874 | "
variablemeanstd_devvariation_coefp_0.01p_0.05p_0.25p_0.5p_0.75p_0.95p_0.99
0019.7859438.9024120.4499367.259.557513.347517.79524.127538.06148.227
\n", 875 | "
" 876 | ], 877 | "text/plain": [ 878 | " variable mean std_dev variation_coef p_0.01 p_0.05 p_0.25 \\\n", 879 | "0 0 19.785943 8.902412 0.449936 7.25 9.5575 13.3475 \n", 880 | "\n", 881 | " p_0.5 p_0.75 p_0.95 p_0.99 \n", 882 | "0 17.795 24.1275 38.061 48.227 " 883 | ] 884 | }, 885 | "execution_count": 15, 886 | "metadata": {}, 887 | "output_type": "execute_result" 888 | } 889 | ], 890 | "source": [ 891 | "profiling_num(tips_np[:,0])" 892 | ] 893 | }, 894 | { 895 | "cell_type": "code", 896 | "execution_count": null, 897 | "metadata": {}, 898 | "outputs": [], 899 | "source": [ 900 | "\n", 901 | "\n", 902 | "\n", 903 | "\n", 904 | "\n", 905 | "\n", 906 | "\n", 907 | "\n", 908 | "\n" 909 | ] 910 | }, 911 | { 912 | "cell_type": "markdown", 913 | "metadata": {}, 914 | "source": [ 915 | "### 1.3) Univariate analysis in categorical variables" 916 | ] 917 | }, 918 | { 919 | "cell_type": "code", 920 | "execution_count": 16, 921 | "metadata": {}, 922 | "outputs": [], 923 | "source": [ 924 | "from funpymodeling.exploratory import freq_tbl" 925 | ] 926 | }, 927 | { 928 | "cell_type": "markdown", 929 | "metadata": {}, 930 | "source": [ 931 | "It retrieves several statistics related to categorical variables, such as frequency, percentage and cumulative percentage.\n", 932 | "\n", 933 | "It runs on all categorical variables and ignores all the other ones." 
934 | ] 935 | }, 936 | { 937 | "cell_type": "markdown", 938 | "metadata": {}, 939 | "source": [ 940 | "Just like the others, it supports: pandas dataframe, pandas series and 1D/2D numpy arrays" 941 | ] 942 | }, 943 | { 944 | "cell_type": "code", 945 | "execution_count": 17, 946 | "metadata": {}, 947 | "outputs": [ 948 | { 949 | "name": "stdout", 950 | "output_type": "stream", 951 | "text": [ 952 | " sex frequency percentage cumulative_perc\n", 953 | "0 Male 157 0.643443 0.643443\n", 954 | "1 Female 87 0.356557 1.000000\n", 955 | "\n", 956 | "----------------------------------------------------------------\n", 957 | "\n", 958 | " smoker frequency percentage cumulative_perc\n", 959 | "0 No 151 0.618852 0.618852\n", 960 | "1 Yes 93 0.381148 1.000000\n", 961 | "\n", 962 | "----------------------------------------------------------------\n", 963 | "\n", 964 | " day frequency percentage cumulative_perc\n", 965 | "0 Sat 87 0.356557 0.356557\n", 966 | "1 Sun 76 0.311475 0.668033\n", 967 | "2 Thur 62 0.254098 0.922131\n", 968 | "3 Fri 19 0.077869 1.000000\n", 969 | "\n", 970 | "----------------------------------------------------------------\n", 971 | "\n", 972 | " time frequency percentage cumulative_perc\n", 973 | "0 Dinner 176 0.721311 0.721311\n", 974 | "1 Lunch 68 0.278689 1.000000\n", 975 | "\n", 976 | "----------------------------------------------------------------\n", 977 | "\n" 978 | ] 979 | } 980 | ], 981 | "source": [ 982 | "freq_tbl(tips)" 983 | ] 984 | }, 985 | { 986 | "cell_type": "markdown", 987 | "metadata": {}, 988 | "source": [ 989 | "If 1 variable is provided, it returns the table associated to that variable so we can use in our data pipeline:" 990 | ] 991 | }, 992 | { 993 | "cell_type": "code", 994 | "execution_count": 18, 995 | "metadata": {}, 996 | "outputs": [ 997 | { 998 | "data": { 999 | "text/html": [ 1000 | "
\n", 1001 | "\n", 1014 | "\n", 1015 | " \n", 1016 | " \n", 1017 | " \n", 1018 | " \n", 1019 | " \n", 1020 | " \n", 1021 | " \n", 1022 | " \n", 1023 | " \n", 1024 | " \n", 1025 | " \n", 1026 | " \n", 1027 | " \n", 1028 | " \n", 1029 | " \n", 1030 | " \n", 1031 | " \n", 1032 | " \n", 1033 | " \n", 1034 | " \n", 1035 | " \n", 1036 | " \n", 1037 | " \n", 1038 | " \n", 1039 | " \n", 1040 | " \n", 1041 | " \n", 1042 | " \n", 1043 | " \n", 1044 | " \n", 1045 | " \n", 1046 | " \n", 1047 | " \n", 1048 | " \n", 1049 | " \n", 1050 | " \n", 1051 | " \n", 1052 | " \n", 1053 | " \n", 1054 | "
dayfrequencypercentagecumulative_perc
0Sat870.3565570.356557
1Sun760.3114750.668033
2Thur620.2540980.922131
3Fri190.0778691.000000
\n", 1055 | "
" 1056 | ], 1057 | "text/plain": [ 1058 | " day frequency percentage cumulative_perc\n", 1059 | "0 Sat 87 0.356557 0.356557\n", 1060 | "1 Sun 76 0.311475 0.668033\n", 1061 | "2 Thur 62 0.254098 0.922131\n", 1062 | "3 Fri 19 0.077869 1.000000" 1063 | ] 1064 | }, 1065 | "execution_count": 18, 1066 | "metadata": {}, 1067 | "output_type": "execute_result" 1068 | } 1069 | ], 1070 | "source": [ 1071 | "day_freq=freq_tbl(tips['day'])\n", 1072 | "\n", 1073 | "day_freq" 1074 | ] 1075 | }, 1076 | { 1077 | "cell_type": "markdown", 1078 | "metadata": {}, 1079 | "source": [ 1080 | "Days with high representativity (more than 30% of the rows):" 1081 | ] 1082 | }, 1083 | { 1084 | "cell_type": "code", 1085 | "execution_count": 19, 1086 | "metadata": {}, 1087 | "outputs": [ 1088 | { 1089 | "data": { 1090 | "text/html": [ 1091 | "
\n", 1092 | "\n", 1105 | "\n", 1106 | " \n", 1107 | " \n", 1108 | " \n", 1109 | " \n", 1110 | " \n", 1111 | " \n", 1112 | " \n", 1113 | " \n", 1114 | " \n", 1115 | " \n", 1116 | " \n", 1117 | " \n", 1118 | " \n", 1119 | " \n", 1120 | " \n", 1121 | " \n", 1122 | " \n", 1123 | " \n", 1124 | " \n", 1125 | " \n", 1126 | " \n", 1127 | " \n", 1128 | " \n", 1129 | " \n", 1130 | " \n", 1131 | "
dayfrequencypercentagecumulative_perc
0Sat870.3565570.356557
1Sun760.3114750.668033
\n", 1132 | "
" 1133 | ], 1134 | "text/plain": [ 1135 | " day frequency percentage cumulative_perc\n", 1136 | "0 Sat 87 0.356557 0.356557\n", 1137 | "1 Sun 76 0.311475 0.668033" 1138 | ] 1139 | }, 1140 | "execution_count": 19, 1141 | "metadata": {}, 1142 | "output_type": "execute_result" 1143 | } 1144 | ], 1145 | "source": [ 1146 | "day_freq[day_freq['percentage']>0.3]" 1147 | ] 1148 | }, 1149 | { 1150 | "cell_type": "code", 1151 | "execution_count": 20, 1152 | "metadata": {}, 1153 | "outputs": [ 1154 | { 1155 | "data": { 1156 | "text/plain": [ 1157 | "0 Sat\n", 1158 | "1 Sun\n", 1159 | "Name: day, dtype: category\n", 1160 | "Categories (4, object): ['Thur', 'Fri', 'Sat', 'Sun']" 1161 | ] 1162 | }, 1163 | "execution_count": 20, 1164 | "metadata": {}, 1165 | "output_type": "execute_result" 1166 | } 1167 | ], 1168 | "source": [ 1169 | "day_freq[day_freq['percentage']>0.3]['day']" 1170 | ] 1171 | }, 1172 | { 1173 | "cell_type": "code", 1174 | "execution_count": null, 1175 | "metadata": {}, 1176 | "outputs": [], 1177 | "source": [ 1178 | "\n", 1179 | "\n", 1180 | "\n", 1181 | "\n", 1182 | "\n", 1183 | "\n", 1184 | "\n", 1185 | "\n", 1186 | "\n" 1187 | ] 1188 | }, 1189 | { 1190 | "cell_type": "markdown", 1191 | "metadata": {}, 1192 | "source": [ 1193 | "### 1.4) Pairwise correlation analysis" 1194 | ] 1195 | }, 1196 | { 1197 | "cell_type": "code", 1198 | "execution_count": 21, 1199 | "metadata": {}, 1200 | "outputs": [], 1201 | "source": [ 1202 | "from funpymodeling.exploratory import corr_pair" 1203 | ] 1204 | }, 1205 | { 1206 | "cell_type": "markdown", 1207 | "metadata": {}, 1208 | "source": [ 1209 | "A wrapper around `corr` of pandas that allows us to quickly filter the most (or the least) important variables. \n", 1210 | "\n", 1211 | "Useful in EDA and when doing the features pre-selection before creating the predictive model." 
1212 | ] 1213 | }, 1214 | { 1215 | "cell_type": "code", 1216 | "execution_count": 22, 1217 | "metadata": {}, 1218 | "outputs": [ 1219 | { 1220 | "data": { 1221 | "text/html": [ 1222 | "
\n", 1223 | "\n", 1236 | "\n", 1237 | " \n", 1238 | " \n", 1239 | " \n", 1240 | " \n", 1241 | " \n", 1242 | " \n", 1243 | " \n", 1244 | " \n", 1245 | " \n", 1246 | " \n", 1247 | " \n", 1248 | " \n", 1249 | " \n", 1250 | " \n", 1251 | " \n", 1252 | " \n", 1253 | " \n", 1254 | " \n", 1255 | " \n", 1256 | " \n", 1257 | " \n", 1258 | " \n", 1259 | " \n", 1260 | " \n", 1261 | " \n", 1262 | " \n", 1263 | " \n", 1264 | " \n", 1265 | " \n", 1266 | " \n", 1267 | " \n", 1268 | " \n", 1269 | " \n", 1270 | " \n", 1271 | " \n", 1272 | " \n", 1273 | " \n", 1274 | " \n", 1275 | " \n", 1276 | " \n", 1277 | " \n", 1278 | " \n", 1279 | " \n", 1280 | " \n", 1281 | " \n", 1282 | " \n", 1283 | " \n", 1284 | " \n", 1285 | " \n", 1286 | " \n", 1287 | " \n", 1288 | " \n", 1289 | " \n", 1290 | " \n", 1291 | " \n", 1292 | " \n", 1293 | " \n", 1294 | " \n", 1295 | " \n", 1296 | " \n", 1297 | " \n", 1298 | " \n", 1299 | " \n", 1300 | " \n", 1301 | " \n", 1302 | " \n", 1303 | " \n", 1304 | " \n", 1305 | " \n", 1306 | " \n", 1307 | " \n", 1308 | " \n", 1309 | " \n", 1310 | " \n", 1311 | " \n", 1312 | " \n", 1313 | " \n", 1314 | " \n", 1315 | " \n", 1316 | " \n", 1317 | " \n", 1318 | " \n", 1319 | " \n", 1320 | " \n", 1321 | " \n", 1322 | " \n", 1323 | " \n", 1324 | " \n", 1325 | " \n", 1326 | " \n", 1327 | " \n", 1328 | " \n", 1329 | " \n", 1330 | " \n", 1331 | " \n", 1332 | " \n", 1333 | " \n", 1334 | " \n", 1335 | " \n", 1336 | " \n", 1337 | " \n", 1338 | " \n", 1339 | " \n", 1340 | " \n", 1341 | " \n", 1342 | " \n", 1343 | " \n", 1344 | " \n", 1345 | " \n", 1346 | " \n", 1347 | " \n", 1348 | " \n", 1349 | " \n", 1350 | " \n", 1351 | "
variableq_nanp_nanq_zerosp_zerosuniquetype
0carat00.000.000000273float64
1cut00.000.0000005object
2color00.000.0000007object
3clarity00.000.0000008object
4depth00.000.000000184float64
5table00.000.000000127float64
6price00.000.00000011602int64
7x00.080.000148554float64
8y00.070.000130552float64
9z00.0200.000371375float64
\n", 1352 | "
" 1353 | ], 1354 | "text/plain": [ 1355 | " variable q_nan p_nan q_zeros p_zeros unique type\n", 1356 | "0 carat 0 0.0 0 0.000000 273 float64\n", 1357 | "1 cut 0 0.0 0 0.000000 5 object\n", 1358 | "2 color 0 0.0 0 0.000000 7 object\n", 1359 | "3 clarity 0 0.0 0 0.000000 8 object\n", 1360 | "4 depth 0 0.0 0 0.000000 184 float64\n", 1361 | "5 table 0 0.0 0 0.000000 127 float64\n", 1362 | "6 price 0 0.0 0 0.000000 11602 int64\n", 1363 | "7 x 0 0.0 8 0.000148 554 float64\n", 1364 | "8 y 0 0.0 7 0.000130 552 float64\n", 1365 | "9 z 0 0.0 20 0.000371 375 float64" 1366 | ] 1367 | }, 1368 | "execution_count": 22, 1369 | "metadata": {}, 1370 | "output_type": "execute_result" 1371 | } 1372 | ], 1373 | "source": [ 1374 | "diamonds = sns.load_dataset('diamonds')\n", 1375 | "\n", 1376 | "status(diamonds)" 1377 | ] 1378 | }, 1379 | { 1380 | "cell_type": "code", 1381 | "execution_count": 23, 1382 | "metadata": {}, 1383 | "outputs": [ 1384 | { 1385 | "data": { 1386 | "text/html": [ 1387 | "
\n", 1388 | "\n", 1401 | "\n", 1402 | " \n", 1403 | " \n", 1404 | " \n", 1405 | " \n", 1406 | " \n", 1407 | " \n", 1408 | " \n", 1409 | " \n", 1410 | " \n", 1411 | " \n", 1412 | " \n", 1413 | " \n", 1414 | " \n", 1415 | " \n", 1416 | " \n", 1417 | " \n", 1418 | " \n", 1419 | " \n", 1420 | " \n", 1421 | " \n", 1422 | " \n", 1423 | " \n", 1424 | " \n", 1425 | " \n", 1426 | " \n", 1427 | " \n", 1428 | " \n", 1429 | " \n", 1430 | " \n", 1431 | " \n", 1432 | " \n", 1433 | " \n", 1434 | " \n", 1435 | " \n", 1436 | " \n", 1437 | " \n", 1438 | " \n", 1439 | " \n", 1440 | " \n", 1441 | " \n", 1442 | " \n", 1443 | " \n", 1444 | " \n", 1445 | " \n", 1446 | " \n", 1447 | " \n", 1448 | " \n", 1449 | " \n", 1450 | " \n", 1451 | " \n", 1452 | " \n", 1453 | " \n", 1454 | " \n", 1455 | " \n", 1456 | " \n", 1457 | " \n", 1458 | " \n", 1459 | " \n", 1460 | " \n", 1461 | " \n", 1462 | " \n", 1463 | " \n", 1464 | " \n", 1465 | " \n", 1466 | " \n", 1467 | " \n", 1468 | " \n", 1469 | " \n", 1470 | " \n", 1471 | " \n", 1472 | " \n", 1473 | " \n", 1474 | " \n", 1475 | " \n", 1476 | " \n", 1477 | " \n", 1478 | " \n", 1479 | " \n", 1480 | " \n", 1481 | " \n", 1482 | " \n", 1483 | "
v1v2RR2
1depthcarat0.0282240.000797
2tablecarat0.1816180.032985
3pricecarat0.9215910.849331
4xcarat0.9750940.950809
5ycarat0.9517220.905775
6zcarat0.9533870.908947
7caratdepth0.0282240.000797
9tabledepth-0.2957790.087485
10pricedepth-0.0106470.000113
11xdepth-0.0252890.000640
\n", 1484 | "
" 1485 | ], 1486 | "text/plain": [ 1487 | " v1 v2 R R2\n", 1488 | "1 depth carat 0.028224 0.000797\n", 1489 | "2 table carat 0.181618 0.032985\n", 1490 | "3 price carat 0.921591 0.849331\n", 1491 | "4 x carat 0.975094 0.950809\n", 1492 | "5 y carat 0.951722 0.905775\n", 1493 | "6 z carat 0.953387 0.908947\n", 1494 | "7 carat depth 0.028224 0.000797\n", 1495 | "9 table depth -0.295779 0.087485\n", 1496 | "10 price depth -0.010647 0.000113\n", 1497 | "11 x depth -0.025289 0.000640" 1498 | ] 1499 | }, 1500 | "execution_count": 23, 1501 | "metadata": {}, 1502 | "output_type": "execute_result" 1503 | } 1504 | ], 1505 | "source": [ 1506 | "res=corr_pair(diamonds)\n", 1507 | "\n", 1508 | "res.head(10)" 1509 | ] 1510 | }, 1511 | { 1512 | "cell_type": "markdown", 1513 | "metadata": {}, 1514 | "source": [ 1515 | "If `price` is the target..." 1516 | ] 1517 | }, 1518 | { 1519 | "cell_type": "markdown", 1520 | "metadata": {}, 1521 | "source": [ 1522 | "Feature analysis for predictive modeling:" 1523 | ] 1524 | }, 1525 | { 1526 | "cell_type": "code", 1527 | "execution_count": 24, 1528 | "metadata": {}, 1529 | "outputs": [], 1530 | "source": [ 1531 | "res_target_ordered=res[res['v2']=='price'].sort_values('R2', ascending=False)" 1532 | ] 1533 | }, 1534 | { 1535 | "cell_type": "code", 1536 | "execution_count": null, 1537 | "metadata": {}, 1538 | "outputs": [], 1539 | "source": [ 1540 | "\n", 1541 | "\n", 1542 | "\n", 1543 | "\n" 1544 | ] 1545 | }, 1546 | { 1547 | "cell_type": "markdown", 1548 | "metadata": {}, 1549 | "source": [ 1550 | "Get top 3 most correlated features:" 1551 | ] 1552 | }, 1553 | { 1554 | "cell_type": "code", 1555 | "execution_count": 25, 1556 | "metadata": {}, 1557 | "outputs": [ 1558 | { 1559 | "data": { 1560 | "text/plain": [ 1561 | "21 carat\n", 1562 | "25 x\n", 1563 | "26 y\n", 1564 | "Name: v1, dtype: object" 1565 | ] 1566 | }, 1567 | "execution_count": 25, 1568 | "metadata": {}, 1569 | "output_type": "execute_result" 1570 | } 1571 | ], 1572 | "source": [ 
1573 | "# Top 3:\n", 1574 | "top_vars=res_target_ordered[0:3]['v1']\n", 1575 | "top_vars" 1576 | ] 1577 | }, 1578 | { 1579 | "cell_type": "code", 1580 | "execution_count": null, 1581 | "metadata": {}, 1582 | "outputs": [], 1583 | "source": [ 1584 | "\n", 1585 | "\n", 1586 | "\n", 1587 | "\n" 1588 | ] 1589 | }, 1590 | { 1591 | "cell_type": "markdown", 1592 | "metadata": {}, 1593 | "source": [ 1594 | "On the opposite: Delete less relevant features (threshold R2 < 0.05)" 1595 | ] 1596 | }, 1597 | { 1598 | "cell_type": "code", 1599 | "execution_count": 26, 1600 | "metadata": {}, 1601 | "outputs": [ 1602 | { 1603 | "data": { 1604 | "text/plain": [ 1605 | "23 table\n", 1606 | "22 depth\n", 1607 | "Name: v1, dtype: object" 1608 | ] 1609 | }, 1610 | "execution_count": 26, 1611 | "metadata": {}, 1612 | "output_type": "execute_result" 1613 | } 1614 | ], 1615 | "source": [ 1616 | "res_target_ordered[res_target_ordered['R2']<0.05]['v1']" 1617 | ] 1618 | }, 1619 | { 1620 | "cell_type": "code", 1621 | "execution_count": null, 1622 | "metadata": {}, 1623 | "outputs": [], 1624 | "source": [ 1625 | "\n", 1626 | "\n", 1627 | "\n", 1628 | "\n", 1629 | "\n", 1630 | "\n", 1631 | "\n", 1632 | "\n", 1633 | "\n" 1634 | ] 1635 | }, 1636 | { 1637 | "cell_type": "markdown", 1638 | "metadata": {}, 1639 | "source": [ 1640 | "### 1.5) Get numeric and categorical var names" 1641 | ] 1642 | }, 1643 | { 1644 | "cell_type": "markdown", 1645 | "metadata": {}, 1646 | "source": [ 1647 | "Definitely, this is not fancy but useful internally and used with sklearn pipelines." 1648 | ] 1649 | }, 1650 | { 1651 | "cell_type": "code", 1652 | "execution_count": 27, 1653 | "metadata": {}, 1654 | "outputs": [], 1655 | "source": [ 1656 | "from funpymodeling.exploratory import cat_vars, num_vars" 1657 | ] 1658 | }, 1659 | { 1660 | "cell_type": "code", 1661 | "execution_count": 28, 1662 | "metadata": {}, 1663 | "outputs": [ 1664 | { 1665 | "data": { 1666 | "text/html": [ 1667 | "
\n", 1668 | "\n", 1681 | "\n", 1682 | " \n", 1683 | " \n", 1684 | " \n", 1685 | " \n", 1686 | " \n", 1687 | " \n", 1688 | " \n", 1689 | " \n", 1690 | " \n", 1691 | " \n", 1692 | " \n", 1693 | " \n", 1694 | " \n", 1695 | " \n", 1696 | " \n", 1697 | " \n", 1698 | " \n", 1699 | " \n", 1700 | " \n", 1701 | " \n", 1702 | " \n", 1703 | " \n", 1704 | " \n", 1705 | " \n", 1706 | " \n", 1707 | " \n", 1708 | " \n", 1709 | " \n", 1710 | " \n", 1711 | " \n", 1712 | " \n", 1713 | " \n", 1714 | " \n", 1715 | " \n", 1716 | " \n", 1717 | " \n", 1718 | " \n", 1719 | " \n", 1720 | " \n", 1721 | " \n", 1722 | " \n", 1723 | " \n", 1724 | " \n", 1725 | " \n", 1726 | " \n", 1727 | " \n", 1728 | " \n", 1729 | " \n", 1730 | " \n", 1731 | " \n", 1732 | " \n", 1733 | " \n", 1734 | " \n", 1735 | " \n", 1736 | " \n", 1737 | " \n", 1738 | " \n", 1739 | " \n", 1740 | " \n", 1741 | " \n", 1742 | " \n", 1743 | " \n", 1744 | " \n", 1745 | " \n", 1746 | " \n", 1747 | " \n", 1748 | " \n", 1749 | " \n", 1750 | " \n", 1751 | " \n", 1752 | " \n", 1753 | " \n", 1754 | " \n", 1755 | " \n", 1756 | " \n", 1757 | " \n", 1758 | " \n", 1759 | " \n", 1760 | " \n", 1761 | " \n", 1762 | " \n", 1763 | " \n", 1764 | " \n", 1765 | " \n", 1766 | "
variableq_nanp_nanq_zerosp_zerosuniquetype
0total_bill00.000.0229float64
1tip00.000.0123float64
2sex00.000.02category
3smoker00.000.02category
4day00.000.04category
5time00.000.02category
6size00.000.06int64
\n", 1767 | "
" 1768 | ], 1769 | "text/plain": [ 1770 | " variable q_nan p_nan q_zeros p_zeros unique type\n", 1771 | "0 total_bill 0 0.0 0 0.0 229 float64\n", 1772 | "1 tip 0 0.0 0 0.0 123 float64\n", 1773 | "2 sex 0 0.0 0 0.0 2 category\n", 1774 | "3 smoker 0 0.0 0 0.0 2 category\n", 1775 | "4 day 0 0.0 0 0.0 4 category\n", 1776 | "5 time 0 0.0 0 0.0 2 category\n", 1777 | "6 size 0 0.0 0 0.0 6 int64" 1778 | ] 1779 | }, 1780 | "execution_count": 28, 1781 | "metadata": {}, 1782 | "output_type": "execute_result" 1783 | } 1784 | ], 1785 | "source": [ 1786 | "status(tips)" 1787 | ] 1788 | }, 1789 | { 1790 | "cell_type": "markdown", 1791 | "metadata": {}, 1792 | "source": [ 1793 | "Retrieve categorical var names:" 1794 | ] 1795 | }, 1796 | { 1797 | "cell_type": "code", 1798 | "execution_count": 29, 1799 | "metadata": {}, 1800 | "outputs": [ 1801 | { 1802 | "data": { 1803 | "text/plain": [ 1804 | "Index(['sex', 'smoker', 'day', 'time'], dtype='object')" 1805 | ] 1806 | }, 1807 | "execution_count": 29, 1808 | "metadata": {}, 1809 | "output_type": "execute_result" 1810 | } 1811 | ], 1812 | "source": [ 1813 | "cat_vars(tips)" 1814 | ] 1815 | }, 1816 | { 1817 | "cell_type": "markdown", 1818 | "metadata": {}, 1819 | "source": [ 1820 | "Retrieve numerical var names:" 1821 | ] 1822 | }, 1823 | { 1824 | "cell_type": "code", 1825 | "execution_count": 30, 1826 | "metadata": {}, 1827 | "outputs": [ 1828 | { 1829 | "data": { 1830 | "text/plain": [ 1831 | "Index(['total_bill', 'tip', 'size'], dtype='object')" 1832 | ] 1833 | }, 1834 | "execution_count": 30, 1835 | "metadata": {}, 1836 | "output_type": "execute_result" 1837 | } 1838 | ], 1839 | "source": [ 1840 | "num_vars(tips)" 1841 | ] 1842 | }, 1843 | { 1844 | "cell_type": "code", 1845 | "execution_count": null, 1846 | "metadata": {}, 1847 | "outputs": [], 1848 | "source": [ 1849 | "\n", 1850 | "\n", 1851 | "\n", 1852 | "\n", 1853 | "\n", 1854 | "\n", 1855 | "\n", 1856 | "\n", 1857 | "\n", 1858 | "\n" 1859 | ] 1860 | }, 1861 | { 1862 | 
"cell_type": "markdown", 1863 | "metadata": {}, 1864 | "source": [ 1865 | "## 2) Data Preparation" 1866 | ] 1867 | }, 1868 | { 1869 | "cell_type": "markdown", 1870 | "metadata": {}, 1871 | "source": [ 1872 | "### 2.1) Convert \"almost-everything\" into a pandas dataframe" 1873 | ] 1874 | }, 1875 | { 1876 | "cell_type": "code", 1877 | "execution_count": 31, 1878 | "metadata": {}, 1879 | "outputs": [], 1880 | "source": [ 1881 | "from funpymodeling.data_prep import todf\n", 1882 | "\n", 1883 | "import numpy as np" 1884 | ] 1885 | }, 1886 | { 1887 | "cell_type": "markdown", 1888 | "metadata": {}, 1889 | "source": [ 1890 | "Note: Yes, in certain scenarios this conversion is not convenient for performance reasons. But in many scenarios we just need/want to test something or do a quick exploration.\n", 1891 | "\n", 1892 | "`todf` is used as the entry point in many functions of `funPyModeling`." 1893 | ] 1894 | }, 1895 | { 1896 | "cell_type": "code", 1897 | "execution_count": 32, 1898 | "metadata": {}, 1899 | "outputs": [ 1900 | { 1901 | "data": { 1902 | "text/html": [ 1903 | "
\n", 1904 | "\n", 1917 | "\n", 1918 | " \n", 1919 | " \n", 1920 | " \n", 1921 | " \n", 1922 | " \n", 1923 | " \n", 1924 | " \n", 1925 | " \n", 1926 | " \n", 1927 | " \n", 1928 | " \n", 1929 | " \n", 1930 | " \n", 1931 | " \n", 1932 | " \n", 1933 | " \n", 1934 | " \n", 1935 | " \n", 1936 | " \n", 1937 | " \n", 1938 | " \n", 1939 | " \n", 1940 | " \n", 1941 | " \n", 1942 | "
0
011
112
25
32
\n", 1943 | "
" 1944 | ], 1945 | "text/plain": [ 1946 | " 0\n", 1947 | "0 11\n", 1948 | "1 12\n", 1949 | "2 5\n", 1950 | "3 2" 1951 | ] 1952 | }, 1953 | "execution_count": 32, 1954 | "metadata": {}, 1955 | "output_type": "execute_result" 1956 | } 1957 | ], 1958 | "source": [ 1959 | "# 1D List\n", 1960 | "list_1d = [11, 12, 5, 2] \n", 1961 | "todf(list_1d)" 1962 | ] 1963 | }, 1964 | { 1965 | "cell_type": "code", 1966 | "execution_count": 33, 1967 | "metadata": {}, 1968 | "outputs": [ 1969 | { 1970 | "data": { 1971 | "text/html": [ 1972 | "
\n", 1973 | "\n", 1986 | "\n", 1987 | " \n", 1988 | " \n", 1989 | " \n", 1990 | " \n", 1991 | " \n", 1992 | " \n", 1993 | " \n", 1994 | " \n", 1995 | " \n", 1996 | " \n", 1997 | " \n", 1998 | " \n", 1999 | " \n", 2000 | " \n", 2001 | " \n", 2002 | " \n", 2003 | " \n", 2004 | " \n", 2005 | " \n", 2006 | " \n", 2007 | " \n", 2008 | " \n", 2009 | " \n", 2010 | " \n", 2011 | " \n", 2012 | " \n", 2013 | " \n", 2014 | " \n", 2015 | " \n", 2016 | " \n", 2017 | " \n", 2018 | " \n", 2019 | " \n", 2020 | " \n", 2021 | " \n", 2022 | " \n", 2023 | " \n", 2024 | " \n", 2025 | " \n", 2026 | "
0123
0111252
11524610
2108125
3121586
\n", 2027 | "
" 2028 | ], 2029 | "text/plain": [ 2030 | " 0 1 2 3\n", 2031 | "0 11 12 5 2\n", 2032 | "1 15 24 6 10\n", 2033 | "2 10 8 12 5\n", 2034 | "3 12 15 8 6" 2035 | ] 2036 | }, 2037 | "execution_count": 33, 2038 | "metadata": {}, 2039 | "output_type": "execute_result" 2040 | } 2041 | ], 2042 | "source": [ 2043 | "# 2D List\n", 2044 | "list_2d = [[11, 12, 5, 2], [15,24, 6,10], [10, 8, 12, 5], [12,15,8,6]]\n", 2045 | "todf(list_2d)" 2046 | ] 2047 | }, 2048 | { 2049 | "cell_type": "code", 2050 | "execution_count": 34, 2051 | "metadata": {}, 2052 | "outputs": [ 2053 | { 2054 | "data": { 2055 | "text/html": [ 2056 | "
\n", 2057 | "\n", 2070 | "\n", 2071 | " \n", 2072 | " \n", 2073 | " \n", 2074 | " \n", 2075 | " \n", 2076 | " \n", 2077 | " \n", 2078 | " \n", 2079 | " \n", 2080 | " \n", 2081 | " \n", 2082 | " \n", 2083 | " \n", 2084 | " \n", 2085 | " \n", 2086 | " \n", 2087 | " \n", 2088 | " \n", 2089 | " \n", 2090 | " \n", 2091 | " \n", 2092 | " \n", 2093 | " \n", 2094 | " \n", 2095 | "
0
011
112
25
32
\n", 2096 | "
" 2097 | ], 2098 | "text/plain": [ 2099 | " 0\n", 2100 | "0 11\n", 2101 | "1 12\n", 2102 | "2 5\n", 2103 | "3 2" 2104 | ] 2105 | }, 2106 | "execution_count": 34, 2107 | "metadata": {}, 2108 | "output_type": "execute_result" 2109 | } 2110 | ], 2111 | "source": [ 2112 | "# 1D numpy array\n", 2113 | "array_1d = np.array(list_1d)\n", 2114 | "todf(array_1d)" 2115 | ] 2116 | }, 2117 | { 2118 | "cell_type": "code", 2119 | "execution_count": 35, 2120 | "metadata": {}, 2121 | "outputs": [ 2122 | { 2123 | "data": { 2124 | "text/html": [ 2125 | "
\n", 2126 | "\n", 2139 | "\n", 2140 | " \n", 2141 | " \n", 2142 | " \n", 2143 | " \n", 2144 | " \n", 2145 | " \n", 2146 | " \n", 2147 | " \n", 2148 | " \n", 2149 | " \n", 2150 | " \n", 2151 | " \n", 2152 | " \n", 2153 | " \n", 2154 | " \n", 2155 | " \n", 2156 | " \n", 2157 | " \n", 2158 | " \n", 2159 | " \n", 2160 | " \n", 2161 | " \n", 2162 | " \n", 2163 | " \n", 2164 | " \n", 2165 | " \n", 2166 | " \n", 2167 | " \n", 2168 | " \n", 2169 | " \n", 2170 | " \n", 2171 | " \n", 2172 | " \n", 2173 | " \n", 2174 | " \n", 2175 | " \n", 2176 | " \n", 2177 | " \n", 2178 | " \n", 2179 | "
0123
0111252
11524610
2108125
3121586
\n", 2180 | "
" 2181 | ], 2182 | "text/plain": [ 2183 | " 0 1 2 3\n", 2184 | "0 11 12 5 2\n", 2185 | "1 15 24 6 10\n", 2186 | "2 10 8 12 5\n", 2187 | "3 12 15 8 6" 2188 | ] 2189 | }, 2190 | "execution_count": 35, 2191 | "metadata": {}, 2192 | "output_type": "execute_result" 2193 | } 2194 | ], 2195 | "source": [ 2196 | "# 2D numpy array\n", 2197 | "array_2d = np.array(list_2d)\n", 2198 | "todf(array_2d)" 2199 | ] 2200 | }, 2201 | { 2202 | "cell_type": "code", 2203 | "execution_count": 36, 2204 | "metadata": {}, 2205 | "outputs": [ 2206 | { 2207 | "data": { 2208 | "text/html": [ 2209 | "
\n", 2210 | "\n", 2223 | "\n", 2224 | " \n", 2225 | " \n", 2226 | " \n", 2227 | " \n", 2228 | " \n", 2229 | " \n", 2230 | " \n", 2231 | " \n", 2232 | " \n", 2233 | " \n", 2234 | " \n", 2235 | " \n", 2236 | " \n", 2237 | " \n", 2238 | " \n", 2239 | " \n", 2240 | " \n", 2241 | " \n", 2242 | " \n", 2243 | " \n", 2244 | " \n", 2245 | " \n", 2246 | " \n", 2247 | " \n", 2248 | " \n", 2249 | " \n", 2250 | " \n", 2251 | " \n", 2252 | " \n", 2253 | "
v1v2
01115
11224
256
3210
\n", 2254 | "
" 2255 | ], 2256 | "text/plain": [ 2257 | " v1 v2\n", 2258 | "0 11 15\n", 2259 | "1 12 24\n", 2260 | "2 5 6\n", 2261 | "3 2 10" 2262 | ] 2263 | }, 2264 | "execution_count": 36, 2265 | "metadata": {}, 2266 | "output_type": "execute_result" 2267 | } 2268 | ], 2269 | "source": [ 2270 | "# Data frame (in=out)\n", 2271 | "pd_df=pd.DataFrame({'v1':[11, 12, 5, 2], 'v2':[15,24, 6,10]}) #\n", 2272 | "todf(pd_df)" 2273 | ] 2274 | }, 2275 | { 2276 | "cell_type": "code", 2277 | "execution_count": 37, 2278 | "metadata": {}, 2279 | "outputs": [ 2280 | { 2281 | "data": { 2282 | "text/html": [ 2283 | "
\n", 2284 | "\n", 2297 | "\n", 2298 | " \n", 2299 | " \n", 2300 | " \n", 2301 | " \n", 2302 | " \n", 2303 | " \n", 2304 | " \n", 2305 | " \n", 2306 | " \n", 2307 | " \n", 2308 | " \n", 2309 | " \n", 2310 | " \n", 2311 | " \n", 2312 | " \n", 2313 | " \n", 2314 | " \n", 2315 | " \n", 2316 | " \n", 2317 | " \n", 2318 | " \n", 2319 | " \n", 2320 | " \n", 2321 | " \n", 2322 | "
v1
011
112
25
32
\n", 2323 | "
" 2324 | ], 2325 | "text/plain": [ 2326 | " v1\n", 2327 | "0 11\n", 2328 | "1 12\n", 2329 | "2 5\n", 2330 | "3 2" 2331 | ] 2332 | }, 2333 | "execution_count": 37, 2334 | "metadata": {}, 2335 | "output_type": "execute_result" 2336 | } 2337 | ], 2338 | "source": [ 2339 | "# Pandas series\n", 2340 | "todf(pd_df['v1'])" 2341 | ] 2342 | }, 2343 | { 2344 | "cell_type": "markdown", 2345 | "metadata": {}, 2346 | "source": [ 2347 | "Raise an error in +2D objects:" 2348 | ] 2349 | }, 2350 | { 2351 | "cell_type": "code", 2352 | "execution_count": 41, 2353 | "metadata": {}, 2354 | "outputs": [ 2355 | { 2356 | "name": "stdout", 2357 | "output_type": "stream", 2358 | "text": [ 2359 | "(1, 4, 4)\n" 2360 | ] 2361 | } 2362 | ], 2363 | "source": [ 2364 | "list_3d = np.array([[[11, 12, 5, 2], [15,24, 6,10], [10, 8, 12, 5], [12,15,8,6]]]) # error\n", 2365 | "print(list_3d.shape)\n", 2366 | "# todf(list_3d) # <- error" 2367 | ] 2368 | }, 2369 | { 2370 | "cell_type": "markdown", 2371 | "metadata": {}, 2372 | "source": [ 2373 | "It raises the error: `Exception: I live in flattland! 
(can't handle objects with more than 2 dimensions)`" 2374 | ] 2375 | }, 2376 | { 2377 | "cell_type": "code", 2378 | "execution_count": null, 2379 | "metadata": {}, 2380 | "outputs": [], 2381 | "source": [ 2382 | "\n", 2383 | "\n", 2384 | "\n", 2385 | "\n", 2386 | "\n", 2387 | "\n", 2388 | "\n", 2389 | "\n", 2390 | "\n", 2391 | "\n", 2392 | "\n" 2393 | ] 2394 | }, 2395 | { 2396 | "cell_type": "markdown", 2397 | "metadata": {}, 2398 | "source": [ 2399 | "## 3) Model validation: Clustering" 2400 | ] 2401 | }, 2402 | { 2403 | "cell_type": "code", 2404 | "execution_count": 43, 2405 | "metadata": {}, 2406 | "outputs": [], 2407 | "source": [ 2408 | "from funpymodeling.model_validation import coord_plot" 2409 | ] 2410 | }, 2411 | { 2412 | "cell_type": "code", 2413 | "execution_count": 44, 2414 | "metadata": {}, 2415 | "outputs": [], 2416 | "source": [ 2417 | "from sklearn.cluster import KMeans\n", 2418 | "\n", 2419 | "x = iris.drop('species', axis=1)\n", 2420 | "\n", 2421 | "mod_km=KMeans(n_clusters=3)\n", 2422 | "iris['cluster']=mod_km.fit_predict(x)\n" 2423 | ] 2424 | }, 2425 | { 2426 | "cell_type": "code", 2427 | "execution_count": 45, 2428 | "metadata": {}, 2429 | "outputs": [ 2430 | { 2431 | "data": { 2432 | "image/png": "\n", 2433 | "text/plain": [ 2434 | "
" 2435 | ] 2436 | }, 2437 | "metadata": { 2438 | "needs_background": "light" 2439 | }, 2440 | "output_type": "display_data" 2441 | } 2442 | ], 2443 | "source": [ 2444 | "x_grp, x_grp_norm=coord_plot(iris, 'cluster')" 2445 | ] 2446 | }, 2447 | { 2448 | "cell_type": "code", 2449 | "execution_count": 46, 2450 | "metadata": {}, 2451 | "outputs": [ 2452 | { 2453 | "data": { 2454 | "text/html": [ 2455 | "
\n", 2456 | "\n", 2469 | "\n", 2470 | " \n", 2471 | " \n", 2472 | " \n", 2473 | " \n", 2474 | " \n", 2475 | " \n", 2476 | " \n", 2477 | " \n", 2478 | " \n", 2479 | " \n", 2480 | " \n", 2481 | " \n", 2482 | " \n", 2483 | " \n", 2484 | " \n", 2485 | " \n", 2486 | " \n", 2487 | " \n", 2488 | " \n", 2489 | " \n", 2490 | " \n", 2491 | " \n", 2492 | " \n", 2493 | " \n", 2494 | " \n", 2495 | " \n", 2496 | " \n", 2497 | " \n", 2498 | " \n", 2499 | " \n", 2500 | " \n", 2501 | " \n", 2502 | " \n", 2503 | " \n", 2504 | " \n", 2505 | " \n", 2506 | "
sepal_lengthsepal_widthpetal_lengthpetal_widthcluster
05.0060003.4280001.4620000.2460000
15.9016132.7483874.3935481.4338711
26.8500003.0736845.7421052.0710532
\n", 2507 | "
" 2508 | ], 2509 | "text/plain": [ 2510 | " sepal_length sepal_width petal_length petal_width cluster\n", 2511 | "0 5.006000 3.428000 1.462000 0.246000 0\n", 2512 | "1 5.901613 2.748387 4.393548 1.433871 1\n", 2513 | "2 6.850000 3.073684 5.742105 2.071053 2" 2514 | ] 2515 | }, 2516 | "execution_count": 46, 2517 | "metadata": {}, 2518 | "output_type": "execute_result" 2519 | } 2520 | ], 2521 | "source": [ 2522 | "x_grp" 2523 | ] 2524 | }, 2525 | { 2526 | "cell_type": "code", 2527 | "execution_count": 47, 2528 | "metadata": {}, 2529 | "outputs": [ 2530 | { 2531 | "data": { 2532 | "text/html": [ 2533 | "
\n", 2534 | "\n", 2547 | "\n", 2548 | " \n", 2549 | " \n", 2550 | " \n", 2551 | " \n", 2552 | " \n", 2553 | " \n", 2554 | " \n", 2555 | " \n", 2556 | " \n", 2557 | " \n", 2558 | " \n", 2559 | " \n", 2560 | " \n", 2561 | " \n", 2562 | " \n", 2563 | " \n", 2564 | " \n", 2565 | " \n", 2566 | " \n", 2567 | " \n", 2568 | " \n", 2569 | " \n", 2570 | " \n", 2571 | " \n", 2572 | " \n", 2573 | " \n", 2574 | " \n", 2575 | " \n", 2576 | " \n", 2577 | " \n", 2578 | " \n", 2579 | " \n", 2580 | " \n", 2581 | " \n", 2582 | " \n", 2583 | " \n", 2584 | "
sepal_lengthsepal_widthpetal_lengthpetal_widthcluster
00.000001.0000000.0000000.0000000
10.485690.0000000.6849240.6508691
21.000000.4786511.0000001.0000002
\n", 2585 | "
" 2586 | ], 2587 | "text/plain": [ 2588 | " sepal_length sepal_width petal_length petal_width cluster\n", 2589 | "0 0.00000 1.000000 0.000000 0.000000 0\n", 2590 | "1 0.48569 0.000000 0.684924 0.650869 1\n", 2591 | "2 1.00000 0.478651 1.000000 1.000000 2" 2592 | ] 2593 | }, 2594 | "execution_count": 47, 2595 | "metadata": {}, 2596 | "output_type": "execute_result" 2597 | } 2598 | ], 2599 | "source": [ 2600 | "x_grp_norm" 2601 | ] 2602 | }, 2603 | { 2604 | "cell_type": "code", 2605 | "execution_count": null, 2606 | "metadata": {}, 2607 | "outputs": [], 2608 | "source": [] 2609 | } 2610 | ], 2611 | "metadata": { 2612 | "kernelspec": { 2613 | "display_name": "Python 3", 2614 | "language": "python", 2615 | "name": "python3" 2616 | }, 2617 | "language_info": { 2618 | "codemirror_mode": { 2619 | "name": "ipython", 2620 | "version": 3 2621 | }, 2622 | "file_extension": ".py", 2623 | "mimetype": "text/x-python", 2624 | "name": "python", 2625 | "nbconvert_exporter": "python", 2626 | "pygments_lexer": "ipython3", 2627 | "version": "3.7.5" 2628 | } 2629 | }, 2630 | "nbformat": 4, 2631 | "nbformat_minor": 4 2632 | } 2633 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "funpymodeling" 3 | version = "0.1.7" 4 | description = "A package designed for data scientists and teachers, to speed up their ML projects, focused on exploratory data analysis, data preparation, and model performance." 
5 | license="MIT" 6 | authors = ["Pablo Casas "] 7 | readme = "README.md" 8 | repository = "https://github.com/pablo14/funPyModeling" 9 | documentation = "https://github.com/pablo14/funPyModeling" 10 | 11 | [tool.poetry.dependencies] 12 | python = ">=3.8.1,<4.0" 13 | pandas = "^2.0.2" 14 | numpy = "^1.24.3" 15 | matplotlib = "^3.7.1" 16 | typing-extensions = "^4.6.3" 17 | scikit-learn = "^1.2.2" 18 | seaborn = "^0.12.2" 19 | flake8 = "^6.0.0" 20 | jupyter = "^1.0.0" 21 | pre-commit = "^3.3.2" 22 | pytest = "^7.3.1" 23 | 24 | 25 | [build-system] 26 | requires = ["poetry-core"] 27 | build-backend = "poetry.core.masonry.api" 28 | --------------------------------------------------------------------------------