├── .gitignore
├── .pre-commit-config.yaml
├── CODE_OF_CONDUCT.md
├── MIT-LICENSE.TXT
├── Makefile
├── README.md
├── funpymodeling
├── __init__.py
├── data_prep.py
├── exploratory.py
├── model_validation.py
└── test
│ ├── __init__.py
│ └── test_funpymodeling.py
├── notebooks
└── quick-start_eng_v1.ipynb
├── poetry.lock
└── pyproject.toml
/.gitignore:
--------------------------------------------------------------------------------
1 | # Custom
2 | my_env/
3 |
4 | # General
5 | syntax: glob
6 | .python-version
7 | .venv
8 | env/*
9 | venv/*
10 | ENV/*
11 | .idea/*
12 | .DS_Store
13 | dython.egg*/*
14 | *run_stuff.py*
15 | build/*
16 | dist/*
17 | build_deploy.sh
18 | site/*
19 | debug.py
20 | AUX/
21 | __pycache__/
22 | *.py[cod]
23 | *$py.class
24 | *.pyc
25 | *.ipynb_checkpoints/
26 | funPyModeling.egg-info/
27 | .ipynb_checkpoints/*
28 | funpymodeling/.ipynb_checkpoints/*
29 | # Distribution / packaging
30 | .Python
31 | build/
32 | develop-eggs/
33 | .pytest_cache/
34 | dist/
35 | downloads/
36 | eggs/
37 | .eggs/
38 | lib/
39 | lib64/
40 | parts/
41 | sdist/
42 | var/
43 | wheels/
44 | *.egg-info/
45 | .installed.cfg
46 | *.egg
47 |
48 | # PyInstaller
49 | # Usually these files are written by a python script from a template
50 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
51 | *.manifest
52 | *.spec
53 |
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | repos:
2 | - repo: https://gitlab.com/pycqa/flake8
3 | rev: 3.7.9
4 | hooks:
5 | - id: flake8
6 | name: flake8 except __init__.py
7 | args: [--exclude=__init__.py]
8 | - id: flake8
9 | name: flake8 only __init__.py
10 | args: [--ignore=F401] # ignore imported unused in __init__.py
11 | files: __init__.py
12 | - repo: local
13 | hooks:
14 | - id: pytest
15 | name: Check pytest unit tests pass
16 | entry: make test
17 | language: system
--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | # Contributor Covenant Code of Conduct
2 |
3 | ## Our Pledge
4 |
5 | In the interest of fostering an open and welcoming environment, we as
6 | contributors and maintainers pledge to making participation in our project and
7 | our community a harassment-free experience for everyone, regardless of age, body
8 | size, disability, ethnicity, sex characteristics, gender identity and expression,
9 | level of experience, education, socio-economic status, nationality, personal
10 | appearance, race, religion, or sexual identity and orientation.
11 |
12 | ## Our Standards
13 |
14 | Examples of behavior that contributes to creating a positive environment
15 | include:
16 |
17 | * Using welcoming and inclusive language
18 | * Being respectful of differing viewpoints and experiences
19 | * Gracefully accepting constructive criticism
20 | * Focusing on what is best for the community
21 | * Showing empathy towards other community members
22 |
23 | Examples of unacceptable behavior by participants include:
24 |
25 | * The use of sexualized language or imagery and unwelcome sexual attention or
26 | advances
27 | * Trolling, insulting/derogatory comments, and personal or political attacks
28 | * Public or private harassment
29 | * Publishing others' private information, such as a physical or electronic
30 | address, without explicit permission
31 | * Other conduct which could reasonably be considered inappropriate in a
32 | professional setting
33 |
34 | ## Our Responsibilities
35 |
36 | Project maintainers are responsible for clarifying the standards of acceptable
37 | behavior and are expected to take appropriate and fair corrective action in
38 | response to any instances of unacceptable behavior.
39 |
40 | Project maintainers have the right and responsibility to remove, edit, or
41 | reject comments, commits, code, wiki edits, issues, and other contributions
42 | that are not aligned to this Code of Conduct, or to ban temporarily or
43 | permanently any contributor for other behaviors that they deem inappropriate,
44 | threatening, offensive, or harmful.
45 |
46 | ## Scope
47 |
48 | This Code of Conduct applies both within project spaces and in public spaces
49 | when an individual is representing the project or its community. Examples of
50 | representing a project or community include using an official project e-mail
51 | address, posting via an official social media account, or acting as an appointed
52 | representative at an online or offline event. Representation of a project may be
53 | further defined and clarified by project maintainers.
54 |
55 | ## Enforcement
56 |
57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be
58 | reported by contacting the project team at pcasas.biz@gmail.com. All
59 | complaints will be reviewed and investigated and will result in a response that
60 | is deemed necessary and appropriate to the circumstances. The project team is
61 | obligated to maintain confidentiality with regard to the reporter of an incident.
62 | Further details of specific enforcement policies may be posted separately.
63 |
64 | Project maintainers who do not follow or enforce the Code of Conduct in good
65 | faith may face temporary or permanent repercussions as determined by other
66 | members of the project's leadership.
67 |
68 | ## Attribution
69 |
70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
71 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
72 |
73 | [homepage]: https://www.contributor-covenant.org
74 |
75 | For answers to common questions about this code of conduct, see
76 | https://www.contributor-covenant.org/faq
77 |
--------------------------------------------------------------------------------
/MIT-LICENSE.TXT:
--------------------------------------------------------------------------------
1 | Copyright 2020 Pablo Casas
2 |
3 | Permission is hereby granted, free of charge, to any person obtaining
4 | a copy of this software and associated documentation files (the
5 | "Software"), to deal in the Software without restriction, including
6 | without limitation the rights to use, copy, modify, merge, publish,
7 | distribute, sublicense, and/or sell copies of the Software, and to
8 | permit persons to whom the Software is furnished to do so, subject to
9 | the following conditions:
10 |
11 | The above copyright notice and this permission notice shall be
12 | included in all copies or substantial portions of the Software.
13 |
14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | .PHONY: install
2 |
3 | install:
4 | poetry install
5 |
6 | .PHONY: test
7 |
8 | test:
9 | poetry run pytest --pyargs funpymodeling
10 |
11 | .PHONY: check_style
12 |
13 | check_style:
14 | poetry run flake8 --exclude=__init__.py
15 | poetry run flake8 --ignore F401 funpymodeling/__init__.py
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # funPyModeling
2 | A package to help data scientists with Exploratory Data Analysis and Data Preparation for ML models.
3 |
--------------------------------------------------------------------------------
/funpymodeling/__init__.py:
--------------------------------------------------------------------------------
1 | from .data_prep import todf
2 | from .exploratory import status, corr_pair, num_vars, cat_vars, profiling_num, freq_tbl
3 | from .model_validation import coord_plot
4 |
5 |
# Package release number; funpymodeling/test/test_funpymodeling.py asserts this exact value.
__version__ = "0.1.7"
7 |
--------------------------------------------------------------------------------
/funpymodeling/data_prep.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 |
def todf(data):
    """
    Convert a list, numpy array or pandas Series into a pandas dataframe. Any other object (e.g. an existing pandas dataframe) is returned unchanged.

    Parameters:
    -----------
    data: 1D/2D list, 1D/2D numpy array, pandas Series or pandas dataframe.

    Returns:
    --------
    A pandas dataframe. A Series becomes a single column named after the Series, a 1D list/array becomes a single column named 'var', and a 2D list/array keeps the default integer column names. Columns built from lists/arrays go through pandas `convert_dtypes`.

    Raises:
    -------
    Exception: if the input has more than 2 dimensions.

    Example:
    --------
    >> from numpy import array

    # Different case study:
    >> list_1d = [11, 12, 5, 2]
    >> todf(list_1d)
    >> list_2d = [[11, 12, 5, 2], [15,24, 6,10], [10, 8, 12, 5], [12,15,8,6]]
    >> todf(list_2d)
    >> list_3d = [[[11, 12, 5, 2], [15,24, 6,10], [10, 8, 12, 5], [12,15,8,6]]]
    >> todf(list_3d)  # raises: more than 2 dimensions
    >> array_1d = array(list_1d)
    >> todf(array_1d)
    >> array_2d = array(list_2d)
    >> todf(array_2d)
    >> pd_df=pd.DataFrame({'v1':[11, 12, 5, 2], 'v2':[15,24, 6,10]}) # ok
    >> todf(pd_df)
    >> pd_series=pd_df.v1
    >> todf(pd_series)
    """
    # Lists are routed through numpy so they share the array code path below.
    if isinstance(data, list):
        data = np.array(data)

    if len(data.shape) > 2:
        raise Exception("I live in flattland! (can't handle objects with more than 2 dimensions)")

    if isinstance(data, pd.Series):
        data2 = pd.DataFrame({data.name: data})
    elif isinstance(data, np.ndarray):
        # `shape` is a tuple, so the number of dimensions is its length;
        # comparing the tuple itself to 1 is always False.
        if len(data.shape) == 1:
            data2 = pd.DataFrame({'var': data}).convert_dtypes()
        else:
            data2 = pd.DataFrame(data).convert_dtypes()
    else:
        # Anything else (typically a dataframe) is passed through untouched.
        data2 = data

    return data2
51 |
52 |
--------------------------------------------------------------------------------
/funpymodeling/exploratory.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | from .data_prep import todf
4 |
def status(data):
    """
    Health summary of a dataset, one row per variable. For each variable it returns the quantity and percentage of NaN values (q_nan and p_nan respectively), the same metrics for zeros (q_zeros/p_zeros), the quantity of unique values (unique) and the data type (type).
    status can be used for EDA or in a data flow to spot errors or take actions based on the result.

    Parameters:
    -----------
    data: It can be a dataframe or a single column, 1D or 2D numpy array. It uses the todf() function.

    Returns:
    --------
    A pandas dataframe with the columns: variable, q_nan, p_nan, q_zeros, p_zeros, unique, type.

    Example:
    --------
    >> import seaborn as sns
    >> iris = sns.load_dataset('iris')
    >> # dataframe as input
    >> status(iris)
    >> # single columns:
    >> status(iris['species'])
    """
    frame = todf(data)
    n_rows = len(frame)

    # Missing values per column; reset_index turns the column names into the 'variable' column.
    summary = frame.isnull().sum().reset_index()
    summary.columns = ['variable', 'q_nan']
    summary['p_nan'] = summary['q_nan'] / n_rows

    # Zeros per column, as a count and as a share of the rows.
    summary['q_zeros'] = (frame == 0).sum().values
    summary['p_zeros'] = summary['q_zeros'] / n_rows

    # Distinct values and dtype name per column.
    summary['unique'] = frame.nunique().values
    summary['type'] = [str(dtype) for dtype in frame.dtypes.values]

    return summary
52 |
53 |
def corr_pair(data, method='pearson'):
    """
    Calculate the correlations among all numeric features. Non-numeric columns are excluded before calling the `corr` pandas function.
    It's useful to quickly extract those correlated input features and the correlation between the input and the target variable.

    Parameters:
    -----------
    data: pandas data containing the variables to calculate the correlation. It uses the todf() function.
    method: `pearson` as default, same as `corr` function in pandas.

    Returns:
    --------
    A pandas dataframe in long format with the columns v1, v2, R (correlation) and R2 (R squared), one row per ordered pair of different variables.

    Example:
    --------
    >> import seaborn as sns
    >> iris = sns.load_dataset('iris')
    >> corr_pair(iris)
    """
    data2 = todf(data)

    # Select numeric/boolean columns explicitly: since pandas 2.0 `corr` defaults to
    # numeric_only=False and raises on string columns instead of dropping them.
    numeric = data2.select_dtypes(include=['number', 'bool'])

    d_cor = numeric.corr(method=method)

    # Move the row labels into a column, then go to long format (one row per pair).
    d_long = d_cor.reset_index().melt(id_vars='index')
    d_long.columns = ['v1', 'v2', 'R']

    d_long['R2'] = d_long['R'] ** 2

    # The correlation of a variable with itself is not needed.
    return d_long.query("v1 != v2")
88 |
89 |
def num_vars(data, exclude_var=None):
    """
    Returns the names of the numeric variables (columns selected with the `int64` and `float64` dtypes). Useful with pipelines or any other method in which we need to keep the numeric variables. `exclude_var` can be a list with the variable names to skip in the result, e.g. to leave out the target variable in a data transformation.
    The equivalent for categorical variables is the function `cat_vars()`.

    Parameters:
    -----------
    data: pandas dataframe
    exclude_var: list of variable names to exclude from the result

    Returns:
    --------
    A pandas Index with the numeric variable names.

    Example:
    --------
    >> import seaborn as sns
    >> iris = sns.load_dataset('iris')
    >> num_vars(iris)
    """
    numeric_cols = data.select_dtypes(include=['int64', 'float64']).columns

    if exclude_var is None:
        return numeric_cols

    return numeric_cols.drop(exclude_var)
113 |
def cat_vars(data, exclude_var=None):
    """
    Returns the names of the categoric variables. It includes all `object`, `category` and `string` variables. Useful with pipelines or any other method in which we need to keep the categorical variables. `exclude_var` can be a list with the variable names to skip in the result, e.g. to leave out the target variable in a data transformation.
    The equivalent for numeric variables is the function `num_vars()`.

    Parameters:
    -----------
    data: pandas dataframe
    exclude_var: list of variable names to exclude from the result

    Returns:
    --------
    A pandas Index with the categoric variable names.

    Example:
    --------
    >> import seaborn as sns
    >> iris = sns.load_dataset('iris')
    >> cat_vars(iris)
    """
    categoric_cols = data.select_dtypes(include=['object','category', 'string']).columns

    if exclude_var is None:
        return categoric_cols

    return categoric_cols.drop(exclude_var)
138 |
139 |
def profiling_num(data):
    """
    Get a metric table for all numerical variables, automatically skipping the non-numerical ones (based on num_vars). Current metrics are: mean, std_dev: standard deviation, variation_coef: the ratio of std_dev/mean, and the percentiles p_0.01, p_0.05, p_0.25, p_0.5, p_0.75, p_0.95 and p_0.99.

    Parameters:
    -----------
    data: pandas series/dataframe, numpy 1D/2D array. It uses the todf() function.

    Returns:
    --------
    A dataframe in which each row is an input variable, and each column a statistic. The first column, `variable`, holds the variable name.

    Example:
    --------
    >> import seaborn as sns
    >> iris = sns.load_dataset('iris')
    >> profiling_num(iris)
    """
    # Normalise the input, then keep only the numeric columns.
    frame = todf(data)
    numeric = frame[num_vars(frame)]

    # Central tendency and dispersion, indexed by variable name.
    stats = pd.DataFrame({'mean': numeric.mean(), 'std_dev': numeric.std()})
    stats['variation_coef'] = stats['std_dev'] / stats['mean']

    # quantile() returns one row per percentile; transpose to one row per variable.
    percentiles = numeric.quantile([0.01, 0.05, 0.25, 0.5, 0.75, 0.95, 0.99]).transpose().add_prefix('p_')

    table = stats.join(percentiles, how='outer')

    # Move the variable names from the index into a regular column.
    table['variable'] = table.index
    table = table.reset_index(drop=True)

    ordered_cols = ['variable', 'mean', 'std_dev','variation_coef', 'p_0.01', 'p_0.05', 'p_0.25', 'p_0.5', 'p_0.75', 'p_0.95', 'p_0.99']

    return table[ordered_cols]
183 |
184 |
185 |
186 | def _freq_tbl_logic(var, name):
187 | """
188 | For internal use. Related to `freq_tbl`.
189 |
190 | Parameters:
191 | -----------
192 | var: pandas series
193 | name: column name (string)
194 |
195 | Returns:
196 | --------
197 | Dataframe with the metrics
198 |
199 | Example:
200 | --------
201 |
202 | """
203 | cnt=var.value_counts()
204 | df_res=pd.DataFrame({'frequency': var.value_counts(), 'percentage': var.value_counts()/len(var)})
205 | df_res.reset_index(drop=True)
206 |
207 | df_res[name] = df_res.index
208 |
209 | df_res=df_res.reset_index(drop=True)
210 |
211 | df_res['cumulative_perc'] = df_res.percentage.cumsum()/df_res.percentage.sum()
212 |
213 | df_res=df_res[[name, 'frequency', 'percentage', 'cumulative_perc']]
214 |
215 | return df_res
216 |
217 |
218 |
def freq_tbl(data):
    """
    Frequency table for categorical variables. It retrieves the frequency, percentage and cumulative percentage for each categorical variable (excluding the numerical ones).

    Parameters:
    -----------
    data: pandas series/dataframe, numpy 1D/2D array. It uses the todf() function.

    Returns:
    --------
    If there is a single categorical variable, it returns the table with the results (useful in a process, to take actions based on the result).
    If there is more than one categorical variable (based on cat_vars), it prints the result for each of them in the console and returns nothing.
    If there are no categorical variables, it returns the string 'No categorical variables to analyze.'.

    Example:
    --------
    > import seaborn as sns
    > tips=sns.load_dataset('tips')
    > freq_tbl(tips)
    """
    frame = todf(data)

    cat_cols = cat_vars(frame)
    n_cat = len(cat_cols)

    if n_cat == 0:
        return 'No categorical variables to analyze.'

    if n_cat == 1:
        # A single categorical variable: hand the table back to the caller.
        only_col = cat_cols[0]
        return _freq_tbl_logic(frame[only_col], name=only_col)

    # Several categorical variables: print one table per variable.
    for col in cat_cols:
        print(_freq_tbl_logic(frame[col], name=col))
        print('\n----------------------------------------------------------------\n')
252 |
253 |
--------------------------------------------------------------------------------
/funpymodeling/model_validation.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | from pandas.plotting import parallel_coordinates
4 | from sklearn.preprocessing import MinMaxScaler
5 | import seaborn as sns
6 | import matplotlib.pyplot as plt
7 |
8 |
def coord_plot(data, group_var):
    """
    Coordinate plot analysis for clustering models. Also returns the original and the normalized (min-max) variable table. Useful to extract the main features for each cluster according to the variable means.

    Parameters:
    -----------
    data : Pandas DataFrame containing the variables to analyze the mean across each cluster
    group_var : String indicating the clustering variable name

    Returns:
    --------
    A list containing two data frames. The first contains the mean of each variable for each value of the group_var. The second is the same table after min-max scaling. In both, group_var is the last column.
    It also draws the coordinate (parallel) plot.

    Example:
    --------
    >> import seaborn as sns
    >> iris = sns.load_dataset('iris')
    # If species is the cluster variable:
    >> coord_plot(iris, 'species')
    """
    # 1- mean of every variable per group; the group label moves from the index to the last column
    group_means = data.groupby(group_var).mean()
    group_means[group_var] = group_means.index
    group_means = group_means.reset_index(drop=True)

    # 2- min-max scaling of the variables only (the group label is left out)
    features = group_means.drop(group_var, axis=1)
    scaler = MinMaxScaler()
    scaler.fit(features)
    scaled_values = scaler.transform(features)

    # 3- back to a dataframe, with the group label attached again (scaled variables)
    scaled_means = pd.DataFrame(scaled_values, columns=features.columns)
    scaled_means[group_var] = group_means[group_var]

    # 4- parallel coordinates plot, one line per group
    parallel_coordinates(scaled_means, group_var, colormap=plt.get_cmap("Dark2"))
    plt.xticks(rotation=90)

    return [group_means, scaled_means]
--------------------------------------------------------------------------------
/funpymodeling/test/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pablo14/funpymodeling/b399804d5981e8873302980627fb6cc4139d1a10/funpymodeling/test/__init__.py
--------------------------------------------------------------------------------
/funpymodeling/test/test_funpymodeling.py:
--------------------------------------------------------------------------------
1 | from funpymodeling import __version__
2 |
3 |
def test_version():
    """The package must report the expected release number."""
    expected_version = '0.1.7'
    assert __version__ == expected_version
6 |
--------------------------------------------------------------------------------
/notebooks/quick-start_eng_v1.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# funpymodeling > Basic usage\n",
8 | "\n",
9 | "Created by Pablo Casas [@pabloc_ds](https://twitter.com/pabloc_ds)\n",
10 | "\n",
11 | " "
12 | ]
13 | },
14 | {
15 | "cell_type": "code",
16 | "execution_count": 48,
17 | "metadata": {},
18 | "outputs": [],
19 | "source": [
20 | "import pandas as pd\n",
21 | "import matplotlib.pyplot as plt\n",
22 | "import numpy as np\n",
23 | "import seaborn as sns"
24 | ]
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": 49,
29 | "metadata": {},
30 | "outputs": [],
31 | "source": [
32 | "# Load some dataframes for this practice:\n",
33 | "iris = sns.load_dataset('iris')\n",
34 | "tips = sns.load_dataset('tips')"
35 | ]
36 | },
37 | {
38 | "cell_type": "markdown",
39 | "metadata": {},
40 | "source": [
41 | "## 1) Exploratory Data Analysis"
42 | ]
43 | },
44 | {
45 | "cell_type": "markdown",
46 | "metadata": {},
47 | "source": [
48 | "### 1.1) Dataset health `status`"
49 | ]
50 | },
51 | {
52 | "cell_type": "code",
53 | "execution_count": 4,
54 | "metadata": {},
55 | "outputs": [],
56 | "source": [
57 | "from funpymodeling.exploratory import status"
58 | ]
59 | },
60 | {
61 | "cell_type": "markdown",
62 | "metadata": {},
63 | "source": [
64 | "Support data frame as input:"
65 | ]
66 | },
67 | {
68 | "cell_type": "code",
69 | "execution_count": 5,
70 | "metadata": {},
71 | "outputs": [
72 | {
73 | "data": {
74 | "text/html": [
75 | "
\n",
76 | "\n",
89 | "
\n",
90 | " \n",
91 | " \n",
92 | " \n",
93 | " variable \n",
94 | " q_nan \n",
95 | " p_nan \n",
96 | " q_zeros \n",
97 | " p_zeros \n",
98 | " unique \n",
99 | " type \n",
100 | " \n",
101 | " \n",
102 | " \n",
103 | " \n",
104 | " 0 \n",
105 | " sepal_length \n",
106 | " 0 \n",
107 | " 0.0 \n",
108 | " 0 \n",
109 | " 0.0 \n",
110 | " 35 \n",
111 | " float64 \n",
112 | " \n",
113 | " \n",
114 | " 1 \n",
115 | " sepal_width \n",
116 | " 0 \n",
117 | " 0.0 \n",
118 | " 0 \n",
119 | " 0.0 \n",
120 | " 23 \n",
121 | " float64 \n",
122 | " \n",
123 | " \n",
124 | " 2 \n",
125 | " petal_length \n",
126 | " 0 \n",
127 | " 0.0 \n",
128 | " 0 \n",
129 | " 0.0 \n",
130 | " 43 \n",
131 | " float64 \n",
132 | " \n",
133 | " \n",
134 | " 3 \n",
135 | " petal_width \n",
136 | " 0 \n",
137 | " 0.0 \n",
138 | " 0 \n",
139 | " 0.0 \n",
140 | " 22 \n",
141 | " float64 \n",
142 | " \n",
143 | " \n",
144 | " 4 \n",
145 | " species \n",
146 | " 0 \n",
147 | " 0.0 \n",
148 | " 0 \n",
149 | " 0.0 \n",
150 | " 3 \n",
151 | " object \n",
152 | " \n",
153 | " \n",
154 | "
\n",
155 | "
"
156 | ],
157 | "text/plain": [
158 | " variable q_nan p_nan q_zeros p_zeros unique type\n",
159 | "0 sepal_length 0 0.0 0 0.0 35 float64\n",
160 | "1 sepal_width 0 0.0 0 0.0 23 float64\n",
161 | "2 petal_length 0 0.0 0 0.0 43 float64\n",
162 | "3 petal_width 0 0.0 0 0.0 22 float64\n",
163 | "4 species 0 0.0 0 0.0 3 object"
164 | ]
165 | },
166 | "execution_count": 5,
167 | "metadata": {},
168 | "output_type": "execute_result"
169 | }
170 | ],
171 | "source": [
172 | "status(iris) "
173 | ]
174 | },
175 | {
176 | "cell_type": "markdown",
177 | "metadata": {},
178 | "source": [
179 | "Supports Pandas series:"
180 | ]
181 | },
182 | {
183 | "cell_type": "code",
184 | "execution_count": 6,
185 | "metadata": {},
186 | "outputs": [
187 | {
188 | "data": {
189 | "text/html": [
190 | "\n",
191 | "\n",
204 | "
\n",
205 | " \n",
206 | " \n",
207 | " \n",
208 | " variable \n",
209 | " q_nan \n",
210 | " p_nan \n",
211 | " q_zeros \n",
212 | " p_zeros \n",
213 | " unique \n",
214 | " type \n",
215 | " \n",
216 | " \n",
217 | " \n",
218 | " \n",
219 | " 0 \n",
220 | " sepal_width \n",
221 | " 0 \n",
222 | " 0.0 \n",
223 | " 0 \n",
224 | " 0.0 \n",
225 | " 23 \n",
226 | " float64 \n",
227 | " \n",
228 | " \n",
229 | "
\n",
230 | "
"
231 | ],
232 | "text/plain": [
233 | " variable q_nan p_nan q_zeros p_zeros unique type\n",
234 | "0 sepal_width 0 0.0 0 0.0 23 float64"
235 | ]
236 | },
237 | "execution_count": 6,
238 | "metadata": {},
239 | "output_type": "execute_result"
240 | }
241 | ],
242 | "source": [
243 | "status(iris['sepal_width'])"
244 | ]
245 | },
246 | {
247 | "cell_type": "markdown",
248 | "metadata": {},
249 | "source": [
250 | "Supports 2D numpy array:"
251 | ]
252 | },
253 | {
254 | "cell_type": "code",
255 | "execution_count": 7,
256 | "metadata": {},
257 | "outputs": [],
258 | "source": [
259 | "tips_np=tips.to_numpy()"
260 | ]
261 | },
262 | {
263 | "cell_type": "code",
264 | "execution_count": 8,
265 | "metadata": {},
266 | "outputs": [
267 | {
268 | "data": {
269 | "text/html": [
270 | "\n",
271 | "\n",
284 | "
\n",
285 | " \n",
286 | " \n",
287 | " \n",
288 | " variable \n",
289 | " q_nan \n",
290 | " p_nan \n",
291 | " q_zeros \n",
292 | " p_zeros \n",
293 | " unique \n",
294 | " type \n",
295 | " \n",
296 | " \n",
297 | " \n",
298 | " \n",
299 | " 0 \n",
300 | " 0 \n",
301 | " 0 \n",
302 | " 0.0 \n",
303 | " 0 \n",
304 | " 0.0 \n",
305 | " 229 \n",
306 | " float64 \n",
307 | " \n",
308 | " \n",
309 | " 1 \n",
310 | " 1 \n",
311 | " 0 \n",
312 | " 0.0 \n",
313 | " 0 \n",
314 | " 0.0 \n",
315 | " 123 \n",
316 | " float64 \n",
317 | " \n",
318 | " \n",
319 | " 2 \n",
320 | " 2 \n",
321 | " 0 \n",
322 | " 0.0 \n",
323 | " 0 \n",
324 | " 0.0 \n",
325 | " 2 \n",
326 | " string \n",
327 | " \n",
328 | " \n",
329 | " 3 \n",
330 | " 3 \n",
331 | " 0 \n",
332 | " 0.0 \n",
333 | " 0 \n",
334 | " 0.0 \n",
335 | " 2 \n",
336 | " string \n",
337 | " \n",
338 | " \n",
339 | " 4 \n",
340 | " 4 \n",
341 | " 0 \n",
342 | " 0.0 \n",
343 | " 0 \n",
344 | " 0.0 \n",
345 | " 4 \n",
346 | " string \n",
347 | " \n",
348 | " \n",
349 | " 5 \n",
350 | " 5 \n",
351 | " 0 \n",
352 | " 0.0 \n",
353 | " 0 \n",
354 | " 0.0 \n",
355 | " 2 \n",
356 | " string \n",
357 | " \n",
358 | " \n",
359 | " 6 \n",
360 | " 6 \n",
361 | " 0 \n",
362 | " 0.0 \n",
363 | " 0 \n",
364 | " 0.0 \n",
365 | " 6 \n",
366 | " Int64 \n",
367 | " \n",
368 | " \n",
369 | "
\n",
370 | "
"
371 | ],
372 | "text/plain": [
373 | " variable q_nan p_nan q_zeros p_zeros unique type\n",
374 | "0 0 0 0.0 0 0.0 229 float64\n",
375 | "1 1 0 0.0 0 0.0 123 float64\n",
376 | "2 2 0 0.0 0 0.0 2 string\n",
377 | "3 3 0 0.0 0 0.0 2 string\n",
378 | "4 4 0 0.0 0 0.0 4 string\n",
379 | "5 5 0 0.0 0 0.0 2 string\n",
380 | "6 6 0 0.0 0 0.0 6 Int64"
381 | ]
382 | },
383 | "execution_count": 8,
384 | "metadata": {},
385 | "output_type": "execute_result"
386 | }
387 | ],
388 | "source": [
389 | "status(tips_np)"
390 | ]
391 | },
392 | {
393 | "cell_type": "markdown",
394 | "metadata": {},
395 | "source": [
396 | "Note: data types from numpy to pandas dataframe are inferred by pandas `convert_dtypes`"
397 | ]
398 | },
399 | {
400 | "cell_type": "markdown",
401 | "metadata": {},
402 | "source": [
403 | "Supports 1D numpy array:"
404 | ]
405 | },
406 | {
407 | "cell_type": "code",
408 | "execution_count": 9,
409 | "metadata": {},
410 | "outputs": [
411 | {
412 | "data": {
413 | "text/html": [
414 | "\n",
415 | "\n",
428 | "
\n",
429 | " \n",
430 | " \n",
431 | " \n",
432 | " variable \n",
433 | " q_nan \n",
434 | " p_nan \n",
435 | " q_zeros \n",
436 | " p_zeros \n",
437 | " unique \n",
438 | " type \n",
439 | " \n",
440 | " \n",
441 | " \n",
442 | " \n",
443 | " 0 \n",
444 | " 0 \n",
445 | " 0 \n",
446 | " 0.0 \n",
447 | " 0 \n",
448 | " 0.0 \n",
449 | " 4 \n",
450 | " string \n",
451 | " \n",
452 | " \n",
453 | "
\n",
454 | "
"
455 | ],
456 | "text/plain": [
457 | " variable q_nan p_nan q_zeros p_zeros unique type\n",
458 | "0 0 0 0.0 0 0.0 4 string"
459 | ]
460 | },
461 | "execution_count": 9,
462 | "metadata": {},
463 | "output_type": "execute_result"
464 | }
465 | ],
466 | "source": [
467 | "status(tips_np[:,4])"
468 | ]
469 | },
470 | {
471 | "cell_type": "code",
472 | "execution_count": null,
473 | "metadata": {},
474 | "outputs": [],
475 | "source": [
476 | "\n",
477 | "\n",
478 | "\n",
479 | "\n",
480 | "\n",
481 | "\n",
482 | "\n",
483 | "\n"
484 | ]
485 | },
486 | {
487 | "cell_type": "markdown",
488 | "metadata": {},
489 | "source": [
490 | "### 1.2) Univariate analysis in numeric variables"
491 | ]
492 | },
493 | {
494 | "cell_type": "code",
495 | "execution_count": 11,
496 | "metadata": {},
497 | "outputs": [],
498 | "source": [
499 | "from funpymodeling.exploratory import profiling_num"
500 | ]
501 | },
502 | {
503 | "cell_type": "markdown",
504 | "metadata": {},
505 | "source": [
506 | "`profiling_num` retrieves several statistics for all numeric variables excluding the categorical ones."
507 | ]
508 | },
509 | {
510 | "cell_type": "markdown",
511 | "metadata": {},
512 | "source": [
513 | "Supports dataframe:"
514 | ]
515 | },
516 | {
517 | "cell_type": "code",
518 | "execution_count": 12,
519 | "metadata": {},
520 | "outputs": [
521 | {
522 | "data": {
523 | "text/html": [
524 | "\n",
525 | "\n",
538 | "
\n",
539 | " \n",
540 | " \n",
541 | " \n",
542 | " variable \n",
543 | " mean \n",
544 | " std_dev \n",
545 | " variation_coef \n",
546 | " p_0.01 \n",
547 | " p_0.05 \n",
548 | " p_0.25 \n",
549 | " p_0.5 \n",
550 | " p_0.75 \n",
551 | " p_0.95 \n",
552 | " p_0.99 \n",
553 | " \n",
554 | " \n",
555 | " \n",
556 | " \n",
557 | " 0 \n",
558 | " total_bill \n",
559 | " 19.785943 \n",
560 | " 8.902412 \n",
561 | " 0.449936 \n",
562 | " 7.25 \n",
563 | " 9.5575 \n",
564 | " 13.3475 \n",
565 | " 17.795 \n",
566 | " 24.1275 \n",
567 | " 38.0610 \n",
568 | " 48.2270 \n",
569 | " \n",
570 | " \n",
571 | " 1 \n",
572 | " tip \n",
573 | " 2.998279 \n",
574 | " 1.383638 \n",
575 | " 0.461478 \n",
576 | " 1.00 \n",
577 | " 1.4400 \n",
578 | " 2.0000 \n",
579 | " 2.900 \n",
580 | " 3.5625 \n",
581 | " 5.1955 \n",
582 | " 7.2145 \n",
583 | " \n",
584 | " \n",
585 | " 2 \n",
586 | " size \n",
587 | " 2.569672 \n",
588 | " 0.951100 \n",
589 | " 0.370125 \n",
590 | " 1.00 \n",
591 | " 2.0000 \n",
592 | " 2.0000 \n",
593 | " 2.000 \n",
594 | " 3.0000 \n",
595 | " 4.0000 \n",
596 | " 6.0000 \n",
597 | " \n",
598 | " \n",
599 | "
\n",
600 | "
"
601 | ],
602 | "text/plain": [
603 | " variable mean std_dev variation_coef p_0.01 p_0.05 p_0.25 \\\n",
604 | "0 total_bill 19.785943 8.902412 0.449936 7.25 9.5575 13.3475 \n",
605 | "1 tip 2.998279 1.383638 0.461478 1.00 1.4400 2.0000 \n",
606 | "2 size 2.569672 0.951100 0.370125 1.00 2.0000 2.0000 \n",
607 | "\n",
608 | " p_0.5 p_0.75 p_0.95 p_0.99 \n",
609 | "0 17.795 24.1275 38.0610 48.2270 \n",
610 | "1 2.900 3.5625 5.1955 7.2145 \n",
611 | "2 2.000 3.0000 4.0000 6.0000 "
612 | ]
613 | },
614 | "execution_count": 12,
615 | "metadata": {},
616 | "output_type": "execute_result"
617 | }
618 | ],
619 | "source": [
620 | "profiling_num(tips)"
621 | ]
622 | },
623 | {
624 | "cell_type": "markdown",
625 | "metadata": {},
626 | "source": [
627 | "Also numpy as before:"
628 | ]
629 | },
630 | {
631 | "cell_type": "code",
632 | "execution_count": 13,
633 | "metadata": {},
634 | "outputs": [
635 | {
636 | "data": {
637 | "text/html": [
638 | "\n",
639 | "\n",
652 | "
\n",
653 | " \n",
654 | " \n",
655 | " \n",
656 | " variable \n",
657 | " mean \n",
658 | " std_dev \n",
659 | " variation_coef \n",
660 | " p_0.01 \n",
661 | " p_0.05 \n",
662 | " p_0.25 \n",
663 | " p_0.5 \n",
664 | " p_0.75 \n",
665 | " p_0.95 \n",
666 | " p_0.99 \n",
667 | " \n",
668 | " \n",
669 | " \n",
670 | " \n",
671 | " 0 \n",
672 | " 0 \n",
673 | " 19.785943 \n",
674 | " 8.902412 \n",
675 | " 0.449936 \n",
676 | " 7.25 \n",
677 | " 9.5575 \n",
678 | " 13.3475 \n",
679 | " 17.795 \n",
680 | " 24.1275 \n",
681 | " 38.061 \n",
682 | " 48.227 \n",
683 | " \n",
684 | " \n",
685 | " 1 \n",
686 | " 1 \n",
687 | " 2.998279 \n",
688 | " 1.383638 \n",
689 | " 0.461478 \n",
690 | " 1 \n",
691 | " 1.44 \n",
692 | " 2 \n",
693 | " 2.9 \n",
694 | " 3.5625 \n",
695 | " 5.1955 \n",
696 | " 7.2145 \n",
697 | " \n",
698 | " \n",
699 | " 2 \n",
700 | " 6 \n",
701 | " 2.569672 \n",
702 | " 0.951100 \n",
703 | " 0.370125 \n",
704 | " 1 \n",
705 | " 2 \n",
706 | " 2 \n",
707 | " 2 \n",
708 | " 3 \n",
709 | " 4 \n",
710 | " 6 \n",
711 | " \n",
712 | " \n",
713 | "
\n",
714 | "
"
715 | ],
716 | "text/plain": [
717 | " variable mean std_dev variation_coef p_0.01 p_0.05 p_0.25 \\\n",
718 | "0 0 19.785943 8.902412 0.449936 7.25 9.5575 13.3475 \n",
719 | "1 1 2.998279 1.383638 0.461478 1 1.44 2 \n",
720 | "2 6 2.569672 0.951100 0.370125 1 2 2 \n",
721 | "\n",
722 | " p_0.5 p_0.75 p_0.95 p_0.99 \n",
723 | "0 17.795 24.1275 38.061 48.227 \n",
724 | "1 2.9 3.5625 5.1955 7.2145 \n",
725 | "2 2 3 4 6 "
726 | ]
727 | },
728 | "execution_count": 13,
729 | "metadata": {},
730 | "output_type": "execute_result"
731 | }
732 | ],
733 | "source": [
734 | "profiling_num(tips_np)"
735 | ]
736 | },
737 | {
738 | "cell_type": "markdown",
739 | "metadata": {},
740 | "source": [
741 | "Pandas series & 1D array:"
742 | ]
743 | },
744 | {
745 | "cell_type": "code",
746 | "execution_count": 14,
747 | "metadata": {},
748 | "outputs": [
749 | {
750 | "data": {
751 | "text/html": [
752 | "\n",
753 | "\n",
766 | "
\n",
767 | " \n",
768 | " \n",
769 | " \n",
770 | " variable \n",
771 | " mean \n",
772 | " std_dev \n",
773 | " variation_coef \n",
774 | " p_0.01 \n",
775 | " p_0.05 \n",
776 | " p_0.25 \n",
777 | " p_0.5 \n",
778 | " p_0.75 \n",
779 | " p_0.95 \n",
780 | " p_0.99 \n",
781 | " \n",
782 | " \n",
783 | " \n",
784 | " \n",
785 | " 0 \n",
786 | " total_bill \n",
787 | " 19.785943 \n",
788 | " 8.902412 \n",
789 | " 0.449936 \n",
790 | " 7.25 \n",
791 | " 9.5575 \n",
792 | " 13.3475 \n",
793 | " 17.795 \n",
794 | " 24.1275 \n",
795 | " 38.061 \n",
796 | " 48.227 \n",
797 | " \n",
798 | " \n",
799 | "
\n",
800 | "
"
801 | ],
802 | "text/plain": [
803 | " variable mean std_dev variation_coef p_0.01 p_0.05 p_0.25 \\\n",
804 | "0 total_bill 19.785943 8.902412 0.449936 7.25 9.5575 13.3475 \n",
805 | "\n",
806 | " p_0.5 p_0.75 p_0.95 p_0.99 \n",
807 | "0 17.795 24.1275 38.061 48.227 "
808 | ]
809 | },
810 | "execution_count": 14,
811 | "metadata": {},
812 | "output_type": "execute_result"
813 | }
814 | ],
815 | "source": [
816 | "profiling_num(tips['total_bill'])"
817 | ]
818 | },
819 | {
820 | "cell_type": "code",
821 | "execution_count": 15,
822 | "metadata": {},
823 | "outputs": [
824 | {
825 | "data": {
826 | "text/html": [
827 | "\n",
828 | "\n",
841 | "
\n",
842 | " \n",
843 | " \n",
844 | " \n",
845 | " variable \n",
846 | " mean \n",
847 | " std_dev \n",
848 | " variation_coef \n",
849 | " p_0.01 \n",
850 | " p_0.05 \n",
851 | " p_0.25 \n",
852 | " p_0.5 \n",
853 | " p_0.75 \n",
854 | " p_0.95 \n",
855 | " p_0.99 \n",
856 | " \n",
857 | " \n",
858 | " \n",
859 | " \n",
860 | " 0 \n",
861 | " 0 \n",
862 | " 19.785943 \n",
863 | " 8.902412 \n",
864 | " 0.449936 \n",
865 | " 7.25 \n",
866 | " 9.5575 \n",
867 | " 13.3475 \n",
868 | " 17.795 \n",
869 | " 24.1275 \n",
870 | " 38.061 \n",
871 | " 48.227 \n",
872 | " \n",
873 | " \n",
874 | "
\n",
875 | "
"
876 | ],
877 | "text/plain": [
878 | " variable mean std_dev variation_coef p_0.01 p_0.05 p_0.25 \\\n",
879 | "0 0 19.785943 8.902412 0.449936 7.25 9.5575 13.3475 \n",
880 | "\n",
881 | " p_0.5 p_0.75 p_0.95 p_0.99 \n",
882 | "0 17.795 24.1275 38.061 48.227 "
883 | ]
884 | },
885 | "execution_count": 15,
886 | "metadata": {},
887 | "output_type": "execute_result"
888 | }
889 | ],
890 | "source": [
891 | "profiling_num(tips_np[:,0])"
892 | ]
893 | },
894 | {
895 | "cell_type": "code",
896 | "execution_count": null,
897 | "metadata": {},
898 | "outputs": [],
899 | "source": [
900 | "\n",
901 | "\n",
902 | "\n",
903 | "\n",
904 | "\n",
905 | "\n",
906 | "\n",
907 | "\n",
908 | "\n"
909 | ]
910 | },
911 | {
912 | "cell_type": "markdown",
913 | "metadata": {},
914 | "source": [
915 | "### 1.3) Univariate analysis in categorical variables"
916 | ]
917 | },
918 | {
919 | "cell_type": "code",
920 | "execution_count": 16,
921 | "metadata": {},
922 | "outputs": [],
923 | "source": [
924 | "from funpymodeling.exploratory import freq_tbl"
925 | ]
926 | },
927 | {
928 | "cell_type": "markdown",
929 | "metadata": {},
930 | "source": [
931 | "It retrieves several statistics related to categorical variables, such as frequency, percentage and cumulative percentage.\n",
932 | "\n",
933 | "It will run for all categorical variables excluding all the other ones."
934 | ]
935 | },
936 | {
937 | "cell_type": "markdown",
938 | "metadata": {},
939 | "source": [
940 | "Just like the others, it supports: pandas dataframe, pandas series and 1D/2D numpy arrays"
941 | ]
942 | },
943 | {
944 | "cell_type": "code",
945 | "execution_count": 17,
946 | "metadata": {},
947 | "outputs": [
948 | {
949 | "name": "stdout",
950 | "output_type": "stream",
951 | "text": [
952 | " sex frequency percentage cumulative_perc\n",
953 | "0 Male 157 0.643443 0.643443\n",
954 | "1 Female 87 0.356557 1.000000\n",
955 | "\n",
956 | "----------------------------------------------------------------\n",
957 | "\n",
958 | " smoker frequency percentage cumulative_perc\n",
959 | "0 No 151 0.618852 0.618852\n",
960 | "1 Yes 93 0.381148 1.000000\n",
961 | "\n",
962 | "----------------------------------------------------------------\n",
963 | "\n",
964 | " day frequency percentage cumulative_perc\n",
965 | "0 Sat 87 0.356557 0.356557\n",
966 | "1 Sun 76 0.311475 0.668033\n",
967 | "2 Thur 62 0.254098 0.922131\n",
968 | "3 Fri 19 0.077869 1.000000\n",
969 | "\n",
970 | "----------------------------------------------------------------\n",
971 | "\n",
972 | " time frequency percentage cumulative_perc\n",
973 | "0 Dinner 176 0.721311 0.721311\n",
974 | "1 Lunch 68 0.278689 1.000000\n",
975 | "\n",
976 | "----------------------------------------------------------------\n",
977 | "\n"
978 | ]
979 | }
980 | ],
981 | "source": [
982 | "freq_tbl(tips)"
983 | ]
984 | },
985 | {
986 | "cell_type": "markdown",
987 | "metadata": {},
988 | "source": [
989 | "If 1 variable is provided, it returns the table associated with that variable so we can use it in our data pipeline:"
990 | ]
991 | },
992 | {
993 | "cell_type": "code",
994 | "execution_count": 18,
995 | "metadata": {},
996 | "outputs": [
997 | {
998 | "data": {
999 | "text/html": [
1000 | "\n",
1001 | "\n",
1014 | "
\n",
1015 | " \n",
1016 | " \n",
1017 | " \n",
1018 | " day \n",
1019 | " frequency \n",
1020 | " percentage \n",
1021 | " cumulative_perc \n",
1022 | " \n",
1023 | " \n",
1024 | " \n",
1025 | " \n",
1026 | " 0 \n",
1027 | " Sat \n",
1028 | " 87 \n",
1029 | " 0.356557 \n",
1030 | " 0.356557 \n",
1031 | " \n",
1032 | " \n",
1033 | " 1 \n",
1034 | " Sun \n",
1035 | " 76 \n",
1036 | " 0.311475 \n",
1037 | " 0.668033 \n",
1038 | " \n",
1039 | " \n",
1040 | " 2 \n",
1041 | " Thur \n",
1042 | " 62 \n",
1043 | " 0.254098 \n",
1044 | " 0.922131 \n",
1045 | " \n",
1046 | " \n",
1047 | " 3 \n",
1048 | " Fri \n",
1049 | " 19 \n",
1050 | " 0.077869 \n",
1051 | " 1.000000 \n",
1052 | " \n",
1053 | " \n",
1054 | "
\n",
1055 | "
"
1056 | ],
1057 | "text/plain": [
1058 | " day frequency percentage cumulative_perc\n",
1059 | "0 Sat 87 0.356557 0.356557\n",
1060 | "1 Sun 76 0.311475 0.668033\n",
1061 | "2 Thur 62 0.254098 0.922131\n",
1062 | "3 Fri 19 0.077869 1.000000"
1063 | ]
1064 | },
1065 | "execution_count": 18,
1066 | "metadata": {},
1067 | "output_type": "execute_result"
1068 | }
1069 | ],
1070 | "source": [
1071 | "day_freq=freq_tbl(tips['day'])\n",
1072 | "\n",
1073 | "day_freq"
1074 | ]
1075 | },
1076 | {
1077 | "cell_type": "markdown",
1078 | "metadata": {},
1079 | "source": [
1080 | "Days with high representativity (more than 30%):"
1081 | ]
1082 | },
1083 | {
1084 | "cell_type": "code",
1085 | "execution_count": 19,
1086 | "metadata": {},
1087 | "outputs": [
1088 | {
1089 | "data": {
1090 | "text/html": [
1091 | "\n",
1092 | "\n",
1105 | "
\n",
1106 | " \n",
1107 | " \n",
1108 | " \n",
1109 | " day \n",
1110 | " frequency \n",
1111 | " percentage \n",
1112 | " cumulative_perc \n",
1113 | " \n",
1114 | " \n",
1115 | " \n",
1116 | " \n",
1117 | " 0 \n",
1118 | " Sat \n",
1119 | " 87 \n",
1120 | " 0.356557 \n",
1121 | " 0.356557 \n",
1122 | " \n",
1123 | " \n",
1124 | " 1 \n",
1125 | " Sun \n",
1126 | " 76 \n",
1127 | " 0.311475 \n",
1128 | " 0.668033 \n",
1129 | " \n",
1130 | " \n",
1131 | "
\n",
1132 | "
"
1133 | ],
1134 | "text/plain": [
1135 | " day frequency percentage cumulative_perc\n",
1136 | "0 Sat 87 0.356557 0.356557\n",
1137 | "1 Sun 76 0.311475 0.668033"
1138 | ]
1139 | },
1140 | "execution_count": 19,
1141 | "metadata": {},
1142 | "output_type": "execute_result"
1143 | }
1144 | ],
1145 | "source": [
1146 | "day_freq[day_freq['percentage']>0.3]"
1147 | ]
1148 | },
1149 | {
1150 | "cell_type": "code",
1151 | "execution_count": 20,
1152 | "metadata": {},
1153 | "outputs": [
1154 | {
1155 | "data": {
1156 | "text/plain": [
1157 | "0 Sat\n",
1158 | "1 Sun\n",
1159 | "Name: day, dtype: category\n",
1160 | "Categories (4, object): ['Thur', 'Fri', 'Sat', 'Sun']"
1161 | ]
1162 | },
1163 | "execution_count": 20,
1164 | "metadata": {},
1165 | "output_type": "execute_result"
1166 | }
1167 | ],
1168 | "source": [
1169 | "day_freq[day_freq['percentage']>0.3]['day']"
1170 | ]
1171 | },
1172 | {
1173 | "cell_type": "code",
1174 | "execution_count": null,
1175 | "metadata": {},
1176 | "outputs": [],
1177 | "source": [
1178 | "\n",
1179 | "\n",
1180 | "\n",
1181 | "\n",
1182 | "\n",
1183 | "\n",
1184 | "\n",
1185 | "\n",
1186 | "\n"
1187 | ]
1188 | },
1189 | {
1190 | "cell_type": "markdown",
1191 | "metadata": {},
1192 | "source": [
1193 | "### 1.4) Pairwise correlation analysis"
1194 | ]
1195 | },
1196 | {
1197 | "cell_type": "code",
1198 | "execution_count": 21,
1199 | "metadata": {},
1200 | "outputs": [],
1201 | "source": [
1202 | "from funpymodeling.exploratory import corr_pair"
1203 | ]
1204 | },
1205 | {
1206 | "cell_type": "markdown",
1207 | "metadata": {},
1208 | "source": [
1209 | "A wrapper around pandas `corr` that allows us to quickly filter the most important variables, or the least important ones. \n",
1210 | "\n",
1211 | "Useful in EDA and when doing the features pre-selection before creating the predictive model."
1212 | ]
1213 | },
1214 | {
1215 | "cell_type": "code",
1216 | "execution_count": 22,
1217 | "metadata": {},
1218 | "outputs": [
1219 | {
1220 | "data": {
1221 | "text/html": [
1222 | "\n",
1223 | "\n",
1236 | "
\n",
1237 | " \n",
1238 | " \n",
1239 | " \n",
1240 | " variable \n",
1241 | " q_nan \n",
1242 | " p_nan \n",
1243 | " q_zeros \n",
1244 | " p_zeros \n",
1245 | " unique \n",
1246 | " type \n",
1247 | " \n",
1248 | " \n",
1249 | " \n",
1250 | " \n",
1251 | " 0 \n",
1252 | " carat \n",
1253 | " 0 \n",
1254 | " 0.0 \n",
1255 | " 0 \n",
1256 | " 0.000000 \n",
1257 | " 273 \n",
1258 | " float64 \n",
1259 | " \n",
1260 | " \n",
1261 | " 1 \n",
1262 | " cut \n",
1263 | " 0 \n",
1264 | " 0.0 \n",
1265 | " 0 \n",
1266 | " 0.000000 \n",
1267 | " 5 \n",
1268 | " object \n",
1269 | " \n",
1270 | " \n",
1271 | " 2 \n",
1272 | " color \n",
1273 | " 0 \n",
1274 | " 0.0 \n",
1275 | " 0 \n",
1276 | " 0.000000 \n",
1277 | " 7 \n",
1278 | " object \n",
1279 | " \n",
1280 | " \n",
1281 | " 3 \n",
1282 | " clarity \n",
1283 | " 0 \n",
1284 | " 0.0 \n",
1285 | " 0 \n",
1286 | " 0.000000 \n",
1287 | " 8 \n",
1288 | " object \n",
1289 | " \n",
1290 | " \n",
1291 | " 4 \n",
1292 | " depth \n",
1293 | " 0 \n",
1294 | " 0.0 \n",
1295 | " 0 \n",
1296 | " 0.000000 \n",
1297 | " 184 \n",
1298 | " float64 \n",
1299 | " \n",
1300 | " \n",
1301 | " 5 \n",
1302 | " table \n",
1303 | " 0 \n",
1304 | " 0.0 \n",
1305 | " 0 \n",
1306 | " 0.000000 \n",
1307 | " 127 \n",
1308 | " float64 \n",
1309 | " \n",
1310 | " \n",
1311 | " 6 \n",
1312 | " price \n",
1313 | " 0 \n",
1314 | " 0.0 \n",
1315 | " 0 \n",
1316 | " 0.000000 \n",
1317 | " 11602 \n",
1318 | " int64 \n",
1319 | " \n",
1320 | " \n",
1321 | " 7 \n",
1322 | " x \n",
1323 | " 0 \n",
1324 | " 0.0 \n",
1325 | " 8 \n",
1326 | " 0.000148 \n",
1327 | " 554 \n",
1328 | " float64 \n",
1329 | " \n",
1330 | " \n",
1331 | " 8 \n",
1332 | " y \n",
1333 | " 0 \n",
1334 | " 0.0 \n",
1335 | " 7 \n",
1336 | " 0.000130 \n",
1337 | " 552 \n",
1338 | " float64 \n",
1339 | " \n",
1340 | " \n",
1341 | " 9 \n",
1342 | " z \n",
1343 | " 0 \n",
1344 | " 0.0 \n",
1345 | " 20 \n",
1346 | " 0.000371 \n",
1347 | " 375 \n",
1348 | " float64 \n",
1349 | " \n",
1350 | " \n",
1351 | "
\n",
1352 | "
"
1353 | ],
1354 | "text/plain": [
1355 | " variable q_nan p_nan q_zeros p_zeros unique type\n",
1356 | "0 carat 0 0.0 0 0.000000 273 float64\n",
1357 | "1 cut 0 0.0 0 0.000000 5 object\n",
1358 | "2 color 0 0.0 0 0.000000 7 object\n",
1359 | "3 clarity 0 0.0 0 0.000000 8 object\n",
1360 | "4 depth 0 0.0 0 0.000000 184 float64\n",
1361 | "5 table 0 0.0 0 0.000000 127 float64\n",
1362 | "6 price 0 0.0 0 0.000000 11602 int64\n",
1363 | "7 x 0 0.0 8 0.000148 554 float64\n",
1364 | "8 y 0 0.0 7 0.000130 552 float64\n",
1365 | "9 z 0 0.0 20 0.000371 375 float64"
1366 | ]
1367 | },
1368 | "execution_count": 22,
1369 | "metadata": {},
1370 | "output_type": "execute_result"
1371 | }
1372 | ],
1373 | "source": [
1374 | "diamonds = sns.load_dataset('diamonds')\n",
1375 | "\n",
1376 | "status(diamonds)"
1377 | ]
1378 | },
1379 | {
1380 | "cell_type": "code",
1381 | "execution_count": 23,
1382 | "metadata": {},
1383 | "outputs": [
1384 | {
1385 | "data": {
1386 | "text/html": [
1387 | "\n",
1388 | "\n",
1401 | "
\n",
1402 | " \n",
1403 | " \n",
1404 | " \n",
1405 | " v1 \n",
1406 | " v2 \n",
1407 | " R \n",
1408 | " R2 \n",
1409 | " \n",
1410 | " \n",
1411 | " \n",
1412 | " \n",
1413 | " 1 \n",
1414 | " depth \n",
1415 | " carat \n",
1416 | " 0.028224 \n",
1417 | " 0.000797 \n",
1418 | " \n",
1419 | " \n",
1420 | " 2 \n",
1421 | " table \n",
1422 | " carat \n",
1423 | " 0.181618 \n",
1424 | " 0.032985 \n",
1425 | " \n",
1426 | " \n",
1427 | " 3 \n",
1428 | " price \n",
1429 | " carat \n",
1430 | " 0.921591 \n",
1431 | " 0.849331 \n",
1432 | " \n",
1433 | " \n",
1434 | " 4 \n",
1435 | " x \n",
1436 | " carat \n",
1437 | " 0.975094 \n",
1438 | " 0.950809 \n",
1439 | " \n",
1440 | " \n",
1441 | " 5 \n",
1442 | " y \n",
1443 | " carat \n",
1444 | " 0.951722 \n",
1445 | " 0.905775 \n",
1446 | " \n",
1447 | " \n",
1448 | " 6 \n",
1449 | " z \n",
1450 | " carat \n",
1451 | " 0.953387 \n",
1452 | " 0.908947 \n",
1453 | " \n",
1454 | " \n",
1455 | " 7 \n",
1456 | " carat \n",
1457 | " depth \n",
1458 | " 0.028224 \n",
1459 | " 0.000797 \n",
1460 | " \n",
1461 | " \n",
1462 | " 9 \n",
1463 | " table \n",
1464 | " depth \n",
1465 | " -0.295779 \n",
1466 | " 0.087485 \n",
1467 | " \n",
1468 | " \n",
1469 | " 10 \n",
1470 | " price \n",
1471 | " depth \n",
1472 | " -0.010647 \n",
1473 | " 0.000113 \n",
1474 | " \n",
1475 | " \n",
1476 | " 11 \n",
1477 | " x \n",
1478 | " depth \n",
1479 | " -0.025289 \n",
1480 | " 0.000640 \n",
1481 | " \n",
1482 | " \n",
1483 | "
\n",
1484 | "
"
1485 | ],
1486 | "text/plain": [
1487 | " v1 v2 R R2\n",
1488 | "1 depth carat 0.028224 0.000797\n",
1489 | "2 table carat 0.181618 0.032985\n",
1490 | "3 price carat 0.921591 0.849331\n",
1491 | "4 x carat 0.975094 0.950809\n",
1492 | "5 y carat 0.951722 0.905775\n",
1493 | "6 z carat 0.953387 0.908947\n",
1494 | "7 carat depth 0.028224 0.000797\n",
1495 | "9 table depth -0.295779 0.087485\n",
1496 | "10 price depth -0.010647 0.000113\n",
1497 | "11 x depth -0.025289 0.000640"
1498 | ]
1499 | },
1500 | "execution_count": 23,
1501 | "metadata": {},
1502 | "output_type": "execute_result"
1503 | }
1504 | ],
1505 | "source": [
1506 | "res=corr_pair(diamonds)\n",
1507 | "\n",
1508 | "res.head(10)"
1509 | ]
1510 | },
1511 | {
1512 | "cell_type": "markdown",
1513 | "metadata": {},
1514 | "source": [
1515 | "If `price` is the target..."
1516 | ]
1517 | },
1518 | {
1519 | "cell_type": "markdown",
1520 | "metadata": {},
1521 | "source": [
1522 | "Feature analysis for predictive modeling:"
1523 | ]
1524 | },
1525 | {
1526 | "cell_type": "code",
1527 | "execution_count": 24,
1528 | "metadata": {},
1529 | "outputs": [],
1530 | "source": [
1531 | "res_target_ordered=res[res['v2']=='price'].sort_values('R2', ascending=False)"
1532 | ]
1533 | },
1534 | {
1535 | "cell_type": "code",
1536 | "execution_count": null,
1537 | "metadata": {},
1538 | "outputs": [],
1539 | "source": [
1540 | "\n",
1541 | "\n",
1542 | "\n",
1543 | "\n"
1544 | ]
1545 | },
1546 | {
1547 | "cell_type": "markdown",
1548 | "metadata": {},
1549 | "source": [
1550 | "Get top 3 most correlated features:"
1551 | ]
1552 | },
1553 | {
1554 | "cell_type": "code",
1555 | "execution_count": 25,
1556 | "metadata": {},
1557 | "outputs": [
1558 | {
1559 | "data": {
1560 | "text/plain": [
1561 | "21 carat\n",
1562 | "25 x\n",
1563 | "26 y\n",
1564 | "Name: v1, dtype: object"
1565 | ]
1566 | },
1567 | "execution_count": 25,
1568 | "metadata": {},
1569 | "output_type": "execute_result"
1570 | }
1571 | ],
1572 | "source": [
1573 | "# Top 3:\n",
1574 | "top_vars=res_target_ordered[0:3]['v1']\n",
1575 | "top_vars"
1576 | ]
1577 | },
1578 | {
1579 | "cell_type": "code",
1580 | "execution_count": null,
1581 | "metadata": {},
1582 | "outputs": [],
1583 | "source": [
1584 | "\n",
1585 | "\n",
1586 | "\n",
1587 | "\n"
1588 | ]
1589 | },
1590 | {
1591 | "cell_type": "markdown",
1592 | "metadata": {},
1593 | "source": [
1594 | "Conversely: find the less relevant features, which are candidates for deletion (threshold R2 < 0.05)"
1595 | ]
1596 | },
1597 | {
1598 | "cell_type": "code",
1599 | "execution_count": 26,
1600 | "metadata": {},
1601 | "outputs": [
1602 | {
1603 | "data": {
1604 | "text/plain": [
1605 | "23 table\n",
1606 | "22 depth\n",
1607 | "Name: v1, dtype: object"
1608 | ]
1609 | },
1610 | "execution_count": 26,
1611 | "metadata": {},
1612 | "output_type": "execute_result"
1613 | }
1614 | ],
1615 | "source": [
1616 | "res_target_ordered[res_target_ordered['R2']<0.05]['v1']"
1617 | ]
1618 | },
1619 | {
1620 | "cell_type": "code",
1621 | "execution_count": null,
1622 | "metadata": {},
1623 | "outputs": [],
1624 | "source": [
1625 | "\n",
1626 | "\n",
1627 | "\n",
1628 | "\n",
1629 | "\n",
1630 | "\n",
1631 | "\n",
1632 | "\n",
1633 | "\n"
1634 | ]
1635 | },
1636 | {
1637 | "cell_type": "markdown",
1638 | "metadata": {},
1639 | "source": [
1640 | "### 1.5) Get numeric and categorical var names"
1641 | ]
1642 | },
1643 | {
1644 | "cell_type": "markdown",
1645 | "metadata": {},
1646 | "source": [
1647 | "This is not fancy, but it is useful internally and can be used with sklearn pipelines."
1648 | ]
1649 | },
1650 | {
1651 | "cell_type": "code",
1652 | "execution_count": 27,
1653 | "metadata": {},
1654 | "outputs": [],
1655 | "source": [
1656 | "from funpymodeling.exploratory import cat_vars, num_vars"
1657 | ]
1658 | },
1659 | {
1660 | "cell_type": "code",
1661 | "execution_count": 28,
1662 | "metadata": {},
1663 | "outputs": [
1664 | {
1665 | "data": {
1666 | "text/html": [
1667 | "\n",
1668 | "\n",
1681 | "
\n",
1682 | " \n",
1683 | " \n",
1684 | " \n",
1685 | " variable \n",
1686 | " q_nan \n",
1687 | " p_nan \n",
1688 | " q_zeros \n",
1689 | " p_zeros \n",
1690 | " unique \n",
1691 | " type \n",
1692 | " \n",
1693 | " \n",
1694 | " \n",
1695 | " \n",
1696 | " 0 \n",
1697 | " total_bill \n",
1698 | " 0 \n",
1699 | " 0.0 \n",
1700 | " 0 \n",
1701 | " 0.0 \n",
1702 | " 229 \n",
1703 | " float64 \n",
1704 | " \n",
1705 | " \n",
1706 | " 1 \n",
1707 | " tip \n",
1708 | " 0 \n",
1709 | " 0.0 \n",
1710 | " 0 \n",
1711 | " 0.0 \n",
1712 | " 123 \n",
1713 | " float64 \n",
1714 | " \n",
1715 | " \n",
1716 | " 2 \n",
1717 | " sex \n",
1718 | " 0 \n",
1719 | " 0.0 \n",
1720 | " 0 \n",
1721 | " 0.0 \n",
1722 | " 2 \n",
1723 | " category \n",
1724 | " \n",
1725 | " \n",
1726 | " 3 \n",
1727 | " smoker \n",
1728 | " 0 \n",
1729 | " 0.0 \n",
1730 | " 0 \n",
1731 | " 0.0 \n",
1732 | " 2 \n",
1733 | " category \n",
1734 | " \n",
1735 | " \n",
1736 | " 4 \n",
1737 | " day \n",
1738 | " 0 \n",
1739 | " 0.0 \n",
1740 | " 0 \n",
1741 | " 0.0 \n",
1742 | " 4 \n",
1743 | " category \n",
1744 | " \n",
1745 | " \n",
1746 | " 5 \n",
1747 | " time \n",
1748 | " 0 \n",
1749 | " 0.0 \n",
1750 | " 0 \n",
1751 | " 0.0 \n",
1752 | " 2 \n",
1753 | " category \n",
1754 | " \n",
1755 | " \n",
1756 | " 6 \n",
1757 | " size \n",
1758 | " 0 \n",
1759 | " 0.0 \n",
1760 | " 0 \n",
1761 | " 0.0 \n",
1762 | " 6 \n",
1763 | " int64 \n",
1764 | " \n",
1765 | " \n",
1766 | "
\n",
1767 | "
"
1768 | ],
1769 | "text/plain": [
1770 | " variable q_nan p_nan q_zeros p_zeros unique type\n",
1771 | "0 total_bill 0 0.0 0 0.0 229 float64\n",
1772 | "1 tip 0 0.0 0 0.0 123 float64\n",
1773 | "2 sex 0 0.0 0 0.0 2 category\n",
1774 | "3 smoker 0 0.0 0 0.0 2 category\n",
1775 | "4 day 0 0.0 0 0.0 4 category\n",
1776 | "5 time 0 0.0 0 0.0 2 category\n",
1777 | "6 size 0 0.0 0 0.0 6 int64"
1778 | ]
1779 | },
1780 | "execution_count": 28,
1781 | "metadata": {},
1782 | "output_type": "execute_result"
1783 | }
1784 | ],
1785 | "source": [
1786 | "status(tips)"
1787 | ]
1788 | },
1789 | {
1790 | "cell_type": "markdown",
1791 | "metadata": {},
1792 | "source": [
1793 | "Retrieve categorical var names:"
1794 | ]
1795 | },
1796 | {
1797 | "cell_type": "code",
1798 | "execution_count": 29,
1799 | "metadata": {},
1800 | "outputs": [
1801 | {
1802 | "data": {
1803 | "text/plain": [
1804 | "Index(['sex', 'smoker', 'day', 'time'], dtype='object')"
1805 | ]
1806 | },
1807 | "execution_count": 29,
1808 | "metadata": {},
1809 | "output_type": "execute_result"
1810 | }
1811 | ],
1812 | "source": [
1813 | "cat_vars(tips)"
1814 | ]
1815 | },
1816 | {
1817 | "cell_type": "markdown",
1818 | "metadata": {},
1819 | "source": [
1820 | "Retrieve numerical var names:"
1821 | ]
1822 | },
1823 | {
1824 | "cell_type": "code",
1825 | "execution_count": 30,
1826 | "metadata": {},
1827 | "outputs": [
1828 | {
1829 | "data": {
1830 | "text/plain": [
1831 | "Index(['total_bill', 'tip', 'size'], dtype='object')"
1832 | ]
1833 | },
1834 | "execution_count": 30,
1835 | "metadata": {},
1836 | "output_type": "execute_result"
1837 | }
1838 | ],
1839 | "source": [
1840 | "num_vars(tips)"
1841 | ]
1842 | },
1843 | {
1844 | "cell_type": "code",
1845 | "execution_count": null,
1846 | "metadata": {},
1847 | "outputs": [],
1848 | "source": [
1849 | "\n",
1850 | "\n",
1851 | "\n",
1852 | "\n",
1853 | "\n",
1854 | "\n",
1855 | "\n",
1856 | "\n",
1857 | "\n",
1858 | "\n"
1859 | ]
1860 | },
1861 | {
1862 | "cell_type": "markdown",
1863 | "metadata": {},
1864 | "source": [
1865 | "## 2) Data Preparation"
1866 | ]
1867 | },
1868 | {
1869 | "cell_type": "markdown",
1870 | "metadata": {},
1871 | "source": [
1872 | "### 2.1) Convert \"almost-everything\" into a pandas dataframe"
1873 | ]
1874 | },
1875 | {
1876 | "cell_type": "code",
1877 | "execution_count": 31,
1878 | "metadata": {},
1879 | "outputs": [],
1880 | "source": [
1881 | "from funpymodeling.data_prep import todf\n",
1882 | "\n",
1883 | "import numpy as np"
1884 | ]
1885 | },
1886 | {
1887 | "cell_type": "markdown",
1888 | "metadata": {},
1889 | "source": [
1890 | "Note: Yes, in certain scenarios this is not convenient for performance reasons. But in many scenarios we need/want to test something or do a quick exploration.\n",
1891 | "\n",
1892 | "`todf` is used as the entry point in many functions of `funPyModeling`."
1893 | ]
1894 | },
1895 | {
1896 | "cell_type": "code",
1897 | "execution_count": 32,
1898 | "metadata": {},
1899 | "outputs": [
1900 | {
1901 | "data": {
1902 | "text/html": [
1903 | "\n",
1904 | "\n",
1917 | "
\n",
1918 | " \n",
1919 | " \n",
1920 | " \n",
1921 | " 0 \n",
1922 | " \n",
1923 | " \n",
1924 | " \n",
1925 | " \n",
1926 | " 0 \n",
1927 | " 11 \n",
1928 | " \n",
1929 | " \n",
1930 | " 1 \n",
1931 | " 12 \n",
1932 | " \n",
1933 | " \n",
1934 | " 2 \n",
1935 | " 5 \n",
1936 | " \n",
1937 | " \n",
1938 | " 3 \n",
1939 | " 2 \n",
1940 | " \n",
1941 | " \n",
1942 | "
\n",
1943 | "
"
1944 | ],
1945 | "text/plain": [
1946 | " 0\n",
1947 | "0 11\n",
1948 | "1 12\n",
1949 | "2 5\n",
1950 | "3 2"
1951 | ]
1952 | },
1953 | "execution_count": 32,
1954 | "metadata": {},
1955 | "output_type": "execute_result"
1956 | }
1957 | ],
1958 | "source": [
1959 | "# 1D List\n",
1960 | "list_1d = [11, 12, 5, 2] \n",
1961 | "todf(list_1d)"
1962 | ]
1963 | },
1964 | {
1965 | "cell_type": "code",
1966 | "execution_count": 33,
1967 | "metadata": {},
1968 | "outputs": [
1969 | {
1970 | "data": {
1971 | "text/html": [
1972 | "\n",
1973 | "\n",
1986 | "
\n",
1987 | " \n",
1988 | " \n",
1989 | " \n",
1990 | " 0 \n",
1991 | " 1 \n",
1992 | " 2 \n",
1993 | " 3 \n",
1994 | " \n",
1995 | " \n",
1996 | " \n",
1997 | " \n",
1998 | " 0 \n",
1999 | " 11 \n",
2000 | " 12 \n",
2001 | " 5 \n",
2002 | " 2 \n",
2003 | " \n",
2004 | " \n",
2005 | " 1 \n",
2006 | " 15 \n",
2007 | " 24 \n",
2008 | " 6 \n",
2009 | " 10 \n",
2010 | " \n",
2011 | " \n",
2012 | " 2 \n",
2013 | " 10 \n",
2014 | " 8 \n",
2015 | " 12 \n",
2016 | " 5 \n",
2017 | " \n",
2018 | " \n",
2019 | " 3 \n",
2020 | " 12 \n",
2021 | " 15 \n",
2022 | " 8 \n",
2023 | " 6 \n",
2024 | " \n",
2025 | " \n",
2026 | "
\n",
2027 | "
"
2028 | ],
2029 | "text/plain": [
2030 | " 0 1 2 3\n",
2031 | "0 11 12 5 2\n",
2032 | "1 15 24 6 10\n",
2033 | "2 10 8 12 5\n",
2034 | "3 12 15 8 6"
2035 | ]
2036 | },
2037 | "execution_count": 33,
2038 | "metadata": {},
2039 | "output_type": "execute_result"
2040 | }
2041 | ],
2042 | "source": [
2043 | "# 2D List\n",
2044 | "list_2d = [[11, 12, 5, 2], [15,24, 6,10], [10, 8, 12, 5], [12,15,8,6]]\n",
2045 | "todf(list_2d)"
2046 | ]
2047 | },
2048 | {
2049 | "cell_type": "code",
2050 | "execution_count": 34,
2051 | "metadata": {},
2052 | "outputs": [
2053 | {
2054 | "data": {
2055 | "text/html": [
2056 | "\n",
2057 | "\n",
2070 | "
\n",
2071 | " \n",
2072 | " \n",
2073 | " \n",
2074 | " 0 \n",
2075 | " \n",
2076 | " \n",
2077 | " \n",
2078 | " \n",
2079 | " 0 \n",
2080 | " 11 \n",
2081 | " \n",
2082 | " \n",
2083 | " 1 \n",
2084 | " 12 \n",
2085 | " \n",
2086 | " \n",
2087 | " 2 \n",
2088 | " 5 \n",
2089 | " \n",
2090 | " \n",
2091 | " 3 \n",
2092 | " 2 \n",
2093 | " \n",
2094 | " \n",
2095 | "
\n",
2096 | "
"
2097 | ],
2098 | "text/plain": [
2099 | " 0\n",
2100 | "0 11\n",
2101 | "1 12\n",
2102 | "2 5\n",
2103 | "3 2"
2104 | ]
2105 | },
2106 | "execution_count": 34,
2107 | "metadata": {},
2108 | "output_type": "execute_result"
2109 | }
2110 | ],
2111 | "source": [
2112 | "# 1D numpy array\n",
2113 | "array_1d = np.array(list_1d)\n",
2114 | "todf(array_1d)"
2115 | ]
2116 | },
2117 | {
2118 | "cell_type": "code",
2119 | "execution_count": 35,
2120 | "metadata": {},
2121 | "outputs": [
2122 | {
2123 | "data": {
2124 | "text/html": [
2125 | "\n",
2126 | "\n",
2139 | "
\n",
2140 | " \n",
2141 | " \n",
2142 | " \n",
2143 | " 0 \n",
2144 | " 1 \n",
2145 | " 2 \n",
2146 | " 3 \n",
2147 | " \n",
2148 | " \n",
2149 | " \n",
2150 | " \n",
2151 | " 0 \n",
2152 | " 11 \n",
2153 | " 12 \n",
2154 | " 5 \n",
2155 | " 2 \n",
2156 | " \n",
2157 | " \n",
2158 | " 1 \n",
2159 | " 15 \n",
2160 | " 24 \n",
2161 | " 6 \n",
2162 | " 10 \n",
2163 | " \n",
2164 | " \n",
2165 | " 2 \n",
2166 | " 10 \n",
2167 | " 8 \n",
2168 | " 12 \n",
2169 | " 5 \n",
2170 | " \n",
2171 | " \n",
2172 | " 3 \n",
2173 | " 12 \n",
2174 | " 15 \n",
2175 | " 8 \n",
2176 | " 6 \n",
2177 | " \n",
2178 | " \n",
2179 | "
\n",
2180 | "
"
2181 | ],
2182 | "text/plain": [
2183 | " 0 1 2 3\n",
2184 | "0 11 12 5 2\n",
2185 | "1 15 24 6 10\n",
2186 | "2 10 8 12 5\n",
2187 | "3 12 15 8 6"
2188 | ]
2189 | },
2190 | "execution_count": 35,
2191 | "metadata": {},
2192 | "output_type": "execute_result"
2193 | }
2194 | ],
2195 | "source": [
2196 | "# 2D numpy array\n",
2197 | "array_2d = np.array(list_2d)\n",
2198 | "todf(array_2d)"
2199 | ]
2200 | },
2201 | {
2202 | "cell_type": "code",
2203 | "execution_count": 36,
2204 | "metadata": {},
2205 | "outputs": [
2206 | {
2207 | "data": {
2208 | "text/html": [
2209 | "\n",
2210 | "\n",
2223 | "
\n",
2224 | " \n",
2225 | " \n",
2226 | " \n",
2227 | " v1 \n",
2228 | " v2 \n",
2229 | " \n",
2230 | " \n",
2231 | " \n",
2232 | " \n",
2233 | " 0 \n",
2234 | " 11 \n",
2235 | " 15 \n",
2236 | " \n",
2237 | " \n",
2238 | " 1 \n",
2239 | " 12 \n",
2240 | " 24 \n",
2241 | " \n",
2242 | " \n",
2243 | " 2 \n",
2244 | " 5 \n",
2245 | " 6 \n",
2246 | " \n",
2247 | " \n",
2248 | " 3 \n",
2249 | " 2 \n",
2250 | " 10 \n",
2251 | " \n",
2252 | " \n",
2253 | "
\n",
2254 | "
"
2255 | ],
2256 | "text/plain": [
2257 | " v1 v2\n",
2258 | "0 11 15\n",
2259 | "1 12 24\n",
2260 | "2 5 6\n",
2261 | "3 2 10"
2262 | ]
2263 | },
2264 | "execution_count": 36,
2265 | "metadata": {},
2266 | "output_type": "execute_result"
2267 | }
2268 | ],
2269 | "source": [
2270 | "# Data frame (in=out)\n",
2271 | "pd_df=pd.DataFrame({'v1':[11, 12, 5, 2], 'v2':[15,24, 6,10]}) #\n",
2272 | "todf(pd_df)"
2273 | ]
2274 | },
2275 | {
2276 | "cell_type": "code",
2277 | "execution_count": 37,
2278 | "metadata": {},
2279 | "outputs": [
2280 | {
2281 | "data": {
2282 | "text/html": [
2283 | "\n",
2284 | "\n",
2297 | "
\n",
2298 | " \n",
2299 | " \n",
2300 | " \n",
2301 | " v1 \n",
2302 | " \n",
2303 | " \n",
2304 | " \n",
2305 | " \n",
2306 | " 0 \n",
2307 | " 11 \n",
2308 | " \n",
2309 | " \n",
2310 | " 1 \n",
2311 | " 12 \n",
2312 | " \n",
2313 | " \n",
2314 | " 2 \n",
2315 | " 5 \n",
2316 | " \n",
2317 | " \n",
2318 | " 3 \n",
2319 | " 2 \n",
2320 | " \n",
2321 | " \n",
2322 | "
\n",
2323 | "
"
2324 | ],
2325 | "text/plain": [
2326 | " v1\n",
2327 | "0 11\n",
2328 | "1 12\n",
2329 | "2 5\n",
2330 | "3 2"
2331 | ]
2332 | },
2333 | "execution_count": 37,
2334 | "metadata": {},
2335 | "output_type": "execute_result"
2336 | }
2337 | ],
2338 | "source": [
2339 | "# Pandas series\n",
2340 | "todf(pd_df['v1'])"
2341 | ]
2342 | },
2343 | {
2344 | "cell_type": "markdown",
2345 | "metadata": {},
2346 | "source": [
2347 | "It raises an error for objects with more than 2 dimensions:"
2348 | ]
2349 | },
2350 | {
2351 | "cell_type": "code",
2352 | "execution_count": 41,
2353 | "metadata": {},
2354 | "outputs": [
2355 | {
2356 | "name": "stdout",
2357 | "output_type": "stream",
2358 | "text": [
2359 | "(1, 4, 4)\n"
2360 | ]
2361 | }
2362 | ],
2363 | "source": [
2364 | "list_3d = np.array([[[11, 12, 5, 2], [15,24, 6,10], [10, 8, 12, 5], [12,15,8,6]]]) # error\n",
2365 | "print(list_3d.shape)\n",
2366 | "# todf(list_3d) # <- error"
2367 | ]
2368 | },
2369 | {
2370 | "cell_type": "markdown",
2371 | "metadata": {},
2372 | "source": [
2373 | "It raises the error: `Exception: I live in flattland! (can't handle objects with more than 2 dimensions)`"
2374 | ]
2375 | },
2376 | {
2377 | "cell_type": "code",
2378 | "execution_count": null,
2379 | "metadata": {},
2380 | "outputs": [],
2381 | "source": [
2382 | "\n",
2383 | "\n",
2384 | "\n",
2385 | "\n",
2386 | "\n",
2387 | "\n",
2388 | "\n",
2389 | "\n",
2390 | "\n",
2391 | "\n",
2392 | "\n"
2393 | ]
2394 | },
2395 | {
2396 | "cell_type": "markdown",
2397 | "metadata": {},
2398 | "source": [
2399 | "## 3) Model validation: Clustering"
2400 | ]
2401 | },
2402 | {
2403 | "cell_type": "code",
2404 | "execution_count": 43,
2405 | "metadata": {},
2406 | "outputs": [],
2407 | "source": [
2408 | "from funpymodeling.model_validation import coord_plot"
2409 | ]
2410 | },
2411 | {
2412 | "cell_type": "code",
2413 | "execution_count": 44,
2414 | "metadata": {},
2415 | "outputs": [],
2416 | "source": [
2417 | "from sklearn.cluster import KMeans\n",
2418 | "\n",
2419 | "x = iris.drop('species', axis=1)\n",
2420 | "\n",
2421 | "mod_km=KMeans(n_clusters=3)\n",
2422 | "iris['cluster']=mod_km.fit_predict(x)\n"
2423 | ]
2424 | },
2425 | {
2426 | "cell_type": "code",
2427 | "execution_count": 45,
2428 | "metadata": {},
2429 | "outputs": [
2430 | {
2431 | "data": {
2432 | "image/png": "\n",
2433 | "text/plain": [
2434 | ""
2435 | ]
2436 | },
2437 | "metadata": {
2438 | "needs_background": "light"
2439 | },
2440 | "output_type": "display_data"
2441 | }
2442 | ],
2443 | "source": [
2444 | "x_grp, x_grp_norm=coord_plot(iris, 'cluster')"
2445 | ]
2446 | },
2447 | {
2448 | "cell_type": "code",
2449 | "execution_count": 46,
2450 | "metadata": {},
2451 | "outputs": [
2452 | {
2453 | "data": {
2454 | "text/html": [
2455 | "\n",
2456 | "\n",
2469 | "
\n",
2470 | " \n",
2471 | " \n",
2472 | " \n",
2473 | " sepal_length \n",
2474 | " sepal_width \n",
2475 | " petal_length \n",
2476 | " petal_width \n",
2477 | " cluster \n",
2478 | " \n",
2479 | " \n",
2480 | " \n",
2481 | " \n",
2482 | " 0 \n",
2483 | " 5.006000 \n",
2484 | " 3.428000 \n",
2485 | " 1.462000 \n",
2486 | " 0.246000 \n",
2487 | " 0 \n",
2488 | " \n",
2489 | " \n",
2490 | " 1 \n",
2491 | " 5.901613 \n",
2492 | " 2.748387 \n",
2493 | " 4.393548 \n",
2494 | " 1.433871 \n",
2495 | " 1 \n",
2496 | " \n",
2497 | " \n",
2498 | " 2 \n",
2499 | " 6.850000 \n",
2500 | " 3.073684 \n",
2501 | " 5.742105 \n",
2502 | " 2.071053 \n",
2503 | " 2 \n",
2504 | " \n",
2505 | " \n",
2506 | "
\n",
2507 | "
"
2508 | ],
2509 | "text/plain": [
2510 | " sepal_length sepal_width petal_length petal_width cluster\n",
2511 | "0 5.006000 3.428000 1.462000 0.246000 0\n",
2512 | "1 5.901613 2.748387 4.393548 1.433871 1\n",
2513 | "2 6.850000 3.073684 5.742105 2.071053 2"
2514 | ]
2515 | },
2516 | "execution_count": 46,
2517 | "metadata": {},
2518 | "output_type": "execute_result"
2519 | }
2520 | ],
2521 | "source": [
2522 | "x_grp"
2523 | ]
2524 | },
2525 | {
2526 | "cell_type": "code",
2527 | "execution_count": 47,
2528 | "metadata": {},
2529 | "outputs": [
2530 | {
2531 | "data": {
2532 | "text/html": [
2533 | "\n",
2534 | "\n",
2547 | "
\n",
2548 | " \n",
2549 | " \n",
2550 | " \n",
2551 | " sepal_length \n",
2552 | " sepal_width \n",
2553 | " petal_length \n",
2554 | " petal_width \n",
2555 | " cluster \n",
2556 | " \n",
2557 | " \n",
2558 | " \n",
2559 | " \n",
2560 | " 0 \n",
2561 | " 0.00000 \n",
2562 | " 1.000000 \n",
2563 | " 0.000000 \n",
2564 | " 0.000000 \n",
2565 | " 0 \n",
2566 | " \n",
2567 | " \n",
2568 | " 1 \n",
2569 | " 0.48569 \n",
2570 | " 0.000000 \n",
2571 | " 0.684924 \n",
2572 | " 0.650869 \n",
2573 | " 1 \n",
2574 | " \n",
2575 | " \n",
2576 | " 2 \n",
2577 | " 1.00000 \n",
2578 | " 0.478651 \n",
2579 | " 1.000000 \n",
2580 | " 1.000000 \n",
2581 | " 2 \n",
2582 | " \n",
2583 | " \n",
2584 | "
\n",
2585 | "
"
2586 | ],
2587 | "text/plain": [
2588 | " sepal_length sepal_width petal_length petal_width cluster\n",
2589 | "0 0.00000 1.000000 0.000000 0.000000 0\n",
2590 | "1 0.48569 0.000000 0.684924 0.650869 1\n",
2591 | "2 1.00000 0.478651 1.000000 1.000000 2"
2592 | ]
2593 | },
2594 | "execution_count": 47,
2595 | "metadata": {},
2596 | "output_type": "execute_result"
2597 | }
2598 | ],
2599 | "source": [
2600 | "x_grp_norm"
2601 | ]
2602 | },
2603 | {
2604 | "cell_type": "code",
2605 | "execution_count": null,
2606 | "metadata": {},
2607 | "outputs": [],
2608 | "source": []
2609 | }
2610 | ],
2611 | "metadata": {
2612 | "kernelspec": {
2613 | "display_name": "Python 3",
2614 | "language": "python",
2615 | "name": "python3"
2616 | },
2617 | "language_info": {
2618 | "codemirror_mode": {
2619 | "name": "ipython",
2620 | "version": 3
2621 | },
2622 | "file_extension": ".py",
2623 | "mimetype": "text/x-python",
2624 | "name": "python",
2625 | "nbconvert_exporter": "python",
2626 | "pygments_lexer": "ipython3",
2627 | "version": "3.7.5"
2628 | }
2629 | },
2630 | "nbformat": 4,
2631 | "nbformat_minor": 4
2632 | }
2633 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [tool.poetry]  # package metadata consumed by Poetry
2 | name = "funpymodeling"
3 | version = "0.1.7"
4 | description = "A package designed for data scientists and teachers, to speed up their ML projects, focused on exploratory data analysis, data preparation, and model performance."
5 | license="MIT"
6 | authors = ["Pablo Casas "]
7 | readme = "README.md"
8 | repository = "https://github.com/pablo14/funPyModeling"
9 | documentation = "https://github.com/pablo14/funPyModeling"  # same URL as `repository`
10 |
11 | [tool.poetry.dependencies]  # main dependency table: everything listed here is required by the published package
12 | python = ">=3.8.1,<4.0"  # supported interpreter range
13 | pandas = "^2.0.2"
14 | numpy = "^1.24.3"
15 | matplotlib = "^3.7.1"
16 | typing-extensions = "^4.6.3"
17 | scikit-learn = "^1.2.2"
18 | seaborn = "^0.12.2"
19 | flake8 = "^6.0.0"  # NOTE(review): linter, also run via .pre-commit-config.yaml; looks dev-only, consider a dev dependency group (confirm, then re-lock)
20 | jupyter = "^1.0.0"  # NOTE(review): presumably only needed for notebooks/; looks dev-only (confirm)
21 | pre-commit = "^3.3.2"  # NOTE(review): git-hook runner; looks dev-only (confirm)
22 | pytest = "^7.3.1"  # NOTE(review): test runner; looks dev-only (confirm)
23 |
24 |
25 | [build-system]  # build backend declaration used when building the package
26 | requires = ["poetry-core"]  # no version constraint on the build requirement
27 | build-backend = "poetry.core.masonry.api"
28 |
--------------------------------------------------------------------------------