├── Day_1_Scientific_Python
├── pandas
│   └── _solutions
│   │   ├── pandas_01_data_structures3.py
│   │   ├── pandas_01_data_structures2.py
│   │   ├── pandas_01_data_structures4.py
│   │   ├── pandas_03a_selecting_data10.py
│   │   ├── pandas_02_basic_operations1.py
│   │   ├── pandas_02_basic_operations5.py
│   │   ├── pandas_04_time_series_data1.py
│   │   ├── pandas_02_basic_operations4.py
│   │   ├── pandas_02_basic_operations6.py
│   │   ├── pandas_02_basic_operations9.py
│   │   ├── pandas_03a_selecting_data5.py
│   │   ├── pandas_01_data_structures5.py
│   │   ├── pandas_02_basic_operations7.py
│   │   ├── pandas_02_basic_operations8.py
│   │   ├── pandas_03a_selecting_data18.py
│   │   ├── pandas_03a_selecting_data4.py
│   │   ├── pandas_04_time_series_data2.py
│   │   ├── pandas_07_reshaping_data3.py
│   │   ├── pandas_02_basic_operations2.py
│   │   ├── pandas_03a_selecting_data11.py
│   │   ├── pandas_03b_indexing6.py
│   │   ├── pandas_03b_indexing7.py
│   │   ├── pandas_06_groupby_operations1.py
│   │   ├── pandas_01_data_structures1.py
│   │   ├── pandas_01_data_structures6.py
│   │   ├── pandas_03a_selecting_data1.py
│   │   ├── pandas_03a_selecting_data12.py
│   │   ├── pandas_03a_selecting_data15.py
│   │   ├── pandas_03a_selecting_data17.py
│   │   ├── pandas_03a_selecting_data19.py
│   │   ├── pandas_03a_selecting_data9.py
│   │   ├── pandas_04_time_series_data3.py
│   │   ├── pandas_04_time_series_data5.py
│   │   ├── pandas_06_groupby_operations18.py
│   │   ├── pandas_06_groupby_operations4.py
│   │   ├── pandas_02_basic_operations3.py
│   │   ├── pandas_03a_selecting_data16.py
│   │   ├── pandas_03a_selecting_data3.py
│   │   ├── pandas_03a_selecting_data6.py
│   │   ├── pandas_06_groupby_operations12.py
│   │   ├── pandas_06_groupby_operations7.py
│   │   ├── pandas_02_basic_operations10.py
│   │   ├── pandas_03a_selecting_data8.py
│   │   ├── pandas_04_time_series_data7.py
│   │   ├── pandas_06_groupby_operations29.py
│   │   ├── pandas_04_time_series_data4.py
│   │   ├── pandas_06_groupby_operations21.py
│   │   ├── pandas_07_reshaping_data5.py
│   │   ├── pandas_03a_selecting_data13.py
│   │   ├── pandas_03a_selecting_data2.py
│   │   ├── pandas_03a_selecting_data7.py
│   │   ├── pandas_03b_indexing2.py
│   │   ├── pandas_07_reshaping_data8.py
│   │   ├── pandas_03a_selecting_data14.py
│   │   ├── pandas_03b_indexing4.py
│   │   ├── pandas_03b_indexing5.py
│   │   ├── pandas_05_combining_datasets.py
│   │   ├── pandas_06_groupby_operations8.py
│   │   ├── pandas_03b_indexing1.py
│   │   ├── pandas_07_reshaping_data9.py
│   │   ├── pandas_04_time_series_data10.py
│   │   ├── pandas_04_time_series_data8.py
│   │   ├── pandas_06_groupby_operations15.py
│   │   ├── pandas_06_groupby_operations16.py
│   │   ├── pandas_06_groupby_operations19.py
│   │   ├── pandas_06_groupby_operations2.py
│   │   ├── pandas_06_groupby_operations28.py
│   │   ├── pandas_06_groupby_operations6.py
│   │   ├── pandas_06_groupby_operations3.py
│   │   ├── pandas_07_reshaping_data12.py
│   │   ├── pandas_03b_indexing3.py
│   │   ├── pandas_06_groupby_operations20.py
│   │   ├── pandas_06_groupby_operations22.py
│   │   ├── pandas_06_groupby_operations13.py
│   │   ├── pandas_06_groupby_operations14.py
│   │   ├── pandas_07_reshaping_data1.py
│   │   ├── pandas_07_reshaping_data4.py
│   │   ├── pandas_07_reshaping_data6.py
│   │   ├── pandas_04_time_series_data9.py
│   │   ├── pandas_06_groupby_operations23.py
│   │   ├── pandas_06_groupby_operations26.py
│   │   ├── pandas_06_groupby_operations30.py
│   │   ├── pandas_06_groupby_operations31.py
│   │   ├── pandas_06_groupby_operations24.py
│   │   ├── pandas_06_groupby_operations11.py
│   │   ├── pandas_06_groupby_operations17.py
│   │   ├── pandas_06_groupby_operations5.py
│   │   ├── pandas_03a_selecting_data20.py
│   │   ├── pandas_03a_selecting_data21.py
│   │   ├── pandas_07_reshaping_data10.py
│   │   ├── pandas_06_groupby_operations9.py
│   │   ├── pandas_06_groupby_operations25.py
│   │   ├── pandas_06_groupby_operations27.py
│   │   ├── pandas_07_reshaping_data11.py
│   │   ├── pandas_06_groupby_operations10.py
│   │   ├── pandas_04_time_series_data6.py
│   │   ├── pandas_07_reshaping_data7.py
│   │   └── pandas_07_reshaping_data2.py
├── images
│   ├── axis.png
│   ├── broadcasting.png
│   ├── kmeans_illustration.png
│   └── tidyr-spread-gather.gif
├── data
│   ├── spectra.mat
│   ├── kmeans_data.csv
│   ├── brain_size.csv
│   └── inflammation-01.csv
├── img
│   ├── dataframe.png
│   ├── pivot_excel.png
│   ├── splitApplyCombine.png
│   └── logoUPSayPlusCDS_990.png
├── README.md
├── numpy_with_answers
│   ├── numpys
│   │   ├── broadcasting.ipynb
│   │   ├── stacking.ipynb
│   │   ├── savez.ipynb
│   │   ├── test_yourself.ipynb
│   │   ├── fancy_indexing.ipynb
│   │   ├── filtering_data.ipynb
│   │   ├── boolean_mask.ipynb
│   │   ├── numpy_intro.ipynb
│   │   ├── operations.ipynb
│   │   └── dataset_intro.ipynb
│   └── 01-numpy-introduction.ipynb
├── numpys
│   ├── broadcasting.ipynb
│   ├── stacking.ipynb
│   ├── test_yourself.ipynb
│   ├── savez.ipynb
│   ├── fancy_indexing.ipynb
│   ├── filtering_data.ipynb
│   ├── boolean_mask.ipynb
│   ├── numpy_intro.ipynb
│   ├── operations.ipynb
│   └── dataset_intro.ipynb
└── 01-numpy-introduction.ipynb
├── figures
├── README.md
├── style_figs.py
├── plot_iris_visualization.py
├── plot_splines.py
└── polynomial_overfit_0.svg
├── img
├── postit.jpg
├── git
│   ├── coding.png
│   ├── writing.png
│   ├── commit_1.png
│   ├── commit_2.png
│   ├── commit_3.png
│   └── git-transport.png
├── sphinx-logo.png
├── splitApplyCombine.png
├── logoUPSayPlusCDS_990.png
├── slides.css
├── webfont-ubuntu-400-300-100.css
└── webfont-ubuntu-mono-400-700-400italic.css
├── datasets
└── README.md
├── requirements.txt
├── Day_2_Machine_Learning_Python
├── figures
│   ├── simple-decision-tree-adult-census.png
│   └── plot-simple-decision-tree-adult-census.py
├── 02_basic_preprocessing_exercise_01.ipynb
├── 03_basic_preprocessing_categorical_variables_exercise_01.ipynb
├── 02_basic_preprocessing_exercise_01_solution.ipynb
├── 04_basic_parameters_tuning_exercise_01.ipynb
├── 04_basic_parameters_tuning_exercise_02.ipynb
├── 04_basic_parameters_tuning_exercise_01_solution.ipynb
├── 03_basic_preprocessing_categorical_variables_exercise_02.ipynb
├── 03_basic_preprocessing_categorical_variables_exercise_01_solution.ipynb
└── 03_basic_preprocessing_categorical_variables_exercise_02_solution.ipynb
├── environment.yml
├── LICENSE
├── .gitignore
├── check_env.py
├── index.html
└── README.md
/Day_1_Scientific_Python/pandas/_solutions/pandas_01_data_structures3.py:
--------------------------------------------------------------------------------
1 | len(df)
--------------------------------------------------------------------------------
/Day_1_Scientific_Python/pandas/_solutions/pandas_01_data_structures2.py:
--------------------------------------------------------------------------------
1 | df.head()
--------------------------------------------------------------------------------
/Day_1_Scientific_Python/pandas/_solutions/pandas_01_data_structures4.py:
--------------------------------------------------------------------------------
1 | df['Age']
--------------------------------------------------------------------------------
/Day_1_Scientific_Python/pandas/_solutions/pandas_03a_selecting_data10.py:
--------------------------------------------------------------------------------
1 | len(titles)
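2 | # equivalently: titles.shape[0]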
-------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_02_basic_operations1.py: -------------------------------------------------------------------------------- 1 | df['Age'].mean() -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_02_basic_operations5.py: -------------------------------------------------------------------------------- 1 | df['Fare'].max() -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_04_time_series_data1.py: -------------------------------------------------------------------------------- 1 | data['2012':] -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_02_basic_operations4.py: -------------------------------------------------------------------------------- 1 | df['Survived'].mean() -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_02_basic_operations6.py: -------------------------------------------------------------------------------- 1 | df['Fare'].median() -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_02_basic_operations9.py: -------------------------------------------------------------------------------- 1 | np.log(df['Fare']) -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_03a_selecting_data5.py: -------------------------------------------------------------------------------- 1 | (df['Age'] > 70).sum() -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_01_data_structures5.py: -------------------------------------------------------------------------------- 1 | df['Fare'].plot(kind='box') -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_02_basic_operations7.py: -------------------------------------------------------------------------------- 1 | df['Fare'].quantile(0.75) -------------------------------------------------------------------------------- /figures/README.md: -------------------------------------------------------------------------------- 1 | This directory contains didactic figures and scripts that generate them. 
2 | -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_02_basic_operations8.py: -------------------------------------------------------------------------------- 1 | df['Fare'] / df['Fare'].mean() -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_03a_selecting_data18.py: -------------------------------------------------------------------------------- 1 | inception['n'].isnull().sum() -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_03a_selecting_data4.py: -------------------------------------------------------------------------------- 1 | len(df.loc[df['Age'] > 70, :]) -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_04_time_series_data2.py: -------------------------------------------------------------------------------- 1 | data[data.index.month == 1] -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_07_reshaping_data3.py: -------------------------------------------------------------------------------- 1 | df['Underaged'] = df['Age'] <= 18 -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_02_basic_operations2.py: -------------------------------------------------------------------------------- 1 | df['Age'].hist() #bins=30, log=True -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_03a_selecting_data11.py: -------------------------------------------------------------------------------- 1 | titles.sort_values('year').head(2) -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_03b_indexing6.py: -------------------------------------------------------------------------------- 1 | df.loc[df['Sex'] == 'male', 'Age'].mean() -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_03b_indexing7.py: -------------------------------------------------------------------------------- 1 | df.loc[df['Sex'] == 'female', 'Age'].mean() -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_06_groupby_operations1.py: -------------------------------------------------------------------------------- 1 | df.groupby('Sex')['Age'].mean() -------------------------------------------------------------------------------- /img/postit.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/paris-saclay-cds/data-science-workshop-2019/HEAD/img/postit.jpg -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_01_data_structures1.py: -------------------------------------------------------------------------------- 1 | df = pd.read_csv("../data/titanic.csv") -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_01_data_structures6.py: -------------------------------------------------------------------------------- 1 | 
df.sort_values(by='Age', ascending=False) -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_03a_selecting_data1.py: -------------------------------------------------------------------------------- 1 | males = df.loc[df['Sex'] == 'male', :] -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_03a_selecting_data12.py: -------------------------------------------------------------------------------- 1 | len(titles[titles['title'] == 'Hamlet']) -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_03a_selecting_data15.py: -------------------------------------------------------------------------------- 1 | len(titles[titles['year'] // 10 == 195]) -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_03a_selecting_data17.py: -------------------------------------------------------------------------------- 1 | len(inception[inception['n'].isnull()]) -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_03a_selecting_data19.py: -------------------------------------------------------------------------------- 1 | len(inception[inception['n'].notnull()]) -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_03a_selecting_data9.py: -------------------------------------------------------------------------------- 1 | df.loc[df['Surname'].str.len() > 15, :] -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_04_time_series_data3.py: -------------------------------------------------------------------------------- 1 | data[data.index.month.isin([4, 5, 6])] -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_04_time_series_data5.py: -------------------------------------------------------------------------------- 1 | data.resample('M').std().plot() # 'A' -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_06_groupby_operations18.py: -------------------------------------------------------------------------------- 1 | cast.character.value_counts().head(11) -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_06_groupby_operations4.py: -------------------------------------------------------------------------------- 1 | df.groupby('Sex')['Survived'].mean() -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_02_basic_operations3.py: -------------------------------------------------------------------------------- 1 | df['Survived'].sum() / len(df['Survived']) -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_03a_selecting_data16.py: -------------------------------------------------------------------------------- 1 | inception = cast[cast['title'] == 'Inception'] -------------------------------------------------------------------------------- 
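Note: the selecting and groupby solutions in this directory operate on 'titles' and 'cast' DataFrames that the exercise notebooks load before these snippets run. A minimal sketch of that setup, assuming the IMDb-derived CSV files used in the course (the file paths here are illustrative, not taken from this repository):

import pandas as pd

# one row per film, with 'title' and 'year' columns
titles = pd.read_csv('data/titles.csv')

# one row per credit, with 'title', 'year', 'name', 'type' ('actor' or 'actress'),
# 'character', and the billing position 'n' (NaN when unranked)
cast = pd.read_csv('data/cast.csv')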
/Day_1_Scientific_Python/pandas/_solutions/pandas_03a_selecting_data3.py: -------------------------------------------------------------------------------- 1 | df.loc[df['Sex'] == 'female', 'Age'].mean() -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_03a_selecting_data6.py: -------------------------------------------------------------------------------- 1 | df.loc[(df['Age'] > 30) & (df['Age'] <= 40), :] -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_06_groupby_operations12.py: -------------------------------------------------------------------------------- 1 | cast1990['name'].value_counts().head(10) -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_06_groupby_operations7.py: -------------------------------------------------------------------------------- 1 | titles['decade'] = titles['year'] // 10 * 10 -------------------------------------------------------------------------------- /img/git/coding.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/paris-saclay-cds/data-science-workshop-2019/HEAD/img/git/coding.png -------------------------------------------------------------------------------- /img/git/writing.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/paris-saclay-cds/data-science-workshop-2019/HEAD/img/git/writing.png -------------------------------------------------------------------------------- /img/sphinx-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/paris-saclay-cds/data-science-workshop-2019/HEAD/img/sphinx-logo.png -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_02_basic_operations10.py: -------------------------------------------------------------------------------- 1 | df['Fare_log'] = np.log(df['Fare']) 2 | df.head() -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_03a_selecting_data8.py: -------------------------------------------------------------------------------- 1 | df.loc[df['Surname'].str.startswith('Williams'), :] -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_04_time_series_data7.py: -------------------------------------------------------------------------------- 1 | subset.resample('M').agg(['mean', 'median']).plot() -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_06_groupby_operations29.py: -------------------------------------------------------------------------------- 1 | t = titles 2 | t.year.value_counts().head(3) -------------------------------------------------------------------------------- /img/git/commit_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/paris-saclay-cds/data-science-workshop-2019/HEAD/img/git/commit_1.png -------------------------------------------------------------------------------- /img/git/commit_2.png: 
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/paris-saclay-cds/data-science-workshop-2019/HEAD/img/git/commit_2.png
--------------------------------------------------------------------------------
/img/git/commit_3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/paris-saclay-cds/data-science-workshop-2019/HEAD/img/git/commit_3.png
--------------------------------------------------------------------------------
/Day_1_Scientific_Python/pandas/_solutions/pandas_04_time_series_data4.py:
--------------------------------------------------------------------------------
1 | data[(data.index.hour > 8) & (data.index.hour < 20)]
--------------------------------------------------------------------------------
/Day_1_Scientific_Python/pandas/_solutions/pandas_06_groupby_operations21.py:
--------------------------------------------------------------------------------
1 | cast[cast.year == 2010].name.value_counts().head(10)
--------------------------------------------------------------------------------
/Day_1_Scientific_Python/pandas/_solutions/pandas_07_reshaping_data5.py:
--------------------------------------------------------------------------------
1 | df.groupby(['Pclass', 'Sex'])['Survived'].mean().unstack()
--------------------------------------------------------------------------------
/Day_1_Scientific_Python/pandas/_solutions/pandas_03a_selecting_data13.py:
--------------------------------------------------------------------------------
1 | titles[titles.title == 'Treasure Island'].sort_values('year')
--------------------------------------------------------------------------------
/Day_1_Scientific_Python/pandas/_solutions/pandas_03a_selecting_data2.py:
--------------------------------------------------------------------------------
1 | males.loc[:,'Age'].mean()
2 | # or
3 | males['Age'].mean()
--------------------------------------------------------------------------------
/Day_1_Scientific_Python/pandas/_solutions/pandas_03a_selecting_data7.py:
--------------------------------------------------------------------------------
1 | df['Surname'] = df['Name'].apply(lambda x: x.split(',')[0])
--------------------------------------------------------------------------------
/Day_1_Scientific_Python/pandas/_solutions/pandas_03b_indexing2.py:
--------------------------------------------------------------------------------
1 | countries.loc[countries['density'] > 300, ['capital', 'population']]
--------------------------------------------------------------------------------
/Day_1_Scientific_Python/pandas/_solutions/pandas_07_reshaping_data8.py:
--------------------------------------------------------------------------------
1 | pd.crosstab(index=cast['year'], columns=cast['type']).plot()
--------------------------------------------------------------------------------
/img/git/git-transport.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/paris-saclay-cds/data-science-workshop-2019/HEAD/img/git/git-transport.png
--------------------------------------------------------------------------------
/img/splitApplyCombine.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/paris-saclay-cds/data-science-workshop-2019/HEAD/img/splitApplyCombine.png
--------------------------------------------------------------------------------
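Note: the indexing solutions (pandas_03b_indexing*.py) refer to a small 'countries' DataFrame built earlier in the corresponding notebook. A sketch of its assumed shape follows; the column names and the country index come from the solutions themselves, while the example rows are illustrative:

import pandas as pd

countries = pd.DataFrame({
    'country': ['Belgium', 'France', 'Germany', 'Netherlands', 'United Kingdom'],
    'population': [11.3, 64.3, 81.3, 16.9, 64.9],    # in millions, see pandas_03b_indexing1.py
    'area': [30510, 671308, 357050, 41526, 244820],  # in km^2
    'capital': ['Brussels', 'Paris', 'Berlin', 'Amsterdam', 'London'],
}).set_index('country')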
/Day_1_Scientific_Python/pandas/_solutions/pandas_03a_selecting_data14.py: -------------------------------------------------------------------------------- 1 | len(titles[(titles['year'] >= 1950) & (titles['year'] <= 1959)]) -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_03b_indexing4.py: -------------------------------------------------------------------------------- 1 | countries.loc['United Kingdom', 'capital'] = 'Cambridge' 2 | countries -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_03b_indexing5.py: -------------------------------------------------------------------------------- 1 | countries[(countries['density'] > 100) & (countries['density'] < 300)] -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_05_combining_datasets.py: -------------------------------------------------------------------------------- 1 | pd.merge(countries, country_economics, on='country', how='right') -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_06_groupby_operations8.py: -------------------------------------------------------------------------------- 1 | titles.groupby('decade').size().plot(kind='bar', color='green') -------------------------------------------------------------------------------- /img/logoUPSayPlusCDS_990.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/paris-saclay-cds/data-science-workshop-2019/HEAD/img/logoUPSayPlusCDS_990.png -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_03b_indexing1.py: -------------------------------------------------------------------------------- 1 | countries['density'] = countries['population']*1000000 / countries['area'] -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_07_reshaping_data9.py: -------------------------------------------------------------------------------- 1 | pd.crosstab(index=cast['year'], columns=cast['type']).plot(kind='area') -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_04_time_series_data10.py: -------------------------------------------------------------------------------- 1 | fig, ax = plt.subplots() 2 | data['2013'].mean().plot(kind='barh', ax=ax) -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_04_time_series_data8.py: -------------------------------------------------------------------------------- 1 | daily = data['LS06_348'].resample('D').mean() # daily averages calculated -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_06_groupby_operations15.py: -------------------------------------------------------------------------------- 1 | title_longest = titles['title'].str.len().nlargest(10) 2 | title_longest -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_06_groupby_operations16.py: 
-------------------------------------------------------------------------------- 1 | pd.options.display.max_colwidth = 210 2 | titles.loc[title_longest.index] -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_06_groupby_operations19.py: -------------------------------------------------------------------------------- 1 | cast[cast.name == 'Brad Pitt'].year.value_counts().sort_index().plot() -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_06_groupby_operations2.py: -------------------------------------------------------------------------------- 1 | # df['Survived'].sum() / len(df['Survived']) 2 | df['Survived'].mean() -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_06_groupby_operations28.py: -------------------------------------------------------------------------------- 1 | ratios_decade[:, 'actor'].plot() 2 | ratios_decade[:, 'actress'].plot() -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_06_groupby_operations6.py: -------------------------------------------------------------------------------- 1 | df.groupby('AgeClass')['Fare'].mean().plot(kind='bar', rot=0, color="C0") -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_06_groupby_operations3.py: -------------------------------------------------------------------------------- 1 | df25 = df[df['Age'] < 25] 2 | df25['Survived'].sum() / len(df25['Survived']) -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_07_reshaping_data12.py: -------------------------------------------------------------------------------- 1 | d = c.Superman - c.Batman 2 | print('Superman years:') 3 | print(len(d[d > 0.0])) -------------------------------------------------------------------------------- /Day_1_Scientific_Python/images/axis.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/paris-saclay-cds/data-science-workshop-2019/HEAD/Day_1_Scientific_Python/images/axis.png -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_03b_indexing3.py: -------------------------------------------------------------------------------- 1 | countries['density_ratio'] = countries['density'] / countries['density'].mean() 2 | countries -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_06_groupby_operations20.py: -------------------------------------------------------------------------------- 1 | titles[titles['title'].str.startswith('The Life')]['title'].value_counts().head(10) -------------------------------------------------------------------------------- /datasets/README.md: -------------------------------------------------------------------------------- 1 | `cps_85_wages.csv` is available at https://www.openml.org/d/534 2 | `adult-census.csv` is available at https://www.openml.org/d/15950 3 | -------------------------------------------------------------------------------- /Day_1_Scientific_Python/data/spectra.mat: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/paris-saclay-cds/data-science-workshop-2019/HEAD/Day_1_Scientific_Python/data/spectra.mat -------------------------------------------------------------------------------- /Day_1_Scientific_Python/img/dataframe.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/paris-saclay-cds/data-science-workshop-2019/HEAD/Day_1_Scientific_Python/img/dataframe.png -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_06_groupby_operations22.py: -------------------------------------------------------------------------------- 1 | pink = cast[cast['title'] == 'The Pink Panther'] 2 | pink.groupby(['year'])[['n']].max() -------------------------------------------------------------------------------- /Day_1_Scientific_Python/img/pivot_excel.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/paris-saclay-cds/data-science-workshop-2019/HEAD/Day_1_Scientific_Python/img/pivot_excel.png -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_06_groupby_operations13.py: -------------------------------------------------------------------------------- 1 | hamlets = titles[titles['title'].str.contains('Hamlet')] 2 | hamlets['title'].value_counts() -------------------------------------------------------------------------------- /Day_1_Scientific_Python/images/broadcasting.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/paris-saclay-cds/data-science-workshop-2019/HEAD/Day_1_Scientific_Python/images/broadcasting.png -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_06_groupby_operations14.py: -------------------------------------------------------------------------------- 1 | hamlets = titles[titles['title'].str.startswith('Hamlet')] 2 | hamlets['title'].value_counts() -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_07_reshaping_data1.py: -------------------------------------------------------------------------------- 1 | df.pivot_table(index='Pclass', columns='Sex', 2 | values='Survived', aggfunc='mean') -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_07_reshaping_data4.py: -------------------------------------------------------------------------------- 1 | df.pivot_table(index='Underaged', columns='Sex', 2 | values='Fare', aggfunc='mean') -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_07_reshaping_data6.py: -------------------------------------------------------------------------------- 1 | grouped = cast.groupby(['year', 'type']).size() 2 | table = grouped.unstack('type') 3 | table.plot() -------------------------------------------------------------------------------- /Day_1_Scientific_Python/img/splitApplyCombine.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/paris-saclay-cds/data-science-workshop-2019/HEAD/Day_1_Scientific_Python/img/splitApplyCombine.png -------------------------------------------------------------------------------- /Day_1_Scientific_Python/img/logoUPSayPlusCDS_990.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/paris-saclay-cds/data-science-workshop-2019/HEAD/Day_1_Scientific_Python/img/logoUPSayPlusCDS_990.png -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_04_time_series_data9.py: -------------------------------------------------------------------------------- 1 | daily.resample('M').agg(['min', 'max']).plot() # monthly minimum and maximum values of these daily averages -------------------------------------------------------------------------------- /Day_1_Scientific_Python/images/kmeans_illustration.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/paris-saclay-cds/data-science-workshop-2019/HEAD/Day_1_Scientific_Python/images/kmeans_illustration.png -------------------------------------------------------------------------------- /Day_1_Scientific_Python/images/tidyr-spread-gather.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/paris-saclay-cds/data-science-workshop-2019/HEAD/Day_1_Scientific_Python/images/tidyr-spread-gather.gif -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_06_groupby_operations23.py: -------------------------------------------------------------------------------- 1 | oz = cast[cast['name'] == 'Frank Oz'] 2 | oz_roles = oz.groupby(['year', 'title']).size() 3 | oz_roles[oz_roles > 1] -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_06_groupby_operations26.py: -------------------------------------------------------------------------------- 1 | leading = cast[cast['n'] == 1] 2 | sums_decade = leading.groupby([cast['year'] // 10 * 10, 'type']).size() 3 | sums_decade -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_06_groupby_operations30.py: -------------------------------------------------------------------------------- 1 | cast1950 = cast[cast['year'] // 10 == 195] 2 | cast1950 = cast1950[cast1950['n'] == 1] 3 | cast1950['type'].value_counts() -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_06_groupby_operations31.py: -------------------------------------------------------------------------------- 1 | cast2000 = cast[cast['year'] // 10 == 200] 2 | cast2000 = cast2000[cast2000['n'] == 1] 3 | cast2000['type'].value_counts() -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | scipy 3 | joblib 4 | scikit-learn 5 | pandas 6 | pandas-profiling 7 | ipython 8 | jupyter 9 | pillow 10 | matplotlib 11 | mplleaflet 12 | seaborn 13 | plotly 14 | -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_06_groupby_operations24.py: 
--------------------------------------------------------------------------------
1 | oz = cast[cast['name'] == 'Frank Oz']
2 | oz_roles = oz.groupby(['character']).size()
3 | oz_roles[oz_roles > 1].sort_values()
--------------------------------------------------------------------------------
/Day_1_Scientific_Python/pandas/_solutions/pandas_06_groupby_operations11.py:
--------------------------------------------------------------------------------
1 | cast1990 = cast[cast['year'] >= 1990]
2 | cast1990 = cast1990[cast1990['n'] == 1]
3 | cast1990.groupby('name').size().nlargest(10)
--------------------------------------------------------------------------------
/Day_1_Scientific_Python/pandas/_solutions/pandas_06_groupby_operations17.py:
--------------------------------------------------------------------------------
1 | cast1950 = cast[cast['year'] // 10 == 195]
2 | cast1950 = cast1950[cast1950['n'] == 1]
3 | cast1950.groupby(['year', 'type']).size()
--------------------------------------------------------------------------------
/Day_1_Scientific_Python/pandas/_solutions/pandas_06_groupby_operations5.py:
--------------------------------------------------------------------------------
1 | df.groupby('Pclass')['Survived'].mean().plot(kind='bar', color="C0")  # and what if you compared the total number of survivors?
--------------------------------------------------------------------------------
/Day_1_Scientific_Python/pandas/_solutions/pandas_03a_selecting_data20.py:
--------------------------------------------------------------------------------
1 | titanic = cast[(cast['title'] == 'Titanic') & (cast['year'] == 1997)]
2 | titanic = titanic[titanic['n'].notnull()]
3 | titanic.sort_values('n')
--------------------------------------------------------------------------------
/Day_1_Scientific_Python/pandas/_solutions/pandas_03a_selecting_data21.py:
--------------------------------------------------------------------------------
1 | brad = cast[cast['name'] == 'Brad Pitt']
2 | brad = brad[brad['year'] // 10 == 199]
3 | brad = brad[brad['n'] == 2]
4 | brad.sort_values('year')
--------------------------------------------------------------------------------
/Day_1_Scientific_Python/pandas/_solutions/pandas_07_reshaping_data10.py:
--------------------------------------------------------------------------------
1 | grouped = cast.groupby(['year', 'type']).size()
2 | table = grouped.unstack('type')
3 | (table['actor'] / (table['actor'] + table['actress'])).plot(ylim=[0,1])
--------------------------------------------------------------------------------
/Day_2_Machine_Learning_Python/figures/simple-decision-tree-adult-census.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/paris-saclay-cds/data-science-workshop-2019/HEAD/Day_2_Machine_Learning_Python/figures/simple-decision-tree-adult-census.png
--------------------------------------------------------------------------------
/Day_1_Scientific_Python/pandas/_solutions/pandas_06_groupby_operations9.py:
--------------------------------------------------------------------------------
1 | titles['decade'] = titles['year'] // 10 * 10
2 | hamlet = titles[titles['title'] == 'Hamlet']
3 | hamlet.groupby('decade').size().plot(kind='bar', color="orange")
--------------------------------------------------------------------------------
/Day_1_Scientific_Python/pandas/_solutions/pandas_06_groupby_operations25.py:
--------------------------------------------------------------------------------
1 | cast['n_total'] = cast.groupby('title')['n'].transform('max')  # transform will return an element for each row, so the max value is given to the whole group
2 | cast.head()
--------------------------------------------------------------------------------
/Day_1_Scientific_Python/pandas/_solutions/pandas_06_groupby_operations27.py:
--------------------------------------------------------------------------------
1 | #sums_decade.groupby(level='year').transform(lambda x: x / x.sum())
2 | ratios_decade = sums_decade / sums_decade.groupby(level='year').transform('sum')
3 | ratios_decade
--------------------------------------------------------------------------------
/Day_1_Scientific_Python/pandas/_solutions/pandas_07_reshaping_data11.py:
--------------------------------------------------------------------------------
1 | c = cast
2 | c = c[(c.character == 'Superman') | (c.character == 'Batman')]
3 | c = c.groupby(['year', 'character']).size()
4 | c = c.unstack()
5 | c = c.fillna(0)
6 | c.head()
--------------------------------------------------------------------------------
/Day_1_Scientific_Python/pandas/_solutions/pandas_06_groupby_operations10.py:
--------------------------------------------------------------------------------
1 | titles['decade'] = titles['year'] // 10 * 10
2 | hamlet = titles[titles['title'].str.contains('Hamlet')]
3 | hamlet.groupby('decade').size().plot(kind='bar', color="lightblue")
--------------------------------------------------------------------------------
/Day_1_Scientific_Python/pandas/_solutions/pandas_04_time_series_data6.py:
--------------------------------------------------------------------------------
1 | subset = data['2011':'2012']['L06_347']
2 | fig, ax = plt.subplots()
3 | subset.resample('M').mean().plot(ax=ax)
4 | subset.resample('M').median().plot(ax=ax)
5 | ax.legend(["mean", "median"])
--------------------------------------------------------------------------------
/Day_1_Scientific_Python/pandas/_solutions/pandas_07_reshaping_data7.py:
--------------------------------------------------------------------------------
1 | cast.pivot_table(index='year', columns='type', values="character", aggfunc='count').plot()
2 | # for the 'values' argument, take a column with no NaN values so that all rows are effectively counted; at this stage comes the aha moment about the crosstab function(!)
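3 | # equivalent, and with no NaN caveat: pd.crosstab(cast['year'], cast['type']).plot()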
-------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: python-workshop 2 | 3 | dependencies: 4 | - python=3.7 5 | - numpy 6 | - scipy 7 | - joblib 8 | - scikit-learn 9 | - pandas 10 | - conda-forge::pandas-profiling 11 | - ipython 12 | - jupyter 13 | - pillow 14 | - matplotlib 15 | - mplleaflet 16 | - seaborn 17 | - plotly 18 | -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_07_reshaping_data2.py: -------------------------------------------------------------------------------- 1 | fig, ax1 = plt.subplots() 2 | df.pivot_table(index='Pclass', columns='Sex', 3 | values='Survived', aggfunc='mean').plot(kind='bar', 4 | rot=0, 5 | ax=ax1) 6 | ax1.set_ylabel('Survival ratio') -------------------------------------------------------------------------------- /Day_1_Scientific_Python/README.md: -------------------------------------------------------------------------------- 1 | # Day 1 - Scientific programming with Python 2 | 3 | Goal: introducing the most important packages for scientific computing and data analysis in Python. 4 | 5 | Overview: 6 | 7 | 1. Introduction to numpy: [01-numpy-introduction.ipynb](01-numpy-introduction.ipynb) 8 | 9 | 2. Introduction to pandas: [02-pandas_introduction.ipynb](02-pandas_introduction.ipynb) 10 | 11 | 3. Short overview of matplotlib and seaborn: [03-matplotib_seaborn.ipynb](03-matplotib_seaborn.ipynb) 12 | 13 | -------------------------------------------------------------------------------- /figures/style_figs.py: -------------------------------------------------------------------------------- 1 | """ 2 | Simple styling used for matplotlib figures 3 | """ 4 | 5 | from matplotlib import pyplot as plt 6 | 7 | # Configuration settings to help visibility on small screen / prints 8 | plt.rcParams['xtick.labelsize'] = 20 9 | plt.rcParams['ytick.labelsize'] = 20 10 | plt.rcParams['figure.titlesize'] = 15 11 | plt.rcParams['font.size'] = 20 12 | plt.rcParams['axes.labelsize'] = 20 13 | plt.rcParams['axes.facecolor'] = 'none' 14 | plt.rcParams['legend.fontsize'] = 18 15 | plt.rcParams['lines.linewidth'] = 3 16 | plt.rcParams['figure.figsize'] = [.8 * 6.4, .8 * 4.8] 17 | plt.rcParams['legend.frameon'] = False 18 | plt.rcParams['legend.columnspacing'] = 1.8 19 | plt.rcParams['legend.handlelength'] = 1.5 20 | plt.rcParams['legend.handletextpad'] = 0.5 21 | 22 | # Utility functions 23 | def light_axis(): 24 | "Hide the top and right spines" 25 | ax = plt.gca() 26 | for s in ('top', 'right'): 27 | ax.spines[s].set_visible(False) 28 | plt.xticks(()) 29 | plt.yticks(()) 30 | plt.subplots_adjust(left=.01, bottom=.01, top=.99, right=.99) 31 | 32 | def no_axis(): 33 | plt.axis('off') 34 | plt.subplots_adjust(left=.0, bottom=.0, top=1, right=1) 35 | -------------------------------------------------------------------------------- /figures/plot_iris_visualization.py: -------------------------------------------------------------------------------- 1 | """ 2 | Some simple visualizations on the iris data. 
3 | """ 4 | 5 | import numpy as np 6 | from sklearn import datasets 7 | from matplotlib import pyplot as plt 8 | import style_figs 9 | 10 | iris = datasets.load_iris() 11 | 12 | # Plot the histograms of each class for each feature 13 | 14 | 15 | X = iris.data 16 | y = iris.target 17 | for x, feature_name in zip(X.T, iris.feature_names): 18 | plt.figure(figsize=(2.5, 2)) 19 | patches = list() 20 | for this_y, target_name in enumerate(iris.target_names): 21 | patch = plt.hist(x[y == this_y], 22 | bins=np.linspace(x.min(), x.max(), 20), 23 | label=target_name) 24 | patches.append(patch[-1][0]) 25 | style_figs.light_axis() 26 | feature_name = feature_name.replace(' ', '_') 27 | feature_name = feature_name.replace('(', '') 28 | feature_name = feature_name.replace(')', '') 29 | plt.savefig('iris_{}_hist.svg'.format(feature_name)) 30 | 31 | plt.figure(figsize=(6, .25)) 32 | plt.legend(patches, iris.target_names, ncol=3, loc=(0, -.37), 33 | borderaxespad=0) 34 | style_figs.no_axis() 35 | plt.savefig('legend_irises.svg') 36 | 37 | 38 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2017, Paris-Saclay Center for Data Science 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | * Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
30 | -------------------------------------------------------------------------------- /Day_1_Scientific_Python/data/kmeans_data.csv: -------------------------------------------------------------------------------- 1 | 2.766997449176169521e+00 3.006059008441898772e+00 2 | 2.088094777054783080e+00 2.903810986407362904e+00 3 | 2.822686541285641670e+00 3.149696392939078660e+00 4 | 2.960355618458414551e+00 1.927877097616605750e+00 5 | 2.659122371155123066e+00 3.006991014034997356e+00 6 | 2.032954596475295084e+00 3.081060886926234588e+00 7 | 2.854008323349864984e+00 1.999801525369903965e+00 8 | 1.859387078586314690e+00 2.932274234602143093e+00 9 | 2.979922522654017136e+00 1.866821022670417829e+00 10 | 2.995763163325959599e+00 1.985595460871006912e+00 11 | 2.010735721721112146e+00 2.908542147932652089e+00 12 | 2.843601373873463789e+00 1.955819928799183538e+00 13 | 3.014684940949362346e+00 2.071817254798781871e+00 14 | 2.955848758498268669e+00 2.002549532855308456e+00 15 | 2.953853347117966432e+00 2.014228889781465970e+00 16 | 2.723478874518370674e+00 2.909772625778544786e+00 17 | 1.822563743484528320e+00 2.925210296677877242e+00 18 | 2.914468938893320260e+00 3.054336780969086451e+00 19 | 2.746014094374167325e+00 3.104808300944346566e+00 20 | 1.898162333714168204e+00 2.857695041511004952e+00 21 | 2.706409191152936433e+00 2.957686421588001213e+00 22 | 2.858956467111453126e+00 3.033124170697728328e+00 23 | 2.939364448465704438e+00 1.980821208728706928e+00 24 | 2.717811191556968708e+00 3.082535431495139644e+00 25 | 2.919293342210746545e+00 3.080578453448903353e+00 26 | 1.981070710113886468e+00 3.050562435643691561e+00 27 | 2.891639482190973887e+00 2.023183666995666652e+00 28 | 1.929576687174035410e+00 2.985977074479872595e+00 29 | 2.140504867514827492e+00 2.921581307550503936e+00 30 | 1.975460027120392370e+00 3.035061741609020647e+00 31 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest
32 | *.spec
33 | 
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 | 
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 | 
49 | # Translations
50 | *.mo
51 | *.pot
52 | 
53 | # Django stuff:
54 | *.log
55 | local_settings.py
56 | 
57 | # Flask stuff:
58 | instance/
59 | .webassets-cache
60 | 
61 | # Scrapy stuff:
62 | .scrapy
63 | 
64 | # Sphinx documentation
65 | docs/_build/
66 | 
67 | # PyBuilder
68 | target/
69 | 
70 | # Jupyter Notebook
71 | .ipynb_checkpoints
72 | 
73 | # pyenv
74 | .python-version
75 | 
76 | # celery beat schedule file
77 | celerybeat-schedule
78 | 
79 | # SageMath parsed files
80 | *.sage.py
81 | 
82 | # dotenv
83 | .env
84 | 
85 | # virtualenv
86 | .venv
87 | venv/
88 | ENV/
89 | 
90 | # Spyder project settings
91 | .spyderproject
92 | .spyproject
93 | 
94 | # Rope project settings
95 | .ropeproject
96 | 
97 | # mkdocs documentation
98 | /site
99 | 
100 | # mypy
101 | .mypy_cache/
--------------------------------------------------------------------------------
/Day_1_Scientific_Python/numpy_with_answers/numpys/broadcasting.ipynb:
--------------------------------------------------------------------------------
1 | {
2 |  "cells": [
3 |   {
4 |    "cell_type": "markdown",
5 |    "metadata": {},
6 |    "source": [
7 |     "## Broadcasting"
8 |    ]
9 |   },
10 |   {
11 |    "cell_type": "markdown",
12 |    "metadata": {},
13 |    "source": [
14 |     "It’s possible to do operations on arrays of different sizes. In some cases NumPy can transform these arrays automatically so that they all have the same size: this conversion is called broadcasting."
15 |    ]
16 |   },
17 |   {
18 |    "cell_type": "markdown",
19 |    "metadata": {},
20 |    "source": [
21 |     "\"drawing\""
22 |    ]
23 |   },
24 |   {
25 |    "cell_type": "markdown",
26 |    "metadata": {},
27 |    "source": [
28 |     "You can find the full tutorial on broadcasting: [Broadcasting](https://paris-swc.github.io/advanced-numpy-lesson/03-broadcasting.html)
\n", 29 | "And explanation on what it is: [Broadcasting](https://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)\n", 30 | "\n" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": {}, 36 | "source": [ 37 | "[Previous: Savez() and load()](savez.ipynb)
[Next: Test yourself](test_yourself.ipynb)"
38 |    ]
39 |   }
40 |  ],
41 |  "metadata": {
42 |   "kernelspec": {
43 |    "display_name": "Python 3",
44 |    "language": "python",
45 |    "name": "python3"
46 |   },
47 |   "language_info": {
48 |    "codemirror_mode": {
49 |     "name": "ipython",
50 |     "version": 3
51 |    },
52 |    "file_extension": ".py",
53 |    "mimetype": "text/x-python",
54 |    "name": "python",
55 |    "nbconvert_exporter": "python",
56 |    "pygments_lexer": "ipython3",
57 |    "version": "3.7.2"
58 |   }
59 |  },
60 |  "nbformat": 4,
61 |  "nbformat_minor": 2
62 | }
--------------------------------------------------------------------------------
/Day_1_Scientific_Python/numpys/broadcasting.ipynb:
--------------------------------------------------------------------------------
1 | {
2 |  "cells": [
3 |   {
4 |    "cell_type": "markdown",
5 |    "metadata": {},
6 |    "source": [
7 |     "## Broadcasting"
8 |    ]
9 |   },
10 |   {
11 |    "cell_type": "markdown",
12 |    "metadata": {},
13 |    "source": [
14 |     "It’s possible to do operations on arrays of different sizes. In some cases NumPy can transform these arrays automatically so that they all have the same size: this conversion is called broadcasting."
15 |    ]
16 |   },
17 |   {
18 |    "cell_type": "markdown",
19 |    "metadata": {},
20 |    "source": [
21 |     "\"drawing\""
22 |    ]
23 |   },
24 |   {
25 |    "cell_type": "markdown",
26 |    "metadata": {},
27 |    "source": [
28 |     "You can find the full tutorial on broadcasting: [Broadcasting](https://paris-swc.github.io/advanced-numpy-lesson/03-broadcasting.html)
\n", 29 | "And explanation on what it is: [Broadcasting](https://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)\n", 30 | "\n" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": {}, 36 | "source": [ 37 | "[Previous: Savez() and load()](savez.ipynb)
[Next: Test yourself](test_yourself.ipynb)" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": null, 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [] 46 | } 47 | ], 48 | "metadata": { 49 | "kernelspec": { 50 | "display_name": "Python 3", 51 | "language": "python", 52 | "name": "python3" 53 | }, 54 | "language_info": { 55 | "codemirror_mode": { 56 | "name": "ipython", 57 | "version": 3 58 | }, 59 | "file_extension": ".py", 60 | "mimetype": "text/x-python", 61 | "name": "python", 62 | "nbconvert_exporter": "python", 63 | "pygments_lexer": "ipython3", 64 | "version": "3.7.2" 65 | } 66 | }, 67 | "nbformat": 4, 68 | "nbformat_minor": 2 69 | } 70 | -------------------------------------------------------------------------------- /Day_1_Scientific_Python/data/brain_size.csv: -------------------------------------------------------------------------------- 1 | "";"Gender";"FSIQ";"VIQ";"PIQ";"Weight";"Height";"MRI_Count" 2 | "1";"Female";133;132;124;"118";"64.5";816932 3 | "2";"Male";140;150;124;".";"72.5";1001121 4 | "3";"Male";139;123;150;"143";"73.3";1038437 5 | "4";"Male";133;129;128;"172";"68.8";965353 6 | "5";"Female";137;132;134;"147";"65.0";951545 7 | "6";"Female";99;90;110;"146";"69.0";928799 8 | "7";"Female";138;136;131;"138";"64.5";991305 9 | "8";"Female";92;90;98;"175";"66.0";854258 10 | "9";"Male";89;93;84;"134";"66.3";904858 11 | "10";"Male";133;114;147;"172";"68.8";955466 12 | "11";"Female";132;129;124;"118";"64.5";833868 13 | "12";"Male";141;150;128;"151";"70.0";1079549 14 | "13";"Male";135;129;124;"155";"69.0";924059 15 | "14";"Female";140;120;147;"155";"70.5";856472 16 | "15";"Female";96;100;90;"146";"66.0";878897 17 | "16";"Female";83;71;96;"135";"68.0";865363 18 | "17";"Female";132;132;120;"127";"68.5";852244 19 | "18";"Male";100;96;102;"178";"73.5";945088 20 | "19";"Female";101;112;84;"136";"66.3";808020 21 | "20";"Male";80;77;86;"180";"70.0";889083 22 | "21";"Male";83;83;86;".";".";892420 23 | "22";"Male";97;107;84;"186";"76.5";905940 24 | "23";"Female";135;129;134;"122";"62.0";790619 25 | "24";"Male";139;145;128;"132";"68.0";955003 26 | "25";"Female";91;86;102;"114";"63.0";831772 27 | "26";"Male";141;145;131;"171";"72.0";935494 28 | "27";"Female";85;90;84;"140";"68.0";798612 29 | "28";"Male";103;96;110;"187";"77.0";1062462 30 | "29";"Female";77;83;72;"106";"63.0";793549 31 | "30";"Female";130;126;124;"159";"66.5";866662 32 | "31";"Female";133;126;132;"127";"62.5";857782 33 | "32";"Male";144;145;137;"191";"67.0";949589 34 | "33";"Male";103;96;110;"192";"75.5";997925 35 | "34";"Male";90;96;86;"181";"69.0";879987 36 | "35";"Female";83;90;81;"143";"66.5";834344 37 | "36";"Female";133;129;128;"153";"66.5";948066 38 | "37";"Male";140;150;124;"144";"70.5";949395 39 | "38";"Female";88;86;94;"139";"64.5";893983 40 | "39";"Male";81;90;74;"148";"74.0";930016 41 | "40";"Male";89;91;89;"179";"75.5";935863 42 | -------------------------------------------------------------------------------- /Day_2_Machine_Learning_Python/02_basic_preprocessing_exercise_01.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Exercise 01\n", 8 | "\n", 9 | "The goal of is to compare the performance of our classifier (81% accuracy) to some baseline classifiers that would ignore the input data and instead make constant predictions.\n", 10 | "\n", 11 | "The online [documentation for 
DummyClassifier](https://scikit-learn.org/stable/modules/model_evaluation.html#dummy-estimators) gives instructions on how to use it." 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "import pandas as pd\n", 21 | "\n", 22 | "df = pd.read_csv(\n", 23 | "    \"https://www.openml.org/data/get_csv/1595261/adult-census.csv\")" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": null, 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "target_name = \"class\"\n", 33 | "target = df[target_name].to_numpy()\n", 34 | "data = df.drop(columns=[target_name, \"fnlwgt\"])\n", 35 | "numerical_columns = [\n", 36 | "    c for c in data.columns if data[c].dtype.kind in [\"i\", \"f\"]]\n", 37 | "data_numeric = data[numerical_columns]" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": null, 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "from sklearn.model_selection import cross_val_score\n", 47 | "from sklearn.dummy import DummyClassifier\n", 48 | "\n", 49 | "# TODO: write me!" 50 | ] 51 | } 52 | ], 53 | "metadata": { 54 | "jupytext": { 55 | "formats": "python_scripts//py:percent,notebooks//ipynb" 56 | }, 57 | "kernelspec": { 58 | "display_name": "Python 3", 59 | "language": "python", 60 | "name": "python3" 61 | } 62 | }, 63 | "nbformat": 4, 64 | "nbformat_minor": 2 65 | } 66 | -------------------------------------------------------------------------------- /check_env.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from distutils.version import LooseVersion as Version 3 | import sys 4 | 5 | OK = '\x1b[42m[ OK ]\x1b[0m' 6 | FAIL = "\x1b[41m[FAIL]\x1b[0m" 7 | 8 | try: 9 | import importlib 10 | except ImportError: 11 | print(FAIL, "Python version 3.6 or above is required," 12 | " but %s is installed." % sys.version) 13 | 14 | 15 | def import_version(pkg, min_ver, fail_msg=""): 16 | mod = None 17 | try: 18 | mod = importlib.import_module(pkg) 19 | if pkg in {'PIL'}: 20 | try: 21 | ver = mod.VERSION 22 | except AttributeError: 23 | try: 24 | ver = mod.PILLOW_VERSION 25 | except: 26 | raise 27 | else: 28 | ver = mod.__version__ 29 | if Version(ver) < min_ver: 30 | print(FAIL, "%s version %s or higher required, but %s installed." 31 | % (pkg, min_ver, ver))  # use the local 'pkg'; 'lib' was a leaked loop variable 32 | else: 33 | print(OK, '%s version %s' % (pkg, ver)) 34 | except ImportError: 35 | print(FAIL, '%s not installed. %s' % (pkg, fail_msg)) 36 | return mod 37 | 38 | 39 | # first check the python version 40 | print('Using python in', sys.prefix) 41 | print(sys.version) 42 | pyversion = Version(sys.version) 43 | if pyversion >= "3": 44 | if pyversion < "3.6": 45 | print(FAIL, "Python version 3.6 or above is required," 46 | " but %s is installed." % sys.version) 47 | elif pyversion >= "2": 48 | print(FAIL, "Python version 3.6 or above is required," 49 | " but %s is installed." 
% sys.version) 50 | else: 51 | print(FAIL, "Unknown Python version: %s" % sys.version) 52 | 53 | print() 54 | requirements = {'numpy': "1.16", 'scipy': "1.2", 'matplotlib': "3.0", 55 | 'IPython': "3.0", 'sklearn': "0.21", 'pandas': "0.24", 56 | 'PIL': "1.1.7", 'notebook': "5.7", 'plotly': "4.3", 57 | 'pandas_profiling': "2.3"} 58 | 59 | # now the dependencies 60 | for lib, required_version in list(requirements.items()): 61 | import_version(lib, required_version) 62 | -------------------------------------------------------------------------------- /Day_1_Scientific_Python/numpys/stacking.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Stacking\n", 8 | "\n", 9 | "Arrays can be concatenated and stacked on top of one another, using NumPy’s vstack and hstack functions for vertical and horizontal stacking, respectively." 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import numpy as np" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "A = np.array([[1,2,3], [4,5,6], [7, 8, 9]])\n", 28 | "print('A = ')\n", 29 | "print(A)\n", 30 | "\n", 31 | "B = np.hstack([A, A])\n", 32 | "print('B = ')\n", 33 | "print(B)\n", 34 | "\n", 35 | "C = np.vstack([A, A])\n", 36 | "print('C = ')\n", 37 | "print(C)" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": {}, 43 | "source": [ 44 | "## $\color{green}{\text{Exercise}}$ stacking\n", 45 | "Write some additional code that slices the first and last columns of A, and stacks them into a 3x2 array. Make sure to print the results to verify your solution.
\n", 46 | "__Tip__: A ‘gotcha’ with array indexing is that singleton dimensions are dropped by default. That means A[:, 0] is a one dimensional array, which won’t stack as desired. To preserve singleton dimensions, the index itself can be a slice or array. For example, A[:, :1] returns a two dimensional array with one singleton dimension (i.e. a column vector)." 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "metadata": {}, 59 | "source": [ 60 | "[Previous: Operations](operations.ipynb)
[Next: K-means clustering](k_means.ipynb)" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [] 69 | } 70 | ], 71 | "metadata": { 72 | "kernelspec": { 73 | "display_name": "Python 3", 74 | "language": "python", 75 | "name": "python3" 76 | }, 77 | "language_info": { 78 | "codemirror_mode": { 79 | "name": "ipython", 80 | "version": 3 81 | }, 82 | "file_extension": ".py", 83 | "mimetype": "text/x-python", 84 | "name": "python", 85 | "nbconvert_exporter": "python", 86 | "pygments_lexer": "ipython3", 87 | "version": "3.7.2" 88 | } 89 | }, 90 | "nbformat": 4, 91 | "nbformat_minor": 2 92 | } 93 | -------------------------------------------------------------------------------- /Day_1_Scientific_Python/numpy_with_answers/numpys/stacking.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Stacking\n", 8 | "\n", 9 | "Arrays can be concatenated and stacked on top of one another, using NumPy’s vstack and hstack functions for vertical and horizontal stacking, respectively." 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import numpy as np" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "A = np.array([[1,2,3], [4,5,6], [7, 8, 9]])\n", 28 | "print('A = ')\n", 29 | "print(A)\n", 30 | "\n", 31 | "B = np.hstack([A, A])\n", 32 | "print('B = ')\n", 33 | "print(B)\n", 34 | "\n", 35 | "C = np.vstack([A, A])\n", 36 | "print('C = ')\n", 37 | "print(C)" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": {}, 43 | "source": [ 44 | "## $\\color{green}{\\text{Excercise}}$ stacking\n", 45 | "Write some additional code that slices the first and last columns of A, and stacks them into a 3x2 array. Make sure to print the results to verify your solution.
\n", 46 | "__Tip__: A ‘gotcha’ with array indexing is that singleton dimensions are dropped by default. That means A[:, 0] is a one dimensional array, which won’t stack as desired. To preserve singleton dimensions, the index itself can be a slice or array. For example, A[:, :1] returns a two dimensional array with one singleton dimension (i.e. a column vector)." 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "D = np.hstack((A[:, :1], A[:, -1:]))\n", 56 | "print('D = ')\n", 57 | "print(D)\n", 58 | "print(D.shape)" 59 | ] 60 | }, 61 | { 62 | "cell_type": "markdown", 63 | "metadata": {}, 64 | "source": [ 65 | "[Previous: Operations](operations.ipynb)
[Next: K-means clustering](k_means.ipynb)" 66 | ] 67 | } 68 | ], 69 | "metadata": { 70 | "kernelspec": { 71 | "display_name": "Python 3", 72 | "language": "python", 73 | "name": "python3" 74 | }, 75 | "language_info": { 76 | "codemirror_mode": { 77 | "name": "ipython", 78 | "version": 3 79 | }, 80 | "file_extension": ".py", 81 | "mimetype": "text/x-python", 82 | "name": "python", 83 | "nbconvert_exporter": "python", 84 | "pygments_lexer": "ipython3", 85 | "version": "3.7.2" 86 | } 87 | }, 88 | "nbformat": 4, 89 | "nbformat_minor": 2 90 | } 91 | -------------------------------------------------------------------------------- /Day_2_Machine_Learning_Python/03_basic_preprocessing_categorical_variables_exercise_01.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Exercise 02\n", 8 | "\n", 9 | "The goal of this exercise is to evalutate the impact of using an arbitrary\n", 10 | "integer encoding for categorical variables along with a linear\n", 11 | "classification model such as Logistic Regression.\n", 12 | "\n", 13 | "To do so, let's try to use `OrdinalEncoder` to preprocess the categorical\n", 14 | "variables. This preprocessor is assembled in a pipeline with\n", 15 | "`LogisticRegression`. The performance of the pipeline can be evaluated as\n", 16 | "usual by cross-validation and then compared to the score obtained when using\n", 17 | "`OneHotEncoding` or to some other baseline score.\n", 18 | "\n", 19 | "Because `OrdinalEncoder` can raise errors if it sees an unknown category at\n", 20 | "prediction time, we need to pre-compute the list of all possible categories\n", 21 | "ahead of time:\n", 22 | "\n", 23 | "```python\n", 24 | "categories = [data[column].unique()\n", 25 | " for column in data[categorical_columns]]\n", 26 | "OrdinalEncoder(categories=categories)\n", 27 | "```" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "import pandas as pd\n", 37 | "\n", 38 | "df = pd.read_csv(\n", 39 | " \"https://www.openml.org/data/get_csv/1595261/adult-census.csv\")\n", 40 | "\n", 41 | "# Or use the local copy:\n", 42 | "# df = pd.read_csv('../datasets/adult-census.csv')" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "target_name = \"class\"\n", 52 | "target = df[target_name].to_numpy()\n", 53 | "data = df.drop(columns=[target_name, \"fnlwgt\"])\n", 54 | "categorical_columns = [\n", 55 | " c for c in data.columns if data[c].dtype.kind not in [\"i\", \"f\"]]\n", 56 | "data_categorical = data[categorical_columns]" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "from sklearn.model_selection import cross_val_score\n", 66 | "from sklearn.pipeline import make_pipeline\n", 67 | "from sklearn.preprocessing import OrdinalEncoder\n", 68 | "from sklearn.linear_model import LogisticRegression\n", 69 | "\n", 70 | "# TODO: write me!" 
71 | ] 72 | } 73 | ], 74 | "metadata": { 75 | "jupytext": { 76 | "formats": "python_scripts//py:percent,notebooks//ipynb" 77 | }, 78 | "kernelspec": { 79 | "display_name": "Python 3", 80 | "language": "python", 81 | "name": "python3" 82 | } 83 | }, 84 | "nbformat": 4, 85 | "nbformat_minor": 2 86 | } 87 | -------------------------------------------------------------------------------- /figures/plot_splines.py: -------------------------------------------------------------------------------- 1 | """ 2 | Simple example of overfit with splines 3 | """ 4 | import numpy as np 5 | from matplotlib import pyplot as plt 6 | import style_figs 7 | 8 | from sklearn import datasets, linear_model 9 | 10 | # Load the diabetes dataset 11 | diabetes = datasets.load_diabetes() 12 | 13 | 14 | # Use only one feature 15 | diabetes_X = diabetes.data[:, np.newaxis] 16 | diabetes_X_temp = diabetes_X[:, :, 2] 17 | 18 | # Split the data into training/testing sets 19 | diabetes_X_train = diabetes_X_temp[:-200:3] 20 | diabetes_X_test = diabetes_X_temp[-200:].T 21 | 22 | # Split the targets into training/testing sets 23 | diabetes_y_train = diabetes.target[:-200:3] 24 | diabetes_y_test = diabetes.target[-200:] 25 | 26 | # Sort the data and remove duplicates (for interpolation) 27 | order = np.argsort(diabetes_X_train.ravel()) 28 | X_train = diabetes_X_train.ravel()[order] 29 | y_train = diabetes_y_train[order] 30 | # Avoid duplicates 31 | y_train_ = list() 32 | for this_x in np.unique(X_train): 33 | y_train_.append(np.mean(y_train[X_train == this_x])) 34 | X_train = np.unique(X_train) 35 | 36 | y_train = np.array(y_train_) 37 | 38 | # Create linear regression object 39 | regr = linear_model.LinearRegression() 40 | 41 | # Train the model using the training sets 42 | regr.fit(X_train.reshape((-1, 1)), y_train) 43 | 44 | 45 | plt.figure(1, figsize=(.8*4, .8*3), facecolor='none') 46 | # Plot with test data 47 | plt.clf() 48 | ax = plt.axes([.1, .1, .9, .9]) 49 | 50 | plt.scatter(X_train, y_train, color='k', s=9) 51 | 52 | plt.plot([-.08, .12], regr.predict([[-.08, ], [.12, ]]), 53 | linewidth=3) 54 | 55 | plt.axis('tight') 56 | ymin, ymax = plt.ylim() 57 | style_figs.light_axis() 58 | plt.ylabel('y', size=16, weight=600) 59 | plt.xlabel('x', size=16, weight=600) 60 | 61 | plt.savefig('ols_simple.svg', facecolor='none', edgecolor='none') 62 | 63 | plt.scatter(diabetes_X_test, diabetes_y_test, color='C1', s=9) 64 | plt.ylim(ymin, ymax) 65 | plt.xlim(-.08, .12) 66 | 67 | plt.savefig('ols_test.svg', facecolor='none', edgecolor='none') 68 | 69 | 70 | # Plot cubic splines 71 | plt.clf() 72 | ax = plt.axes([.1, .1, .9, .9]) 73 | 74 | from scipy import interpolate 75 | f = interpolate.interp1d(X_train, y_train, 76 | kind="quadratic", 77 | bounds_error=False, fill_value="extrapolate") 78 | plt.scatter(X_train, y_train, color='k', s=9, zorder=20) 79 | x_spline = np.linspace(-.08, .12, 600) 80 | y_spline = f(x_spline) 81 | plt.plot(x_spline, y_spline, linewidth=3) 82 | 83 | plt.axis('tight') 84 | plt.xlim(-.08, .12) 85 | plt.ylim(ymin, ymax) 86 | 87 | style_figs.light_axis() 88 | 89 | plt.ylabel('y', size=16, weight=600) 90 | plt.xlabel('x', size=16, weight=600) 91 | 92 | 93 | plt.savefig('splines_cubic.svg', facecolor='none', edgecolor='none') 94 | 95 | 96 | plt.scatter(diabetes_X_test, diabetes_y_test, color='C1', s=9) 97 | plt.savefig('splines_test.svg', facecolor='none', edgecolor='none') 98 | 99 | plt.show() 100 | 101 | -------------------------------------------------------------------------------- /img/slides.css: 
-------------------------------------------------------------------------------- 1 | @import url(webfont-ubuntu-400-300-100.css); 2 | @import url(webfont-ubuntu-mono-400-700-400italic.css); 3 | 4 | body { 5 | font-family: 'Ubuntu'; 6 | font-weight: normal; 7 | } 8 | 9 | h1, h2, h3, h4, h5, h6 { 10 | font-family: 'Ubuntu'; 11 | font-weight: 300; 12 | margin-top: 0; 13 | } 14 | h1 { 15 | margin-top: 0.5em; 16 | } 17 | h2 { 18 | font-size: 140%; 19 | line-height: 150%; 20 | } 21 | h3 { 22 | font-size: 120%; 23 | line-height: 140%; 24 | } 25 | 26 | 27 | 28 | li { 29 | font-size: 120%; 30 | line-height: 160%; 31 | } 32 | 33 | p { 34 | font-size: 120%; 35 | line-height: 140%; 36 | } 37 | 38 | .singleimg .middlebelowheader { 39 | text-align: center; 40 | } 41 | 42 | .singleimg img { 43 | max-width: 90%; 44 | max-height: 600px; 45 | /*border: 2px solid #ddd;*/ 46 | } 47 | table { 48 | margin: 0 auto 0.8em; 49 | border-collapse: collapse; 50 | } 51 | td, th { 52 | border: 1px solid #ddd; 53 | padding: 0.3em 0.5em; 54 | } 55 | 56 | .bgheader h1 { 57 | background-color: rgba(0, 0, 0, 0.9); 58 | opacity: 50%; 59 | padding: 0.5em; 60 | color: white; 61 | border-radius: .5em; 62 | } 63 | .middlebelowheader { 64 | /* This fixed size height was found to work well with the slide 65 | scaling mechanism of remark.js: 66 | */ 67 | height: 500px; 68 | display: table-cell; 69 | vertical-align: middle; 70 | } 71 | .widespace h2 { 72 | line-height: 200%; 73 | } 74 | .big .remark-code { 75 | font-size: 200%; 76 | } 77 | .remark-code, .remark-inline-code { 78 | font-family: 'Ubuntu Mono'; 79 | } 80 | 81 | .medium .remark-code { 82 | font-size: 120%; 83 | } 84 | 85 | .mmedium .remark-code { 86 | font-size: 99%; 87 | } 88 | 89 | .affiliations img { 90 | /*height: 100px;*/ 91 | margin: 2em; 92 | margin-right: 0.5em; 93 | margin-left:0.5em; 94 | } 95 | 96 | .hidden { 97 | visibility: hidden; 98 | } 99 | 100 | .small { 101 | font-size: 90%; 102 | } 103 | 104 | .credits { 105 | font-style: italic; 106 | font-size: 70%; 107 | } 108 | 109 | .bunchoflogos img { 110 | max-height: 100px; 111 | padding: 1em; 112 | } 113 | 114 | .bunchoflogos p { 115 | text-align: center; 116 | width: 750px; 117 | } 118 | 119 | a:visited { 120 | color: blue; 121 | } 122 | 123 | .inverse a:visited { 124 | color: Maroon; 125 | } 126 | 127 | .inverse { 128 | background: #272822; 129 | color: #777872; 130 | text-shadow: 0 0 20px #333; 131 | } 132 | .inverse h1, .inverse h2 { 133 | color: #f3f3f3; 134 | } 135 | 136 | code { 137 | background: #e7e8e2; 138 | border-radius: 5px; 139 | } 140 | .pull-left { 141 | float: left; 142 | width: 47%; 143 | } 144 | .pull-right { 145 | float: right; 146 | width: 47%; 147 | } 148 | .pull-right ~ p { 149 | clear: both; 150 | } 151 | 152 | @page { 153 | size: 1024px 768px; 154 | margin: 0; 155 | } 156 | 157 | @media print { 158 | .remark-slide-scaler { 159 | width: 100% !important; 160 | height: 100% !important; 161 | transform: scale(1) !important; 162 | top: 0 !important; 163 | left: 0 !important; 164 | } 165 | } 166 | -------------------------------------------------------------------------------- /index.html: -------------------------------------------------------------------------------- [HTML markup lost during extraction; only the page title survives: "Python workshop - Paris-Saclay Center for Data Science"] -------------------------------------------------------------------------------- /Day_1_Scientific_Python/numpys/test_yourself.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## $\color{green}{\text{Exercise}}$ Rectification\n", 8 | "\n", 9 | "Rectify an array (replace negative elements with zeros) of random numbers from a normal distribution (generated with np.random.randn) using boolean indexing." 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "## $\color{green}{\text{Exercise}}$ Sub-arrays\n", 17 | "\n", 18 | "Let \n", 19 | "`x = np.array([1, 5, 10])`.\n", 20 | "\n", 21 | "Which of the following will show `[1, 10]`:\n", 22 | "\n", 23 | "a) x[::2]\n", 24 | "\n", 25 | "b) x[[1, 3]]\n", 26 | "\n", 27 | "c) x[[0, 2]]\n", 28 | "\n", 29 | "d) x[0, 2]\n", 30 | "\n", 31 | "e) x[[1, -1]]\n", 32 | "\n", 33 | "f) x[[False, True, False]]\n", 34 | "\n", 35 | "For each statement predict whether it returns a copy or a view." 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "## $\color{green}{\text{Exercise}}$ Random elements\n", 43 | "\n", 44 | "Using fancy indexing, randomly select 10 elements (with repetition) from a random array of 100 elements (Hint: you can use np.random.randint(max_int, size=n) to generate n random numbers from 0 inclusive to max_int exclusive)" 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "metadata": {}, 50 | "source": [ 51 | "## $\color{green}{\text{Exercise}}$ Drawing random integers without repetition\n", 52 | "\n", 53 | "Generate a random sequence of 10 integers from 1 to 100 without repetition (Hint: you may want to use np.random.rand and np.argsort)." 54 | ] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "metadata": {}, 59 | "source": [ 60 | "## $\color{green}{\text{Exercise}}$\n", 61 | "\n", 62 | "Generate a 10 x 3 array of random numbers (using np.random.rand). From each row, find the column index of the element closest to 0.75. Make use of np.abs and np.argmin. The result should be a one-dimensional array of integers from 0 to 2." 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": null, 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": {}, 75 | "source": [ 76 | "[Previous: Broadcasting](broadcasting.ipynb)
[Back to index](../01-numpy-introduction.ipynb)" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": null, 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [] 85 | } 86 | ], 87 | "metadata": { 88 | "kernelspec": { 89 | "display_name": "Python 3", 90 | "language": "python", 91 | "name": "python3" 92 | }, 93 | "language_info": { 94 | "codemirror_mode": { 95 | "name": "ipython", 96 | "version": 3 97 | }, 98 | "file_extension": ".py", 99 | "mimetype": "text/x-python", 100 | "name": "python", 101 | "nbconvert_exporter": "python", 102 | "pygments_lexer": "ipython3", 103 | "version": "3.7.2" 104 | } 105 | }, 106 | "nbformat": 4, 107 | "nbformat_minor": 2 108 | } 109 | -------------------------------------------------------------------------------- /Day_1_Scientific_Python/numpys/savez.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Saving and loading the data in .npz format\n", 8 | "The .npz file format is a zipped archive of files named after the variables they contain. The archive is not compressed and each file in the archive contains one variable in .npy format" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "metadata": {}, 14 | "source": [ 15 | "use __np.savez(filename, args)__ to save the data" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "import numpy as np\n", 25 | "import matplotlib.pylab as plt\n", 26 | "\n", 27 | "%matplotlib inline\n", 28 | "\n", 29 | "data = np.loadtxt(fname='../data/inflammation-01.csv', delimiter=',')" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": null, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "filename = 'datainfo.npz'\n", 39 | "np.savez(filename, data=data, mean_daily=np.mean(data,0))" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": {}, 45 | "source": [ 46 | "use __np.load()__ to load it :" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "patient = np.load(filename)" 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": {}, 61 | "source": [ 62 | "to check the keys in the loaded data" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": null, 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [ 71 | "list(patient)" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "patient['mean_daily']" 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "metadata": {}, 86 | "source": [ 87 | "We can plot this data using matplotlib.pyplot.plt:" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": null, 93 | "metadata": {}, 94 | "outputs": [], 95 | "source": [ 96 | "plt.plot(patient['mean_daily'])" 97 | ] 98 | }, 99 | { 100 | "cell_type": "markdown", 101 | "metadata": {}, 102 | "source": [ 103 | "[Previous: K-means clustering](k_means.ipynb)
[Next: Fancy indexing](fancy_indexing.ipynb)" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": null, 109 | "metadata": {}, 110 | "outputs": [], 111 | "source": [] 112 | } 113 | ], 114 | "metadata": { 115 | "kernelspec": { 116 | "display_name": "Python 3", 117 | "language": "python", 118 | "name": "python3" 119 | }, 120 | "language_info": { 121 | "codemirror_mode": { 122 | "name": "ipython", 123 | "version": 3 124 | }, 125 | "file_extension": ".py", 126 | "mimetype": "text/x-python", 127 | "name": "python", 128 | "nbconvert_exporter": "python", 129 | "pygments_lexer": "ipython3", 130 | "version": "3.7.2" 131 | } 132 | }, 133 | "nbformat": 4, 134 | "nbformat_minor": 2 135 | } 136 | -------------------------------------------------------------------------------- /Day_1_Scientific_Python/numpy_with_answers/numpys/savez.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Saving and loading the data in .npz format\n", 8 | "The .npz file format is a zipped archive of files named after the variables they contain. The archive is not compressed and each file in the archive contains one variable in .npy format" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "metadata": {}, 14 | "source": [ 15 | "use __np.savez(filename, args)__ to save the data" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "import numpy as np\n", 25 | "import matplotlib.pylab as plt\n", 26 | "\n", 27 | "%matplotlib inline\n", 28 | "\n", 29 | "data = np.loadtxt(fname='../data/inflammation-01.csv', delimiter=',')" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": null, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "filename = 'datainfo.npz'\n", 39 | "np.savez(filename, data=data, mean_daily=np.mean(data,0))" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": {}, 45 | "source": [ 46 | "use __np.load()__ to load it :" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "patient = np.load(filename)" 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": {}, 61 | "source": [ 62 | "to check the keys in the loaded data" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": null, 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [ 71 | "list(patient)" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "patient['mean_daily']" 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "metadata": {}, 86 | "source": [ 87 | "We can plot this data using matplotlib.pyplot.plt:" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": null, 93 | "metadata": {}, 94 | "outputs": [], 95 | "source": [ 96 | "plt.plot(patient['mean_daily'])" 97 | ] 98 | }, 99 | { 100 | "cell_type": "markdown", 101 | "metadata": {}, 102 | "source": [ 103 | "[Previous: K-means clustering](k_means.ipynb)
[Next: Fancy indexing](fancy_indexing.ipynb)" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": null, 109 | "metadata": {}, 110 | "outputs": [], 111 | "source": [] 112 | } 113 | ], 114 | "metadata": { 115 | "kernelspec": { 116 | "display_name": "Python 3", 117 | "language": "python", 118 | "name": "python3" 119 | }, 120 | "language_info": { 121 | "codemirror_mode": { 122 | "name": "ipython", 123 | "version": 3 124 | }, 125 | "file_extension": ".py", 126 | "mimetype": "text/x-python", 127 | "name": "python", 128 | "nbconvert_exporter": "python", 129 | "pygments_lexer": "ipython3", 130 | "version": "3.7.2" 131 | } 132 | }, 133 | "nbformat": 4, 134 | "nbformat_minor": 2 135 | } 136 | -------------------------------------------------------------------------------- /Day_1_Scientific_Python/numpy_with_answers/numpys/test_yourself.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## $\\color{green}{\\text{Excercise}}$ Rectification\n", 8 | "\n", 9 | "Rectify an array (replace negative elements with zeros) of random numbers from normal distribution (generated with np.random.randn) using boolean indexing." 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "## $\\color{green}{\\text{Excercise}}$ Sub-arrays\n", 17 | "\n", 18 | "Let \n", 19 | "`x = np.array([1, 5, 10])`.\n", 20 | "\n", 21 | "Which of the following will show `[1, 10]`:\n", 22 | "\n", 23 | "a) x[::2]\n", 24 | "\n", 25 | "b) x[[1, 3]]\n", 26 | "\n", 27 | "c) x[[0, 2]]\n", 28 | "\n", 29 | "d) x[0, 2]\n", 30 | "\n", 31 | "e) x[[1, -1]]\n", 32 | "\n", 33 | "f) x[[False, True, False]]\n", 34 | "\n", 35 | "For each statement predict whether it returns a copy or a view." 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "## $\\color{green}{\\text{Excercise}}$ Random elements\n", 43 | "\n", 44 | "Using fancy indexing select randomly with repetition 10 elements from a random array of 100 elements (Hint: you can use np.random.randint(max_int, size=n) to generate n random numbers from 0 to max_int)" 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "metadata": {}, 50 | "source": [ 51 | "## $\\color{green}{\\text{Excercise}}$ Drawing random integers without repetition\n", 52 | "\n", 53 | "Generate a random sequence of 10 integers from 1 to 100 without repetition (Hint: you may want to use np.random.rand and np.argsort)." 54 | ] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "metadata": {}, 59 | "source": [ 60 | "## $\\color{green}{\\text{Excercise}}$\n", 61 | "\n", 62 | "Generate a 10 x 3 array of random numbers (using np.random.rand). From each row, find the column index of the element closest to 0.75. Make use of np.abs and np.argmin. The result should be a one-dimensional array of integers from 0 to 2." 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": null, 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [ 71 | "import numpy as np\n", 72 | "\n", 73 | "rand_array = np.random.rand(10,3)\n", 74 | "rand_array2 = rand_array - 0.75\n", 75 | "closest = np.argmin(np.abs(rand_array2),1)\n", 76 | "print(rand_array)\n", 77 | "print(closest)" 78 | ] 79 | }, 80 | { 81 | "cell_type": "markdown", 82 | "metadata": {}, 83 | "source": [ 84 | "[Previous: Broadcasting](broadcasting.ipynb)
[Back to index](../01-numpy-introduction.ipynb)" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": null, 90 | "metadata": {}, 91 | "outputs": [], 92 | "source": [] 93 | } 94 | ], 95 | "metadata": { 96 | "kernelspec": { 97 | "display_name": "Python 3", 98 | "language": "python", 99 | "name": "python3" 100 | }, 101 | "language_info": { 102 | "codemirror_mode": { 103 | "name": "ipython", 104 | "version": 3 105 | }, 106 | "file_extension": ".py", 107 | "mimetype": "text/x-python", 108 | "name": "python", 109 | "nbconvert_exporter": "python", 110 | "pygments_lexer": "ipython3", 111 | "version": "3.7.2" 112 | } 113 | }, 114 | "nbformat": 4, 115 | "nbformat_minor": 2 116 | } 117 | -------------------------------------------------------------------------------- /Day_1_Scientific_Python/numpy_with_answers/numpys/fancy_indexing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Fancy indexing" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "Indexing can be done with a list or an array of integers. In this case the same index can be also repeated several times:" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "import numpy as np\n", 24 | "import matplotlib.pylab as plt" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": null, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "a = np.arange(0, 100, 10)\n", 34 | "a" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "a[[2, 3, 2, 4, 2]] " 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": {}, 49 | "source": [ 50 | "New values can be also assigned with this kind of indexing:" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": null, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "a[[9, 7]] = -100\n", 60 | "a" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": {}, 66 | "source": [ 67 | "When a new array is created by indexing with an array of integers, the new array has the same shape than the array of integers. " 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "a = np.arange(10)\n", 77 | "idx = np.array([[3, 4], [9, 7]])\n", 78 | "print('idx shape: {}'.format(idx.shape))\n", 79 | "a[idx]" 80 | ] 81 | }, 82 | { 83 | "cell_type": "markdown", 84 | "metadata": {}, 85 | "source": [ 86 | "Fancy indexing is often used to re-order or sort data. You can easily obtain the indices required to sort data using np.argsort:" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": null, 92 | "metadata": {}, 93 | "outputs": [], 94 | "source": [ 95 | "a = np.random.randint(10, size=5)\n", 96 | "a" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": null, 102 | "metadata": {}, 103 | "outputs": [], 104 | "source": [ 105 | "i = np.argsort(a)\n", 106 | "a[i]" 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "metadata": {}, 112 | "source": [ 113 | "Note that fancy indexing returns a copy and not a view." 114 | ] 115 | }, 116 | { 117 | "cell_type": "markdown", 118 | "metadata": {}, 119 | "source": [ 120 | "[Previous: Savez() and load()](savez.ipynb)
[Next: Broadcasting](broadcasting.ipynb)" 121 | ] 122 | } 123 | ], 124 | "metadata": { 125 | "kernelspec": { 126 | "display_name": "Python 3", 127 | "language": "python", 128 | "name": "python3" 129 | }, 130 | "language_info": { 131 | "codemirror_mode": { 132 | "name": "ipython", 133 | "version": 3 134 | }, 135 | "file_extension": ".py", 136 | "mimetype": "text/x-python", 137 | "name": "python", 138 | "nbconvert_exporter": "python", 139 | "pygments_lexer": "ipython3", 140 | "version": "3.7.2" 141 | } 142 | }, 143 | "nbformat": 4, 144 | "nbformat_minor": 2 145 | } 146 | -------------------------------------------------------------------------------- /Day_1_Scientific_Python/numpys/fancy_indexing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Fancy indexing" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "Indexing can be done with a list or an array of integers. In this case the same index can be also repeated several times:" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "import numpy as np\n", 24 | "import matplotlib.pylab as plt" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": null, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "a = np.arange(0, 100, 10)\n", 34 | "a" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "a[[2, 3, 2, 4, 2]] " 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": {}, 49 | "source": [ 50 | "New values can be also assigned with this kind of indexing:" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": null, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "a[[9, 7]] = -100\n", 60 | "a" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": {}, 66 | "source": [ 67 | "When a new array is created by indexing with an array of integers, the new array has the same shape than the array of integers. " 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "a = np.arange(10)\n", 77 | "idx = np.array([[3, 4], [9, 7]])\n", 78 | "print('idx shape: {}'.format(idx.shape))\n", 79 | "a[idx]" 80 | ] 81 | }, 82 | { 83 | "cell_type": "markdown", 84 | "metadata": {}, 85 | "source": [ 86 | "Fancy indexing is often used to re-order or sort data. You can easily obtain the indices required to sort data using np.argsort:" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": null, 92 | "metadata": {}, 93 | "outputs": [], 94 | "source": [ 95 | "a = np.random.randint(10, size=5)\n", 96 | "a" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": null, 102 | "metadata": {}, 103 | "outputs": [], 104 | "source": [ 105 | "i = np.argsort(a)\n", 106 | "a[i]" 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "metadata": {}, 112 | "source": [ 113 | "Note that fancy indexing returns a copy and not a view." 114 | ] 115 | }, 116 | { 117 | "cell_type": "markdown", 118 | "metadata": {}, 119 | "source": [ 120 | "[Previous: Savez() and load()](savez.ipynb)
[Next: Broadcasting](broadcasting.ipynb)" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": null, 126 | "metadata": {}, 127 | "outputs": [], 128 | "source": [] 129 | } 130 | ], 131 | "metadata": { 132 | "kernelspec": { 133 | "display_name": "Python 3", 134 | "language": "python", 135 | "name": "python3" 136 | }, 137 | "language_info": { 138 | "codemirror_mode": { 139 | "name": "ipython", 140 | "version": 3 141 | }, 142 | "file_extension": ".py", 143 | "mimetype": "text/x-python", 144 | "name": "python", 145 | "nbconvert_exporter": "python", 146 | "pygments_lexer": "ipython3", 147 | "version": "3.7.2" 148 | } 149 | }, 150 | "nbformat": 4, 151 | "nbformat_minor": 2 152 | } 153 | -------------------------------------------------------------------------------- /Day_2_Machine_Learning_Python/02_basic_preprocessing_exercise_01_solution.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Solution for Exercise 01\n", 8 | "\n", 9 | "The goal of is to compare the performance of our classifier to some baseline classifier that would ignore the input data and instead make constant predictions:" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import pandas as pd\n", 19 | "\n", 20 | "df = pd.read_csv(\n", 21 | " \"https://www.openml.org/data/get_csv/1595261/adult-census.csv\")" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": null, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "target_name = \"class\"\n", 31 | "target = df[target_name].to_numpy()\n", 32 | "data = df.drop(columns=[target_name, \"fnlwgt\"])\n", 33 | "numerical_columns = [\n", 34 | " c for c in data.columns if data[c].dtype.kind in [\"i\", \"f\"]]\n", 35 | "data_numeric = data[numerical_columns]" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "from sklearn.model_selection import cross_val_score\n", 45 | "from sklearn.dummy import DummyClassifier\n", 46 | "\n", 47 | "high_revenue_clf = DummyClassifier(strategy=\"constant\",\n", 48 | " constant=\" >50K\")\n", 49 | "scores = cross_val_score(high_revenue_clf, data_numeric, target)\n", 50 | "print(f\"{scores.mean():.3f} +/- {scores.std():.3f}\")" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": null, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "low_revenue_clf = DummyClassifier(strategy=\"constant\",\n", 60 | " constant=\" <=50K\")\n", 61 | "scores = cross_val_score(low_revenue_clf, data_numeric, target)\n", 62 | "print(f\"{scores.mean():.3f} +/- {scores.std():.3f}\")" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": null, 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [ 71 | "most_freq_revenue_clf = DummyClassifier(strategy=\"most_frequent\")\n", 72 | "scores = cross_val_score(most_freq_revenue_clf, data_numeric, target)\n", 73 | "print(f\"{scores.mean():.3f} +/- {scores.std():.3f}\")" 74 | ] 75 | }, 76 | { 77 | "cell_type": "markdown", 78 | "metadata": {}, 79 | "source": [ 80 | "So 81% accuracy is significantly better than 76% which is the score of a baseline model that would always predict the most frequent class which is the low revenue class: `\" <=50K\"`.\n", 81 | "\n", 82 | "In this dataset, we can see that the target classes are 
imbalanced: almost 3/4 of the records are people with a revenue below 50K:" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": null, 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [ 91 | "df[\"class\"].value_counts()" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [ 100 | "(target == \" <=50K\").mean()" 101 | ] 102 | } 103 | ], 104 | "metadata": { 105 | "jupytext": { 106 | "formats": "python_scripts//py:percent,notebooks//ipynb" 107 | }, 108 | "kernelspec": { 109 | "display_name": "Python 3", 110 | "language": "python", 111 | "name": "python3" 112 | } 113 | }, 114 | "nbformat": 4, 115 | "nbformat_minor": 2 116 | } 117 | -------------------------------------------------------------------------------- /Day_2_Machine_Learning_Python/04_basic_parameters_tuning_exercise_01.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Exercise 01\n", 8 | "The goal is to write an exhaustive search to find the best parameter\n", 9 | "combination maximizing the model performance." 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import pandas as pd\n", 19 | "\n", 20 | "from sklearn.model_selection import train_test_split\n", 21 | "from sklearn.preprocessing import OrdinalEncoder\n", 22 | "from sklearn.model_selection import RandomizedSearchCV\n", 23 | "from sklearn.compose import ColumnTransformer\n", 24 | "from sklearn.pipeline import Pipeline\n", 25 | "# This line is currently required to import HistGradientBoostingClassifier\n", 26 | "from sklearn.experimental import enable_hist_gradient_boosting\n", 27 | "from sklearn.ensemble import HistGradientBoostingClassifier\n", 28 | "\n", 29 | "from scipy.stats import expon, uniform\n", 30 | "from scipy.stats import randint\n", 31 | "\n", 32 | "df = pd.read_csv(\n", 33 | "    \"https://www.openml.org/data/get_csv/1595261/adult-census.csv\")\n", 34 | "# Or use the local copy:\n", 35 | "# df = pd.read_csv('../datasets/adult-census.csv')\n", 36 | "\n", 37 | "target_name = \"class\"\n", 38 | "target = df[target_name].to_numpy()\n", 39 | "data = df.drop(columns=target_name)\n", 40 | "\n", 41 | "df_train, df_test, target_train, target_test = train_test_split(\n", 42 | "    data, target, random_state=42)\n", 43 | "\n", 47 | "categorical_columns = [\n", 48 | "    'workclass', 'education', 'marital-status', 'occupation',\n", 49 | "    'relationship', 'race', 'native-country', 'sex']\n", 50 | "\n", 51 | "categories = [data[column].unique()\n", 52 | "              for column in data[categorical_columns]]\n", 53 | "\n", 54 | "categorical_preprocessor = OrdinalEncoder(categories=categories)\n", 55 | "\n", 56 | "preprocessor = ColumnTransformer(\n", 57 | "    [('cat-preprocessor', categorical_preprocessor, categorical_columns)],\n", 58 | "    remainder='passthrough', sparse_threshold=0)\n", 59 | "\n", 62 | "from sklearn.pipeline import make_pipeline\n", 63 | "\n", 64 | "model = make_pipeline(\n", 65 | "    preprocessor, HistGradientBoostingClassifier(random_state=42))" 66 | ] 67 | }, 68 | { 69 | "cell_type": 
"markdown", 70 | "metadata": {}, 71 | "source": [ 72 | "TODO: write your solution here\n", 73 | "\n", 74 | "Use the previously defined model (called `model`) and using two nested `for`\n", 75 | "loops, make a search of the best combinations of the `learning_rate` and\n", 76 | "`max_leaf_nodes` parameters. In this regard, you will need to train and test\n", 77 | "the model by setting the parameters. The evaluation of the model should be\n", 78 | "performed using `cross_val_score`. We can propose to define the following\n", 79 | "parameters search:\n", 80 | "- `learning_rate` for the values 0.01, 0.1, and 1;\n", 81 | "- `max_leaf_nodes` for the values 5, 25, 45." 82 | ] 83 | } 84 | ], 85 | "metadata": { 86 | "jupytext": { 87 | "formats": "python_scripts//py:percent,notebooks//ipynb" 88 | }, 89 | "kernelspec": { 90 | "display_name": "Python 3", 91 | "language": "python", 92 | "name": "python3" 93 | } 94 | }, 95 | "nbformat": 4, 96 | "nbformat_minor": 2 97 | } 98 | -------------------------------------------------------------------------------- /Day_1_Scientific_Python/numpy_with_answers/01-numpy-introduction.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Introduction to Data Science in Python\n", 8 | "\n", 9 | "### Numpy tutorial, March, 13th 2019\n", 10 | "\n", 11 | "Working efficiently with multi-dimensional arrays in NumPy\n", 12 | "\n", 13 | "Maria Teleńczuk
\n", 14 | "email: maria@telenczuk.pl" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "## Quick introduction to Jupyter notebook\n", 22 | "\n", 23 | "to run Jupyter notebook in your terminal type:
\n", 24 | " `jupyter notebook`\n", 25 | " \n", 26 | "**Esc** : takes you into command mode, there you can use:\n", 27 | " - __a__ : insert a new cell above
\n", 28 | " - __b__ : insert a new cell below
\n", 29 | " - **m** : change the current cell to Markdown
\n", 30 | " - **y** : change the current cell to code\n", 31 | "\n", 32 | "**Enter** : go back to edit mode\n", 33 | " \n", 34 | "**Shift + Enter** : execute the cell, move to the cell below\n", 35 | "\n", 36 | "__?__ : help" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [ 43 | "# Topics\n", 44 | "1. [Introduction to numpy](numpys/numpy_intro.ipynb)\n", 45 | "2. [Working with a dataset](numpys/dataset_intro.ipynb)\n", 46 | "3. [Filtering data](numpys/filtering_data.ipynb)\n", 47 | "4. [Slices](numpys/slices.ipynb)\n", 48 | "5. [Operations](numpys/operations.ipynb)\n", 49 | "6. [Stacking](numpys/stacking.ipynb) \n", 50 | "\n", 51 | "### Extra topics\n", 52 | "\n", 53 | "7. [K-means clustering](numpys/k_means.ipynb)\n", 54 | "8. [Savez() and load()](numpys/savez.ipynb)\n", 55 | "9. [Fancy indexing](numpys/fancy_indexing.ipynb)\n", 56 | "10. [Broadcasting](numpys/broadcasting.ipynb)\n", 57 | "11. [Test yourself](numpys/test_yourself.ipynb)" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": {}, 63 | "source": [ 64 | "## Based on material from:\n", 65 | " - Software Carpentry, [Python Novice inflammation](https://github.com/swcarpentry/python-novice-inflammation)\n", 66 | " - Paris Software Carpentry, [Advanced numpy lesson](https://paris-swc.github.io/advanced-numpy-lesson/)\n", 67 | " - Bartosz Teleńczuk, [Advanced Numpy tutorial](https://github.com/paris-saclay-cds/data-science-workshop-2019/blob/master/Day_1_Scientific_Python/01-numpy-introduction.ipynb)\n", 68 | " " 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": {}, 74 | "source": [ 75 | " ### You might also like:\n", 76 | " - Gaël Varoquaux, Emmanuelle Gouillart and Olav Vahtras (editors), [SciPy Lectures](http://scipy-lectures.org/)\n", 77 | " - NumPy community, [NumPy Docs](https://docs.scipy.org/doc/numpy/)\n", 78 | " - Juan Nuñez-Iglesias, [Lecture on Advanced NumPy patterns](https://github.com/jni/aspp2015)\n", 79 | " - Stéfan van der Walt, [Advanced NumPy tutorial](https://python.g-node.org/python-summerschool-2014/numpy.html)\n", 80 | " - Nicolas Rougier, [100 NumPy exercises](https://github.com/rougier/numpy-100)\n", 81 | " - Bartosz Teleńczuk, [Advanced NumPy lesson](https://github.com/paris-swc/advanced-numpy-lesson)" 82 | ] 83 | } 84 | ], 85 | "metadata": { 86 | "kernelspec": { 87 | "display_name": "Python 3", 88 | "language": "python", 89 | "name": "python3" 90 | }, 91 | "language_info": { 92 | "codemirror_mode": { 93 | "name": "ipython", 94 | "version": 3 95 | }, 96 | "file_extension": ".py", 97 | "mimetype": "text/x-python", 98 | "name": "python", 99 | "nbconvert_exporter": "python", 100 | "pygments_lexer": "ipython3", 101 | "version": "3.7.2" 102 | } 103 | }, 104 | "nbformat": 4, 105 | "nbformat_minor": 2 106 | } 107 | -------------------------------------------------------------------------------- /img/webfont-ubuntu-400-300-100.css: -------------------------------------------------------------------------------- 1 | /* cyrillic-ext */ 2 | @font-face { 3 | font-family: 'Ubuntu'; 4 | font-style: normal; 5 | font-weight: 300; 6 | src: local('Ubuntu Light'), local('Ubuntu-Light'), url(https://fonts.gstatic.com/s/ubuntu/v9/X_EdMnknKUltk57alVVbVxJtnKITppOI_IvcXXDNrsc.woff2) format('woff2'); 7 | unicode-range: U+0460-052F, U+20B4, U+2DE0-2DFF, U+A640-A69F; 8 | } 9 | /* cyrillic */ 10 | @font-face { 11 | font-family: 'Ubuntu'; 12 | font-style: normal; 13 | font-weight: 300; 14 | src: local('Ubuntu Light'), local('Ubuntu-Light'), 
url(https://fonts.gstatic.com/s/ubuntu/v9/nBF2d6Y3AbOwfkBM-9HcWBJtnKITppOI_IvcXXDNrsc.woff2) format('woff2'); 15 | unicode-range: U+0400-045F, U+0490-0491, U+04B0-04B1, U+2116; 16 | } 17 | /* greek-ext */ 18 | @font-face { 19 | font-family: 'Ubuntu'; 20 | font-style: normal; 21 | font-weight: 300; 22 | src: local('Ubuntu Light'), local('Ubuntu-Light'), url(https://fonts.gstatic.com/s/ubuntu/v9/CdlIlwqST01WNAKqZbtZkhJtnKITppOI_IvcXXDNrsc.woff2) format('woff2'); 23 | unicode-range: U+1F00-1FFF; 24 | } 25 | /* greek */ 26 | @font-face { 27 | font-family: 'Ubuntu'; 28 | font-style: normal; 29 | font-weight: 300; 30 | src: local('Ubuntu Light'), local('Ubuntu-Light'), url(https://fonts.gstatic.com/s/ubuntu/v9/7k0RmqCN8EFxqS6sChuRzRJtnKITppOI_IvcXXDNrsc.woff2) format('woff2'); 31 | unicode-range: U+0370-03FF; 32 | } 33 | /* latin-ext */ 34 | @font-face { 35 | font-family: 'Ubuntu'; 36 | font-style: normal; 37 | font-weight: 300; 38 | src: local('Ubuntu Light'), local('Ubuntu-Light'), url(https://fonts.gstatic.com/s/ubuntu/v9/WtcvfJHWXKxx4x0kuS1koRJtnKITppOI_IvcXXDNrsc.woff2) format('woff2'); 39 | unicode-range: U+0100-024F, U+1E00-1EFF, U+20A0-20AB, U+20AD-20CF, U+2C60-2C7F, U+A720-A7FF; 40 | } 41 | /* latin */ 42 | @font-face { 43 | font-family: 'Ubuntu'; 44 | font-style: normal; 45 | font-weight: 300; 46 | src: local('Ubuntu Light'), local('Ubuntu-Light'), url(https://fonts.gstatic.com/s/ubuntu/v9/_aijTyevf54tkVDLy-dlnFtXRa8TVwTICgirnJhmVJw.woff2) format('woff2'); 47 | unicode-range: U+0000-00FF, U+0131, U+0152-0153, U+02C6, U+02DA, U+02DC, U+2000-206F, U+2074, U+20AC, U+2212, U+2215, U+E0FF, U+EFFD, U+F000; 48 | } 49 | /* cyrillic-ext */ 50 | @font-face { 51 | font-family: 'Ubuntu'; 52 | font-style: normal; 53 | font-weight: 400; 54 | src: local('Ubuntu'), url(https://fonts.gstatic.com/s/ubuntu/v9/ODszJI8YqNw8V2xPulzjO_esZW2xOQ-xsNqO47m55DA.woff2) format('woff2'); 55 | unicode-range: U+0460-052F, U+20B4, U+2DE0-2DFF, U+A640-A69F; 56 | } 57 | /* cyrillic */ 58 | @font-face { 59 | font-family: 'Ubuntu'; 60 | font-style: normal; 61 | font-weight: 400; 62 | src: local('Ubuntu'), url(https://fonts.gstatic.com/s/ubuntu/v9/iQ9VJx1UMASKNiGywyyCXvesZW2xOQ-xsNqO47m55DA.woff2) format('woff2'); 63 | unicode-range: U+0400-045F, U+0490-0491, U+04B0-04B1, U+2116; 64 | } 65 | /* greek-ext */ 66 | @font-face { 67 | font-family: 'Ubuntu'; 68 | font-style: normal; 69 | font-weight: 400; 70 | src: local('Ubuntu'), url(https://fonts.gstatic.com/s/ubuntu/v9/WkvQmvwsfw_KKeau9SlQ2_esZW2xOQ-xsNqO47m55DA.woff2) format('woff2'); 71 | unicode-range: U+1F00-1FFF; 72 | } 73 | /* greek */ 74 | @font-face { 75 | font-family: 'Ubuntu'; 76 | font-style: normal; 77 | font-weight: 400; 78 | src: local('Ubuntu'), url(https://fonts.gstatic.com/s/ubuntu/v9/gYAtqXUikkQjyJA1SnpDLvesZW2xOQ-xsNqO47m55DA.woff2) format('woff2'); 79 | unicode-range: U+0370-03FF; 80 | } 81 | /* latin-ext */ 82 | @font-face { 83 | font-family: 'Ubuntu'; 84 | font-style: normal; 85 | font-weight: 400; 86 | src: local('Ubuntu'), url(https://fonts.gstatic.com/s/ubuntu/v9/Wu5Iuha-XnKDBvqRwQzAG_esZW2xOQ-xsNqO47m55DA.woff2) format('woff2'); 87 | unicode-range: U+0100-024F, U+1E00-1EFF, U+20A0-20AB, U+20AD-20CF, U+2C60-2C7F, U+A720-A7FF; 88 | } 89 | /* latin */ 90 | @font-face { 91 | font-family: 'Ubuntu'; 92 | font-style: normal; 93 | font-weight: 400; 94 | src: local('Ubuntu'), url(https://fonts.gstatic.com/s/ubuntu/v9/sDGTilo5QRsfWu6Yc11AXg.woff2) format('woff2'); 95 | unicode-range: U+0000-00FF, U+0131, U+0152-0153, U+02C6, U+02DA, U+02DC, U+2000-206F, 
U+2074, U+20AC, U+2212, U+2215, U+E0FF, U+EFFD, U+F000; 96 | } 97 | -------------------------------------------------------------------------------- /Day_2_Machine_Learning_Python/figures/plot-simple-decision-tree-adult-census.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import pandas as pd 4 | 5 | import matplotlib.pyplot as plt 6 | from matplotlib.pyplot import cm 7 | from matplotlib.colors import ListedColormap 8 | 9 | import seaborn as sns 10 | 11 | from sklearn.preprocessing import LabelEncoder 12 | from sklearn.tree import DecisionTreeClassifier 13 | 14 | adult_census = pd.read_csv( 15 | "https://www.openml.org/data/get_csv/1595261/adult-census.csv") 16 | 17 | target_column = 'class' 18 | 19 | numerical_columns = [ 20 | 'age', 'education-num', 'capital-gain', 'capital-loss', 21 | 'hours-per-week'] 22 | categorical_columns = [ 23 | 'workclass', 'education', 'marital-status', 'occupation', 24 | 'relationship', 'race', 'sex', 'native-country'] 25 | all_columns = numerical_columns + categorical_columns + [ 26 | target_column] 27 | 28 | adult_census = adult_census[all_columns] 29 | 30 | n_samples_to_plot = 5000 31 | columns = ['age', 'education-num', 'hours-per-week'] 32 | _ = sns.pairplot(data=adult_census[:n_samples_to_plot], vars=columns, 33 | hue=target_column, plot_kws={'alpha': 0.2}, 34 | height=4, diag_kind='hist') 35 | 36 | _ = sns.pairplot(data=adult_census[:n_samples_to_plot], x_vars='age', 37 | y_vars='hours-per-week', hue=target_column, 38 | markers=['o', 39 | 'v'], plot_kws={'alpha': 0.2}, height=12) 40 | 41 | top = cm.get_cmap('Oranges', 128) 42 | bottom = cm.get_cmap('Blues_r', 128) 43 | 44 | colors = np.vstack([bottom(np.linspace(0, 1, 128)), 45 | top(np.linspace(0, 1, 128))]) 46 | blue_orange_cmap = ListedColormap(colors, name='BlueOrange') 47 | 48 | 49 | def plot_tree_decision_function(tree, X, y, ax): 50 | """Plot the different decision rules found by a `DecisionTreeClassifier`. 51 | 52 | Parameters 53 | ---------- 54 | tree : DecisionTreeClassifier instance 55 | The decision tree to inspect. 56 | X : dataframe of shape (n_samples, n_features) 57 | The data used to train the `tree` estimator. 58 | y : ndarray of shape (n_samples,) 59 | The target used to train the `tree` estimator. 60 | ax : matplotlib axis 61 | The matplotlib axis where to plot the different decision rules. 
62 | """ 63 | import numpy as np 64 | from scipy import ndimage 65 | 66 | h = 0.02 67 | x_min, x_max = 0, 100 68 | y_min, y_max = 0, 100 69 | xx, yy = np.meshgrid(np.arange(x_min, x_max, h), 70 | np.arange(y_min, y_max, h)) 71 | 72 | Z = tree.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1] 73 | Z = Z.reshape(xx.shape) 74 | faces = tree.tree_.apply( 75 | np.c_[xx.ravel(), yy.ravel()].astype(np.float32)) 76 | faces = faces.reshape(xx.shape) 77 | border = ndimage.laplace(faces) != 0 78 | ax.scatter(X.iloc[:, 0], X.iloc[:, 1], 79 | c=np.array(['tab:blue', 80 | 'tab:orange'])[y], s=60, alpha=0.7, vmin=0, vmax=1) 81 | levels = np.linspace(0, 1, 101) 82 | contours = ax.contourf(xx, yy, Z, levels=levels, alpha=.4, cmap=blue_orange_cmap) 83 | ax.get_figure().colorbar(contours, ticks=np.linspace(0, 1, 11)) 84 | ax.scatter(xx[border], yy[border], marker='.', s=1) 85 | ax.set_xlabel(X.columns[0]) 86 | ax.set_ylabel(X.columns[1]) 87 | ax.set_xlim([x_min, x_max]) 88 | ax.set_ylim([y_min, y_max]) 89 | sns.despine(offset=10) 90 | 91 | 92 | # select a subset of data 93 | data_subset = adult_census[:n_samples_to_plot] 94 | X = data_subset[["age", "hours-per-week"]] 95 | y = LabelEncoder().fit_transform( 96 | data_subset[target_column].to_numpy()) 97 | 98 | max_leaf_nodes = 3 99 | tree = DecisionTreeClassifier(max_leaf_nodes=max_leaf_nodes, 100 | random_state=0) 101 | tree.fit(X, y) 102 | 103 | # plot the decision function learned by the tree 104 | fig, ax = plt.subplots() 105 | plot_tree_decision_function(tree, X, y, ax=ax) 106 | 107 | fig.savefig('simple-decision-tree-adult-census.png') 108 | -------------------------------------------------------------------------------- /Day_1_Scientific_Python/01-numpy-introduction.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Introduction to Data Science in Python\n", 8 | "\n", 9 | "### Numpy tutorial, November, 28th 2019\n", 10 | "\n", 11 | "Working efficiently with multi-dimensional arrays in NumPy\n", 12 | "\n", 13 | "Maria Teleńczuk
\n", 14 | "email: telenczukm at gmail.com" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "## Quick introduction to Jupyter notebook\n", 22 | "\n", 23 | "to run Jupyter notebook in your terminal type:
\n", 24 | " `jupyter notebook`\n", 25 | " \n", 26 | "alternatively you may want to run Jupyter lab which is more advanced product of jupyter:
\n", 27 | " `jupyter lab`\n", 28 | " \n", 29 | "**Esc** : takes you into command mode, there you can use:\n", 30 | " - __a__ : insert a new cell above
\n", 31 | " - __b__ : insert a new cell below
\n", 32 | " - **m** : change the current cell to Markdown
\n", 33 | " - **y** : change the current cell to code\n", 34 | "\n", 35 | "**Enter** : go back to edit mode\n", 36 | " \n", 37 | "**Shift + Enter** : execute the cell, move to the cell below\n", 38 | "\n", 39 | "__?__ : help" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": {}, 45 | "source": [ 46 | "# Topics\n", 47 | "1. [Introduction to numpy](numpys/numpy_intro.ipynb)\n", 48 | "2. [Working with a dataset](numpys/dataset_intro.ipynb)\n", 49 | "3. [Filtering data](numpys/filtering_data.ipynb)\n", 50 | "4. [Slices](numpys/slices.ipynb)\n", 51 | "5. [Operations](numpys/operations.ipynb)\n", 52 | "6. [Stacking](numpys/stacking.ipynb) \n", 53 | "\n", 54 | "### Extra topics\n", 55 | "\n", 56 | "7. [K-means clustering](numpys/k_means.ipynb)\n", 57 | "8. [Savez() and load()](numpys/savez.ipynb)\n", 58 | "9. [Fancy indexing](numpys/fancy_indexing.ipynb)\n", 59 | "10. [Broadcasting](numpys/broadcasting.ipynb)\n", 60 | "11. [Test yourself](numpys/test_yourself.ipynb)" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": {}, 66 | "source": [ 67 | "## Based on material from:\n", 68 | " - Software Carpentry, [Python Novice inflammation](https://github.com/swcarpentry/python-novice-inflammation)\n", 69 | " - Paris Software Carpentry, [Advanced numpy lesson](https://paris-swc.github.io/advanced-numpy-lesson/)\n", 70 | " - Bartosz Teleńczuk, [Advanced Numpy tutorial](https://github.com/paris-saclay-cds/data-science-workshop-2019/blob/master/Day_1_Scientific_Python/01-numpy-introduction.ipynb)\n", 71 | " " 72 | ] 73 | }, 74 | { 75 | "cell_type": "markdown", 76 | "metadata": {}, 77 | "source": [ 78 | " ### You might also like:\n", 79 | " - Gaël Varoquaux, Emmanuelle Gouillart and Olav Vahtras (editors), [SciPy Lectures](http://scipy-lectures.org/)\n", 80 | " - NumPy community, [NumPy Docs](https://docs.scipy.org/doc/numpy/)\n", 81 | " - Juan Nuñez-Iglesias, [Lecture on Advanced NumPy patterns](https://github.com/jni/aspp2015)\n", 82 | " - Stéfan van der Walt, [Advanced NumPy tutorial](https://python.g-node.org/python-summerschool-2014/numpy.html)\n", 83 | " - Nicolas Rougier, [100 NumPy exercises](https://github.com/rougier/numpy-100)\n", 84 | " - Bartosz Teleńczuk, [Advanced NumPy lesson](https://github.com/paris-swc/advanced-numpy-lesson)" 85 | ] 86 | } 87 | ], 88 | "metadata": { 89 | "kernelspec": { 90 | "display_name": "Python 3", 91 | "language": "python", 92 | "name": "python3" 93 | }, 94 | "language_info": { 95 | "codemirror_mode": { 96 | "name": "ipython", 97 | "version": 3 98 | }, 99 | "file_extension": ".py", 100 | "mimetype": "text/x-python", 101 | "name": "python", 102 | "nbconvert_exporter": "python", 103 | "pygments_lexer": "ipython3", 104 | "version": "3.7.2" 105 | } 106 | }, 107 | "nbformat": 4, 108 | "nbformat_minor": 2 109 | } 110 | -------------------------------------------------------------------------------- /Day_1_Scientific_Python/numpy_with_answers/numpys/filtering_data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Filtering data\n", 8 | "It's also possible to select elements (filter) based on a condition. 
" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": null, 14 | "metadata": {}, 15 | "outputs": [], 16 | "source": [ 17 | "import numpy as np\n", 18 | "data = np.loadtxt(fname='../data/inflammation-01.csv', delimiter=',')" 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "metadata": {}, 24 | "source": [ 25 | "Sometimes we may want to select array elements based on their values. For this case boolean mask is very useful. The mask is an array of the same length as the indexed array containg only False or True values:" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": null, 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "a = np.arange(4)\n", 35 | "print(a)" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "mask = np.array([False, True, True, False])" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "a[mask]" 54 | ] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "metadata": {}, 59 | "source": [ 60 | "In most cases the mask is constructed from the values of the array itself. For example, to select only odd numbers we could use the following mask:" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "odd = (a % 2) == 1\n", 70 | "odd" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": null, 76 | "metadata": {}, 77 | "outputs": [], 78 | "source": [ 79 | "a[odd]" 80 | ] 81 | }, 82 | { 83 | "cell_type": "markdown", 84 | "metadata": {}, 85 | "source": [ 86 | "This could be also done in a single step:" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": null, 92 | "metadata": {}, 93 | "outputs": [], 94 | "source": [ 95 | "a[(a % 2) == 1]" 96 | ] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": {}, 101 | "source": [ 102 | "## $\\color{green}{\\text{Excercise}}$ Filtering data\n", 103 | "In the `data` what do you have to do to select all measurments above 10 in the first patient (index 0)?" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": null, 109 | "metadata": {}, 110 | "outputs": [], 111 | "source": [ 112 | "patient0_data = data[0, :]\n", 113 | "patient0_data[patient0_data > 10]" 114 | ] 115 | }, 116 | { 117 | "cell_type": "markdown", 118 | "metadata": {}, 119 | "source": [ 120 | "We can also substitute the measurement with a new value:" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": null, 126 | "metadata": {}, 127 | "outputs": [], 128 | "source": [ 129 | "patient1_data = data[1, :]\n", 130 | "patient1_data[patient1_data > 10] = 10\n", 131 | "print(patient1_data)" 132 | ] 133 | }, 134 | { 135 | "cell_type": "markdown", 136 | "metadata": {}, 137 | "source": [ 138 | "[Previous: Working with a dataset](dataset_intro.ipynb)
[Next: Slices](slices.ipynb)" 139 | ] 140 | } 141 | ], 142 | "metadata": { 143 | "kernelspec": { 144 | "display_name": "Python 3", 145 | "language": "python", 146 | "name": "python3" 147 | }, 148 | "language_info": { 149 | "codemirror_mode": { 150 | "name": "ipython", 151 | "version": 3 152 | }, 153 | "file_extension": ".py", 154 | "mimetype": "text/x-python", 155 | "name": "python", 156 | "nbconvert_exporter": "python", 157 | "pygments_lexer": "ipython3", 158 | "version": "3.7.2" 159 | } 160 | }, 161 | "nbformat": 4, 162 | "nbformat_minor": 2 163 | } 164 | -------------------------------------------------------------------------------- /Day_1_Scientific_Python/numpys/filtering_data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Filtering data\n", 8 | "It's also possible to select elements (filter) based on a condition. " 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": null, 14 | "metadata": {}, 15 | "outputs": [], 16 | "source": [ 17 | "import numpy as np\n", 18 | "data = np.loadtxt(fname='../data/inflammation-01.csv', delimiter=',')" 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "metadata": {}, 24 | "source": [ 25 | "Sometimes we may want to select array elements based on their values. For this case a boolean mask is very useful. The mask is an array of the same length as the indexed array containing only False or True values:" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": null, 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "a = np.arange(4)\n", 35 | "print(a)" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "mask = np.array([False, True, True, False])" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "a[mask]" 54 | ] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "metadata": {}, 59 | "source": [ 60 | "In most cases the mask is constructed from the values of the array itself. For example, to select only odd numbers we could use the following mask:" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "odd = (a % 2) == 1\n", 70 | "odd" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": null, 76 | "metadata": {}, 77 | "outputs": [], 78 | "source": [ 79 | "a[odd]" 80 | ] 81 | }, 82 | { 83 | "cell_type": "markdown", 84 | "metadata": {}, 85 | "source": [ 86 | "This could also be done in a single step:" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": null, 92 | "metadata": {}, 93 | "outputs": [], 94 | "source": [ 95 | "a[(a % 2) == 1]" 96 | ] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": {}, 101 | "source": [ 102 | "## $\color{green}{\text{Exercise}}$ Filtering data\n", 103 | "In the `data`, what do you have to do to select all measurements above 10 in the first patient (index 0)?"
104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": null, 109 | "metadata": {}, 110 | "outputs": [], 111 | "source": [] 112 | }, 113 | { 114 | "cell_type": "markdown", 115 | "metadata": {}, 116 | "source": [ 117 | "We can also substitute the measurement with a new value:" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "metadata": {}, 124 | "outputs": [], 125 | "source": [ 126 | "patient1_data = data[1, :]\n", 127 | "patient1_data[patient1_data > 10] = 10\n", 128 | "print(patient1_data)" 129 | ] 130 | }, 131 | { 132 | "cell_type": "markdown", 133 | "metadata": {}, 134 | "source": [ 135 | "[Previous: Working with a dataset](dataset_intro.ipynb)
[Next: Slices](slices.ipynb)" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [] 144 | } 145 | ], 146 | "metadata": { 147 | "kernelspec": { 148 | "display_name": "Python 3", 149 | "language": "python", 150 | "name": "python3" 151 | }, 152 | "language_info": { 153 | "codemirror_mode": { 154 | "name": "ipython", 155 | "version": 3 156 | }, 157 | "file_extension": ".py", 158 | "mimetype": "text/x-python", 159 | "name": "python", 160 | "nbconvert_exporter": "python", 161 | "pygments_lexer": "ipython3", 162 | "version": "3.7.2" 163 | } 164 | }, 165 | "nbformat": 4, 166 | "nbformat_minor": 2 167 | } 168 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 2-Day Workshop - Introduction to Data Science in Python 2 | 3 | Materials for the Paris-Saclay Center for Data Science Python workshop 4 | 5 | Data science is gaining attention and is impacting many scientific fields and applications. Data science encompasses a large number of topics such as data mining, data wrangling, data visualisation, pattern recognition, or machine learning. 6 | 7 | This workshop intends to give an introduction to some of these topics using Python and the PyData ecosystem. It is not a course on deep learning. 8 | 9 | *Note: the material in this repo is WIP, not the finalized material.* 10 | 11 | You can run the notebooks in a binder: [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/paris-saclay-cds/data-science-workshop-2019/master) 12 | 13 | ## Program 14 | 15 | ### Day 1 - Data wrangling, exploration, and visualisation 16 | 17 | **Goal:** introduce the PyData ecosystem to manipulate, explore, and visualize data. 18 | 19 | * Introduction to the basics of numpy, pandas, and matplotlib. 20 | 21 | ### Day 2 - Machine learning 22 | 23 | **Goal:** introduce the basics of machine learning using the scikit-learn library. 24 | 25 | * Get familiar with general principles of machine learning; 26 | * Use these principles by using the scikit-learn library on some toy and real-world data examples. 27 | 28 | 29 | ## Getting started 30 | 31 | The course uses Python 3 and some data analysis packages such as NumPy, pandas, scikit-learn, matplotlib, and seaborn. To install the required libraries, we highly recommend Anaconda or Miniconda, or another Python distribution that includes the scientific libraries (this recommendation applies to all platforms: Windows, Linux, and Mac). 32 | 33 | ### Install Anaconda 34 | 35 | For first-time users and people not fully confident with the command line, we advise installing Anaconda by downloading and installing the Python 3.x version from the Anaconda website. Recent computers will require the 64-bit installer. 36 | 37 | For more detailed instructions to install Anaconda, check the [Windows](https://docs.anaconda.com/anaconda/install/windows/), [Mac](https://docs.anaconda.com/anaconda/install/mac-os/) or [Linux](https://docs.anaconda.com/anaconda/install/linux/) installation tutorial. 38 | 39 | **Note:** when you are already familiar with the command line and Python environments, you could opt to use Miniconda instead of Anaconda and download it from the Conda website.
The main difference is that Anaconda provides a graphical user interface (Anaconda Navigator) and a large collection of scientific packages when installing, whereas for Miniconda the user needs to install all packages using the command line. On the other hand, Miniconda requires less disk space. If you choose Miniconda, create the workshop environment using the `environment.yml` file: `conda env create -f environment.yml` 40 | 41 | ### Install/check of required packages 42 | 43 | This tutorial will require recent installations of 44 | 45 | - [NumPy](http://www.numpy.org) 46 | - [SciPy](http://www.scipy.org) 47 | - [matplotlib](http://matplotlib.org) 48 | - [pandas](http://pandas.pydata.org) 49 | - [pillow](https://python-pillow.org) 50 | - [scikit-learn](http://scikit-learn.org/stable/) 51 | - [seaborn](http://seaborn.pydata.org/) 52 | - [IPython](http://ipython.readthedocs.org/en/stable/) 53 | - [Jupyter notebook](http://jupyter.org) 54 | - [plotly](https://plot.ly/) 55 | - [pandas-profiling](https://pandas-profiling.github.io/pandas-profiling/docs/) 56 | 57 | 58 | The Jupyter installation is particularly important: you should be able to type 59 | 60 | ```bash 61 | jupyter notebook 62 | ``` 63 | 64 | in your terminal window and see the notebook panel load in your web browser. Try opening and running a notebook from the material to check that it works. Alternatively, you can use Jupyter Lab. 65 | 66 | After obtaining the material, we **strongly recommend** that you execute the environment check script with `python check_env.py`; it is located at the top level of this repository. 67 | 68 | We also recommend updating scikit-learn to the latest release to ensure the best compatibility with the teaching material. Please upgrade already-installed packages by executing 69 | 70 | ```bash 71 | conda update [package-name] 72 | ``` 73 | 74 | (the exact command depends on how you installed ``scikit-learn``). 75 | 76 | 77 | 78 | -------------------------------------------------------------------------------- /Day_2_Machine_Learning_Python/04_basic_parameters_tuning_exercise_02.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Exercise 02\n", 8 | "The goal is to find the best set of hyper-parameters which maximizes the\n", 9 | "performance on a training set." 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import numpy as np\n", 19 | "import pandas as pd\n", 20 | "\n", 21 | "df = pd.read_csv(\n", 22 | "    \"https://www.openml.org/data/get_csv/1595261/adult-census.csv\")\n", 23 | "# Or use the local copy:\n", 24 | "# df = pd.read_csv('../datasets/adult-census.csv')\n", 25 | "\n", 26 | "target_name = \"class\"\n", 27 | "target = df[target_name].to_numpy()\n", 28 | "data = df.drop(columns=target_name)\n", 29 | "\n", 30 | "from sklearn.model_selection import train_test_split\n", 31 | "\n", 32 | "df_train, df_test, target_train, target_test = train_test_split(\n", 33 | "    data, target, random_state=42)" 34 | ] 35 | }, 36 | { 37 | "cell_type": "markdown", 38 | "metadata": {}, 39 | "source": [ 40 | "TODO: create your machine learning pipeline\n", 41 | "\n", 42 | "You should:\n", 43 | "* preprocess the categorical columns using a `OneHotEncoder` and use a\n", 44 | "  `StandardScaler` to normalize the numerical data.\n", 45 | "* use a `LogisticRegression` as a predictive model (one possible assembly is sketched below).
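\n", "\n", "A minimal sketch of one such pipeline (not the official solution; it assumes you have built the lists `categorical_columns` and `numerical_columns` of column names):\n", "\n", "```python\n", "from sklearn.compose import ColumnTransformer\n", "from sklearn.pipeline import make_pipeline\n", "from sklearn.preprocessing import OneHotEncoder, StandardScaler\n", "from sklearn.linear_model import LogisticRegression\n", "\n", "preprocessor = ColumnTransformer([\n", "    # one-hot encode the categorical columns, scale the numerical ones\n", "    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns),\n", "    ('num', StandardScaler(), numerical_columns)])\n", "model = make_pipeline(preprocessor, LogisticRegression(max_iter=1000))\n", "model.fit(df_train, target_train)\n", "```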
46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": { 51 | "lines_to_next_cell": 0 52 | }, 53 | "source": [ 54 | "Start by defining the columns and the preprocessing pipelines to be applied\n", 55 | "on each column." 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "\n", 65 | "from sklearn.preprocessing import OneHotEncoder\n", 66 | "from sklearn.preprocessing import StandardScaler" 67 | ] 68 | }, 69 | { 70 | "cell_type": "markdown", 71 | "metadata": { 72 | "lines_to_next_cell": 0 73 | }, 74 | "source": [ 75 | "Subsequently, create a `ColumnTransformer` to redirect the specific columns\n", 76 | "to a preprocessing pipeline." 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": null, 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [ 85 | "\n", 86 | "from sklearn.compose import ColumnTransformer" 87 | ] 88 | }, 89 | { 90 | "cell_type": "markdown", 91 | "metadata": { 92 | "lines_to_next_cell": 0 93 | }, 94 | "source": [ 95 | "Finally, concatenate the preprocessing pipeline with a logistic regression." 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": null, 101 | "metadata": { 102 | "lines_to_next_cell": 2 103 | }, 104 | "outputs": [], 105 | "source": [ 106 | "\n", 107 | "from sklearn.pipeline import make_pipeline\n", 108 | "from sklearn.linear_model import LogisticRegression" 109 | ] 110 | }, 111 | { 112 | "cell_type": "markdown", 113 | "metadata": {}, 114 | "source": [ 115 | "TODO: make your random search\n", 116 | "\n", 117 | "Use a `RandomizedSearchCV` to find the best set of hyper-parameters by tuning\n", 118 | "the following parameters for the `LogisticRegression` model:\n", 119 | "- `C` with values ranging from 0.001 to 10. You can use a reciprocal\n", 120 | "  distribution (i.e. `scipy.stats.reciprocal`);\n", 121 | "- `solver` with possible values being `\"liblinear\"` and `\"lbfgs\"`;\n", 122 | "- `penalty` with possible values being `\"l2\"` and `\"l1\"`.\n", 123 | "In addition, try several preprocessing strategies with the `OneHotEncoder`\n", 124 | "by always (or not) dropping the first column when encoding the categorical\n", 125 | "data.\n", 126 | "\n", 127 | "Note: you can tolerate failures during a grid-search or a randomized-search\n", 128 | "by setting `error_score` to `np.nan`, for instance." 129 | ] 130 | } 131 | ], 132 | "metadata": { 133 | "jupytext": { 134 | "formats": "python_scripts//py:percent,notebooks//ipynb" 135 | }, 136 | "kernelspec": { 137 | "display_name": "Python 3", 138 | "language": "python", 139 | "name": "python3" 140 | } 141 | }, 142 | "nbformat": 4, 143 | "nbformat_minor": 2 144 | } 145 | -------------------------------------------------------------------------------- /Day_1_Scientific_Python/numpys/boolean_mask.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Boolean mask\n", 8 | "\n", 9 | "Sometimes we may want to select array elements based on their values. For this case a boolean mask is very useful.
The mask is an array of the same length as the indexed array containing only False or True values:" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import numpy as np" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 2, 24 | "metadata": {}, 25 | "outputs": [ 26 | { 27 | "name": "stdout", 28 | "output_type": "stream", 29 | "text": [ 30 | "[0 1 2 3]\n" 31 | ] 32 | } 33 | ], 34 | "source": [ 35 | "a = np.arange(4)\n", 36 | "print(a)" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 3, 42 | "metadata": {}, 43 | "outputs": [ 44 | { 45 | "data": { 46 | "text/plain": [ 47 | "array([1, 2])" 48 | ] 49 | }, 50 | "execution_count": 3, 51 | "metadata": {}, 52 | "output_type": "execute_result" 53 | } 54 | ], 55 | "source": [ 56 | "mask = np.array([False, True, True, False])\n", 57 | "a[mask]" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": {}, 63 | "source": [ 64 | "In most cases the mask is constructed from the values of the array itself. For example, to select only odd numbers we could use the following mask:" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 4, 70 | "metadata": {}, 71 | "outputs": [ 72 | { 73 | "data": { 74 | "text/plain": [ 75 | "array([False,  True, False,  True])" 76 | ] 77 | }, 78 | "execution_count": 4, 79 | "metadata": {}, 80 | "output_type": "execute_result" 81 | } 82 | ], 83 | "source": [ 84 | "odd = (a % 2) == 1\n", 85 | "odd" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 5, 91 | "metadata": {}, 92 | "outputs": [ 93 | { 94 | "data": { 95 | "text/plain": [ 96 | "array([1, 3])" 97 | ] 98 | }, 99 | "execution_count": 5, 100 | "metadata": {}, 101 | "output_type": "execute_result" 102 | } 103 | ], 104 | "source": [ 105 | "np.array([False, True, False, True], dtype=bool)\n", 106 | "a[odd]" 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "metadata": {}, 112 | "source": [ 113 | "This could also be done in a single step:" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 6, 119 | "metadata": {}, 120 | "outputs": [ 121 | { 122 | "data": { 123 | "text/plain": [ 124 | "array([1, 3])" 125 | ] 126 | }, 127 | "execution_count": 6, 128 | "metadata": {}, 129 | "output_type": "execute_result" 130 | } 131 | ], 132 | "source": [ 133 | "a[(a % 2) == 1]" 134 | ] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "metadata": {}, 139 | "source": [ 140 | "## $\color{green}{\text{Quiz}}$ view or copy\n", 141 | "What are the final values of a and b at the end of the following program? Explain why.\n", 142 | "\n", 143 | "`a = np.arange(5)\n", 144 | "b = a[a < 3]\n", 145 | "b[::2] = 0`\n", 146 | "\n", 147 | "\n", 148 | "a) a = [0, 1, 2, 3, 4], b = [0, 1, 2]
\n", 149 | "b) a = [0, 1, 0, 3, 4], b = [0, 1, 0]
\n", 150 | "c) a = [0, 0, 2, 3, 4], b = [0, 0, 2]
\n", 151 | "d) a = [0, 1, 2, 3, 4], b = [0, 1, 0]
\n", 152 | "e) a = [0, 1, 2, 3, 4], b = [0, 1, 0, 3, 0]
" 153 | ] 154 | }, 155 | { 156 | "cell_type": "markdown", 157 | "metadata": {}, 158 | "source": [ 159 | "[Previous: Operations](operations.ipynb)
[Next: Stacking](stacking.ipynb)" 160 | ] 161 | } 162 | ], 163 | "metadata": { 164 | "kernelspec": { 165 | "display_name": "Python 3", 166 | "language": "python", 167 | "name": "python3" 168 | }, 169 | "language_info": { 170 | "codemirror_mode": { 171 | "name": "ipython", 172 | "version": 3 173 | }, 174 | "file_extension": ".py", 175 | "mimetype": "text/x-python", 176 | "name": "python", 177 | "nbconvert_exporter": "python", 178 | "pygments_lexer": "ipython3", 179 | "version": "3.7.2" 180 | } 181 | }, 182 | "nbformat": 4, 183 | "nbformat_minor": 2 184 | } 185 | -------------------------------------------------------------------------------- /Day_1_Scientific_Python/numpy_with_answers/numpys/boolean_mask.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Boolean mask\n", 8 | "\n", 9 | "Sometimes we may want to select array elements based on their values. For this case a boolean mask is very useful. The mask is an array of the same length as the indexed array containing only False or True values:" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import numpy as np" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 2, 24 | "metadata": {}, 25 | "outputs": [ 26 | { 27 | "name": "stdout", 28 | "output_type": "stream", 29 | "text": [ 30 | "[0 1 2 3]\n" 31 | ] 32 | } 33 | ], 34 | "source": [ 35 | "a = np.arange(4)\n", 36 | "print(a)" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 3, 42 | "metadata": {}, 43 | "outputs": [ 44 | { 45 | "data": { 46 | "text/plain": [ 47 | "array([1, 2])" 48 | ] 49 | }, 50 | "execution_count": 3, 51 | "metadata": {}, 52 | "output_type": "execute_result" 53 | } 54 | ], 55 | "source": [ 56 | "mask = np.array([False, True, True, False])\n", 57 | "a[mask]" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": {}, 63 | "source": [ 64 | "In most cases the mask is constructed from the values of the array itself.
For example, to select only odd numbers we could use the following mask:" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 4, 70 | "metadata": {}, 71 | "outputs": [ 72 | { 73 | "data": { 74 | "text/plain": [ 75 | "array([False,  True, False,  True])" 76 | ] 77 | }, 78 | "execution_count": 4, 79 | "metadata": {}, 80 | "output_type": "execute_result" 81 | } 82 | ], 83 | "source": [ 84 | "odd = (a % 2) == 1\n", 85 | "odd" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 5, 91 | "metadata": {}, 92 | "outputs": [ 93 | { 94 | "data": { 95 | "text/plain": [ 96 | "array([1, 3])" 97 | ] 98 | }, 99 | "execution_count": 5, 100 | "metadata": {}, 101 | "output_type": "execute_result" 102 | } 103 | ], 104 | "source": [ 105 | "np.array([False, True, False, True], dtype=bool)\n", 106 | "a[odd]" 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "metadata": {}, 112 | "source": [ 113 | "This could also be done in a single step:" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 6, 119 | "metadata": {}, 120 | "outputs": [ 121 | { 122 | "data": { 123 | "text/plain": [ 124 | "array([1, 3])" 125 | ] 126 | }, 127 | "execution_count": 6, 128 | "metadata": {}, 129 | "output_type": "execute_result" 130 | } 131 | ], 132 | "source": [ 133 | "a[(a % 2) == 1]" 134 | ] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "metadata": {}, 139 | "source": [ 140 | "## $\color{green}{\text{Quiz}}$ view or copy\n", 141 | "What are the final values of a and b at the end of the following program? Explain why.\n", 142 | "\n", 143 | "`a = np.arange(5)\n", 144 | "b = a[a < 3]\n", 145 | "b[::2] = 0`\n", 146 | "\n", 147 | "\n", 148 | "a) a = [0, 1, 2, 3, 4], b = [0, 1, 2]
\n", 149 | "b) a = [0, 1, 0, 3, 4], b = [0, 1, 0]
\n", 150 | "c) a = [0, 0, 2, 3, 4], b = [0, 0, 2]
\n", 151 | "d) a = [0, 1, 2, 3, 4], b = [0, 1, 0]
\n", 152 | "e) a = [0, 1, 2, 3, 4], b = [0, 1, 0, 3, 0]
" 153 | ] 154 | }, 155 | { 156 | "cell_type": "markdown", 157 | "metadata": {}, 158 | "source": [ 159 | "[Previous: Operations](operations.ipynb)
[Next: Stacking](stacking.ipynb)" 160 | ] 161 | } 162 | ], 163 | "metadata": { 164 | "kernelspec": { 165 | "display_name": "Python 3", 166 | "language": "python", 167 | "name": "python3" 168 | }, 169 | "language_info": { 170 | "codemirror_mode": { 171 | "name": "ipython", 172 | "version": 3 173 | }, 174 | "file_extension": ".py", 175 | "mimetype": "text/x-python", 176 | "name": "python", 177 | "nbconvert_exporter": "python", 178 | "pygments_lexer": "ipython3", 179 | "version": "3.7.2" 180 | } 181 | }, 182 | "nbformat": 4, 183 | "nbformat_minor": 2 184 | } 185 | -------------------------------------------------------------------------------- /Day_2_Machine_Learning_Python/04_basic_parameters_tuning_exercise_01_solution.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Exercise 01\n", 8 | "The goal is to write an exhaustive search to find the best parameters\n", 9 | "combination maximizing the model performance" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import pandas as pd\n", 19 | "\n", 20 | "from sklearn.model_selection import train_test_split\n", 21 | "from sklearn.preprocessing import OrdinalEncoder\n", 22 | "from sklearn.model_selection import RandomizedSearchCV\n", 23 | "from sklearn.compose import ColumnTransformer\n", 24 | "from sklearn.pipeline import Pipeline\n", 25 | "# This line is currently required to import HistGradientBoostingClassifier\n", 26 | "from sklearn.experimental import enable_hist_gradient_boosting\n", 27 | "from sklearn.ensemble import HistGradientBoostingClassifier\n", 28 | "\n", 29 | "from scipy.stats import expon, uniform\n", 30 | "from scipy.stats import randint\n", 31 | "\n", 32 | "df = pd.read_csv(\n", 33 | " \"https://www.openml.org/data/get_csv/1595261/adult-census.csv\")\n", 34 | "# Or use the local copy:\n", 35 | "# df = pd.read_csv('../datasets/adult-census.csv')\n", 36 | "\n", 37 | "target_name = \"class\"\n", 38 | "target = df[target_name].to_numpy()\n", 39 | "data = df.drop(columns=target_name)\n", 40 | "\n", 41 | "df_train, df_test, target_train, target_test = train_test_split(\n", 42 | " data, target, random_state=42)\n", 43 | "\n", 44 | "from sklearn.compose import ColumnTransformer\n", 45 | "from sklearn.preprocessing import OrdinalEncoder\n", 46 | "\n", 47 | "categorical_columns = [\n", 48 | " 'workclass', 'education', 'marital-status', 'occupation',\n", 49 | " 'relationship', 'race', 'native-country', 'sex']\n", 50 | "\n", 51 | "categories = [data[column].unique()\n", 52 | " for column in data[categorical_columns]]\n", 53 | "\n", 54 | "categorical_preprocessor = OrdinalEncoder(categories=categories)\n", 55 | "\n", 56 | "preprocessor = ColumnTransformer(\n", 57 | " [('cat-preprocessor', categorical_preprocessor, categorical_columns)],\n", 58 | " remainder='passthrough', sparse_threshold=0)\n", 59 | "\n", 60 | "from sklearn.experimental import enable_hist_gradient_boosting\n", 61 | "from sklearn.ensemble import HistGradientBoostingClassifier\n", 62 | "from sklearn.pipeline import make_pipeline\n", 63 | "\n", 64 | "model = make_pipeline(\n", 65 | " preprocessor, HistGradientBoostingClassifier(random_state=42))" 66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "metadata": {}, 71 | "source": [ 72 | "TODO: write your solution here\n", 73 | "\n", 74 | "Use the previously defined model (called `model`) and using 
 two nested `for`\n", 75 | "loops, search for the best combination of the `learning_rate` and\n", 76 | "`max_leaf_nodes` parameters. To do so, you will need to set the parameters,\n", 77 | "then train and evaluate the model. The evaluation of the model should be\n", 78 | "performed using `cross_val_score`. We propose the following\n", 79 | "parameter search:\n", 80 | "- `learning_rate` for the values 0.01, 0.1, and 1;\n", 81 | "- `max_leaf_nodes` for the values 5, 25, 45." 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "from sklearn.model_selection import cross_val_score\n", 91 | "\n", 92 | "learning_rate = [0.01, 0.1, 1, 10]\n", 93 | "max_leaf_nodes = [5, 25, 45]\n", 94 | "\n", 95 | "best_score = 0\n", 96 | "best_params = {}\n", 97 | "for lr in learning_rate:\n", 98 | "    for mln in max_leaf_nodes:\n", 99 | "        model.set_params(\n", 100 | "            histgradientboostingclassifier__learning_rate=lr,\n", 101 | "            histgradientboostingclassifier__max_leaf_nodes=mln\n", 102 | "        )\n", 103 | "        scores = cross_val_score(model, df_train, target_train, cv=3)\n", 104 | "        if scores.mean() > best_score:\n", 105 | "            best_score = scores.mean()\n", 106 | "            best_params = {'learning-rate': lr, 'max leaf nodes': mln}\n", 107 | "print(f\"The best accuracy obtained is {best_score:.3f}\")\n", 108 | "print(f\"The best parameters found are:\\n {best_params}\")" 109 | ] 110 | } 111 | ], 112 | "metadata": { 113 | "jupytext": { 114 | "formats": "python_scripts//py:percent,notebooks//ipynb" 115 | }, 116 | "kernelspec": { 117 | "display_name": "Python 3", 118 | "language": "python", 119 | "name": "python3" 120 | } 121 | }, 122 | "nbformat": 4, 123 | "nbformat_minor": 2 124 | } 125 | -------------------------------------------------------------------------------- /Day_2_Machine_Learning_Python/03_basic_preprocessing_categorical_variables_exercise_02.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Exercise 03\n", 8 | "\n", 9 | "The goal of this exercise is to evaluate the impact of feature preprocessing on a pipeline that uses a decision-tree-based classifier instead of logistic regression.\n", 10 | "\n", 11 | "- The first question is to empirically evaluate whether scaling numerical features is helpful or not;\n", 12 | "\n", 13 | "- The second question is to evaluate whether it is empirically better (both from a computational and a statistical perspective) to use integer-coded or one-hot encoded categories (a small timing helper for such comparisons is sketched after this list)."
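\n", "\n", "A generic pattern for such empirical comparisons (a sketch only; `model` stands for whichever pipeline you build below):\n", "\n", "```python\n", "import time\n", "from sklearn.model_selection import cross_val_score\n", "\n", "def evaluate(model, data, target):\n", "    # measure both the statistical performance and the wall-clock cost\n", "    start = time.time()\n", "    scores = cross_val_score(model, data, target)\n", "    elapsed = time.time() - start\n", "    print(f\"accuracy: {scores.mean():.3f} +- {scores.std():.3f} \"\n", "          f\"({elapsed:.1f} s)\")\n", "```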
14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": null, 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "import pandas as pd\n", 23 | "from sklearn.model_selection import cross_val_score\n", 24 | "from sklearn.pipeline import make_pipeline\n", 25 | "from sklearn.compose import ColumnTransformer\n", 26 | "from sklearn.preprocessing import OrdinalEncoder\n", 27 | "from sklearn.experimental import enable_hist_gradient_boosting\n", 28 | "from sklearn.ensemble import HistGradientBoostingClassifier\n", 29 | "\n", 30 | "df = pd.read_csv(\n", 31 | " \"https://www.openml.org/data/get_csv/1595261/adult-census.csv\")\n", 32 | "\n", 33 | "# Or use the local copy:\n", 34 | "# df = pd.read_csv('../datasets/adult-census.csv')" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "target_name = \"class\"\n", 44 | "target = df[target_name].to_numpy()\n", 45 | "data = df.drop(columns=[target_name, \"fnlwgt\"])" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": null, 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "numerical_columns = [\n", 55 | " c for c in data.columns if data[c].dtype.kind in [\"i\", \"f\"]]\n", 56 | "categorical_columns = [\n", 57 | " c for c in data.columns if data[c].dtype.kind not in [\"i\", \"f\"]]\n", 58 | "\n", 59 | "categories = [\n", 60 | " data[column].unique() for column in data[categorical_columns]]" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": {}, 66 | "source": [ 67 | "## Reference pipeline (no numerical scaling and integer-coded categories)\n", 68 | "\n", 69 | "First let's time the pipeline we used in the main notebook to serve as a reference:" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": null, 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [ 78 | "%%time\n", 79 | "\n", 80 | "preprocessor = ColumnTransformer([\n", 81 | " ('categorical', OrdinalEncoder(categories=categories),\n", 82 | " categorical_columns),], remainder=\"passthrough\")\n", 83 | "\n", 84 | "model = make_pipeline(preprocessor, HistGradientBoostingClassifier())\n", 85 | "scores = cross_val_score(model, data, target)\n", 86 | "print(f\"The different scores obtained are: \\n{scores}\")\n", 87 | "print(f\"The accuracy is: {scores.mean():.3f} +- {scores.std():.3f}\")" 88 | ] 89 | }, 90 | { 91 | "cell_type": "markdown", 92 | "metadata": {}, 93 | "source": [ 94 | "## Scaling numerical features\n", 95 | "\n", 96 | "Let's write a similar pipeline that also scales the numerical features using `StandardScaler` (or similar):" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": null, 102 | "metadata": {}, 103 | "outputs": [], 104 | "source": [ 105 | "# TODO write me!" 106 | ] 107 | }, 108 | { 109 | "cell_type": "markdown", 110 | "metadata": {}, 111 | "source": [ 112 | "## One-hot encoding of categorical variables\n", 113 | "\n", 114 | "For linear models, we have observed that integer coding of categorical\n", 115 | "variables can be very detrimental. However for\n", 116 | "`HistGradientBoostingClassifier` models, it does not seem to be the\n", 117 | "case as the cross-validation of the reference pipeline with\n", 118 | "`OrdinalEncoder` is good.\n", 119 | "\n", 120 | "Let's see if we can get an even better accuracy with `OneHotEncoding`.\n", 121 | "\n", 122 | "Hint: `HistGradientBoostingClassifier` does not yet support sparse input data. 
You might want to use\n", 123 | "`OneHotEncoder(handle_unknown=\"ignore\", sparse=False)` to force the use a dense representation as a workaround." 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": null, 129 | "metadata": {}, 130 | "outputs": [], 131 | "source": [ 132 | "# TODO: write me!" 133 | ] 134 | } 135 | ], 136 | "metadata": { 137 | "jupytext": { 138 | "formats": "python_scripts//py:percent,notebooks//ipynb" 139 | }, 140 | "kernelspec": { 141 | "display_name": "Python 3", 142 | "language": "python", 143 | "name": "python3" 144 | } 145 | }, 146 | "nbformat": 4, 147 | "nbformat_minor": 2 148 | } 149 | -------------------------------------------------------------------------------- /Day_1_Scientific_Python/data/inflammation-01.csv: -------------------------------------------------------------------------------- 1 | 0,0,1,3,1,2,4,7,8,3,3,3,10,5,7,4,7,7,12,18,6,13,11,11,7,7,4,6,8,8,4,4,5,7,3,4,2,3,0,0 2 | 0,1,2,1,2,1,3,2,2,6,10,11,5,9,4,4,7,16,8,6,18,4,12,5,12,7,11,5,11,3,3,5,4,4,5,5,1,1,0,1 3 | 0,1,1,3,3,2,6,2,5,9,5,7,4,5,4,15,5,11,9,10,19,14,12,17,7,12,11,7,4,2,10,5,4,2,2,3,2,2,1,1 4 | 0,0,2,0,4,2,2,1,6,7,10,7,9,13,8,8,15,10,10,7,17,4,4,7,6,15,6,4,9,11,3,5,6,3,3,4,2,3,2,1 5 | 0,1,1,3,3,1,3,5,2,4,4,7,6,5,3,10,8,10,6,17,9,14,9,7,13,9,12,6,7,7,9,6,3,2,2,4,2,0,1,1 6 | 0,0,1,2,2,4,2,1,6,4,7,6,6,9,9,15,4,16,18,12,12,5,18,9,5,3,10,3,12,7,8,4,7,3,5,4,4,3,2,1 7 | 0,0,2,2,4,2,2,5,5,8,6,5,11,9,4,13,5,12,10,6,9,17,15,8,9,3,13,7,8,2,8,8,4,2,3,5,4,1,1,1 8 | 0,0,1,2,3,1,2,3,5,3,7,8,8,5,10,9,15,11,18,19,20,8,5,13,15,10,6,10,6,7,4,9,3,5,2,5,3,2,2,1 9 | 0,0,0,3,1,5,6,5,5,8,2,4,11,12,10,11,9,10,17,11,6,16,12,6,8,14,6,13,10,11,4,6,4,7,6,3,2,1,0,0 10 | 0,1,1,2,1,3,5,3,5,8,6,8,12,5,13,6,13,8,16,8,18,15,16,14,12,7,3,8,9,11,2,5,4,5,1,4,1,2,0,0 11 | 0,1,0,0,4,3,3,5,5,4,5,8,7,10,13,3,7,13,15,18,8,15,15,16,11,14,12,4,10,10,4,3,4,5,5,3,3,2,2,1 12 | 0,1,0,0,3,4,2,7,8,5,2,8,11,5,5,8,14,11,6,11,9,16,18,6,12,5,4,3,5,7,8,3,5,4,5,5,4,0,1,1 13 | 0,0,2,1,4,3,6,4,6,7,9,9,3,11,6,12,4,17,13,15,13,12,8,7,4,7,12,9,5,6,5,4,7,3,5,4,2,3,0,1 14 | 0,0,0,0,1,3,1,6,6,5,5,6,3,6,13,3,10,13,9,16,15,9,11,4,6,4,11,11,12,3,5,8,7,4,6,4,1,3,0,0 15 | 0,1,2,1,1,1,4,1,5,2,3,3,10,7,13,5,7,17,6,9,12,13,10,4,12,4,6,7,6,10,8,2,5,1,3,4,2,0,2,0 16 | 0,1,1,0,1,2,4,3,6,4,7,5,5,7,5,10,7,8,18,17,9,8,12,11,11,11,14,6,11,2,10,9,5,6,5,3,4,2,2,0 17 | 0,0,0,0,2,3,6,5,7,4,3,2,10,7,9,11,12,5,12,9,13,19,14,17,5,13,8,11,5,10,9,8,7,5,3,1,4,0,2,1 18 | 0,0,0,1,2,1,4,3,6,7,4,2,12,6,12,4,14,7,8,14,13,19,6,9,12,6,4,13,6,7,2,3,6,5,4,2,3,0,1,0 19 | 0,0,2,1,2,5,4,2,7,8,4,7,11,9,8,11,15,17,11,12,7,12,7,6,7,4,13,5,7,6,6,9,2,1,1,2,2,0,1,0 20 | 0,1,2,0,1,4,3,2,2,7,3,3,12,13,11,13,6,5,9,16,9,19,16,11,8,9,14,12,11,9,6,6,6,1,1,2,4,3,1,1 21 | 0,1,1,3,1,4,4,1,8,2,2,3,12,12,10,15,13,6,5,5,18,19,9,6,11,12,7,6,3,6,3,2,4,3,1,5,4,2,2,0 22 | 0,0,2,3,2,3,2,6,3,8,7,4,6,6,9,5,12,12,8,5,12,10,16,7,14,12,5,4,6,9,8,5,6,6,1,4,3,0,2,0 23 | 0,0,0,3,4,5,1,7,7,8,2,5,12,4,10,14,5,5,17,13,16,15,13,6,12,9,10,3,3,7,4,4,8,2,6,5,1,0,1,0 24 | 0,1,1,1,1,3,3,2,6,3,9,7,8,8,4,13,7,14,11,15,14,13,5,13,7,14,9,10,5,11,5,3,5,1,1,4,4,1,2,0 25 | 0,1,1,1,2,3,5,3,6,3,7,10,3,8,12,4,12,9,15,5,17,16,5,10,10,15,7,5,3,11,5,5,6,1,1,1,1,0,2,1 26 | 0,0,2,1,3,3,2,7,4,4,3,8,12,9,12,9,5,16,8,17,7,11,14,7,13,11,7,12,12,7,8,5,7,2,2,4,1,1,1,0 27 | 0,0,1,2,4,2,2,3,5,7,10,5,5,12,3,13,4,13,7,15,9,12,18,14,16,12,3,11,3,2,7,4,8,2,2,1,3,0,1,1 28 | 0,0,1,1,1,5,1,5,2,2,4,10,4,8,14,6,15,6,12,15,15,13,7,17,4,5,11,4,8,7,9,4,5,3,2,5,4,3,2,1 29 | 
0,0,2,2,3,4,6,3,7,6,4,5,8,4,7,7,6,11,12,19,20,18,9,5,4,7,14,8,4,3,7,7,8,3,5,4,1,3,1,0 30 | 0,0,0,1,4,4,6,3,8,6,4,10,12,3,3,6,8,7,17,16,14,15,17,4,14,13,4,4,12,11,6,9,5,5,2,5,2,1,0,1 31 | 0,1,1,0,3,2,4,6,8,6,2,3,11,3,14,14,12,8,8,16,13,7,6,9,15,7,6,4,10,8,10,4,2,6,5,5,2,3,2,1 32 | 0,0,2,3,3,4,5,3,6,7,10,5,10,13,14,3,8,10,9,9,19,15,15,6,8,8,11,5,5,7,3,6,6,4,5,2,2,3,0,0 33 | 0,1,2,2,2,3,6,6,6,7,6,3,11,12,13,15,15,10,14,11,11,8,6,12,10,5,12,7,7,11,5,8,5,2,5,5,2,0,2,1 34 | 0,0,2,1,3,5,6,7,5,8,9,3,12,10,12,4,12,9,13,10,10,6,10,11,4,15,13,7,3,4,2,9,7,2,4,2,1,2,1,1 35 | 0,0,1,2,4,1,5,5,2,3,4,8,8,12,5,15,9,17,7,19,14,18,12,17,14,4,13,13,8,11,5,6,6,2,3,5,2,1,1,1 36 | 0,0,0,3,1,3,6,4,3,4,8,3,4,8,3,11,5,7,10,5,15,9,16,17,16,3,8,9,8,3,3,9,5,1,6,5,4,2,2,0 37 | 0,1,2,2,2,5,5,1,4,6,3,6,5,9,6,7,4,7,16,7,16,13,9,16,12,6,7,9,10,3,6,4,5,4,6,3,4,3,2,1 38 | 0,1,1,2,3,1,5,1,2,2,5,7,6,6,5,10,6,7,17,13,15,16,17,14,4,4,10,10,10,11,9,9,5,4,4,2,1,0,1,0 39 | 0,1,0,3,2,4,1,1,5,9,10,7,12,10,9,15,12,13,13,6,19,9,10,6,13,5,13,6,7,2,5,5,2,1,1,1,1,3,0,1 40 | 0,1,1,3,1,1,5,5,3,7,2,2,3,12,4,6,8,15,16,16,15,4,14,5,13,10,7,10,6,3,2,3,6,3,3,5,4,3,2,1 41 | 0,0,0,2,2,1,3,4,5,5,6,5,5,12,13,5,7,5,11,15,18,7,9,10,14,12,11,9,10,3,2,9,6,2,2,5,3,0,0,1 42 | 0,0,1,3,3,1,2,1,8,9,2,8,10,3,8,6,10,13,11,17,19,6,4,11,6,12,7,5,5,4,4,8,2,6,6,4,2,2,0,0 43 | 0,1,1,3,4,5,2,1,3,7,9,6,10,5,8,15,11,12,15,6,12,16,6,4,14,3,12,9,6,11,5,8,5,5,6,1,2,1,2,0 44 | 0,0,1,3,1,4,3,6,7,8,5,7,11,3,6,11,6,10,6,19,18,14,6,10,7,9,8,5,8,3,10,2,5,1,5,4,2,1,0,1 45 | 0,1,1,3,3,4,4,6,3,4,9,9,7,6,8,15,12,15,6,11,6,18,5,14,15,12,9,8,3,6,10,6,8,7,2,5,4,3,1,1 46 | 0,1,2,2,4,3,1,4,8,9,5,10,10,3,4,6,7,11,16,6,14,9,11,10,10,7,10,8,8,4,5,8,4,4,5,2,4,1,1,0 47 | 0,0,2,3,4,5,4,6,2,9,7,4,9,10,8,11,16,12,15,17,19,10,18,13,15,11,8,4,7,11,6,7,6,5,1,3,1,0,0,0 48 | 0,1,1,3,1,4,6,2,8,2,10,3,11,9,13,15,5,15,6,10,10,5,14,15,12,7,4,5,11,4,6,9,5,6,1,1,2,1,2,1 49 | 0,0,1,3,2,5,1,2,7,6,6,3,12,9,4,14,4,6,12,9,12,7,11,7,16,8,13,6,7,6,10,7,6,3,1,5,4,3,0,0 50 | 0,0,1,2,3,4,5,7,5,4,10,5,12,12,5,4,7,9,18,16,16,10,15,15,10,4,3,7,5,9,4,6,2,4,1,4,2,2,2,1 51 | 0,1,2,1,1,3,5,3,6,3,10,10,11,10,13,10,13,6,6,14,5,4,5,5,9,4,12,7,7,4,7,9,3,3,6,3,4,1,2,0 52 | 0,1,2,2,3,5,2,4,5,6,8,3,5,4,3,15,15,12,16,7,20,15,12,8,9,6,12,5,8,3,8,5,4,1,3,2,1,3,1,0 53 | 0,0,0,2,4,4,5,3,3,3,10,4,4,4,14,11,15,13,10,14,11,17,9,11,11,7,10,12,10,10,10,8,7,5,2,2,4,1,2,1 54 | 0,0,2,1,1,4,4,7,2,9,4,10,12,7,6,6,11,12,9,15,15,6,6,13,5,12,9,6,4,7,7,6,5,4,1,4,2,2,2,1 55 | 0,1,2,1,1,4,5,4,4,5,9,7,10,3,13,13,8,9,17,16,16,15,12,13,5,12,10,9,11,9,4,5,5,2,2,5,1,0,0,1 56 | 0,0,1,3,2,3,6,4,5,7,2,4,11,11,3,8,8,16,5,13,16,5,8,8,6,9,10,10,9,3,3,5,3,5,4,5,3,3,0,1 57 | 0,1,1,2,2,5,1,7,4,2,5,5,4,6,6,4,16,11,14,16,14,14,8,17,4,14,13,7,6,3,7,7,5,6,3,4,2,2,1,1 58 | 0,1,1,1,4,1,6,4,6,3,6,5,6,4,14,13,13,9,12,19,9,10,15,10,9,10,10,7,5,6,8,6,6,4,3,5,2,1,1,1 59 | 0,0,0,1,4,5,6,3,8,7,9,10,8,6,5,12,15,5,10,5,8,13,18,17,14,9,13,4,10,11,10,8,8,6,5,5,2,0,2,0 60 | 0,0,1,0,3,2,5,4,8,2,9,3,3,10,12,9,14,11,13,8,6,18,11,9,13,11,8,5,5,2,8,5,3,5,4,1,3,1,1,0 61 | -------------------------------------------------------------------------------- /Day_2_Machine_Learning_Python/03_basic_preprocessing_categorical_variables_exercise_01_solution.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Solution for Exercise 02\n", 8 | "\n", 9 | "The goal of this exercise is to evalutate the impact of using an arbitrary\n", 10 | "integer encoding for 
 categorical variables along with a linear\n", 11 | "classification model such as Logistic Regression.\n", 12 | "\n", 13 | "To do so, let's try to use `OrdinalEncoder` to preprocess the categorical\n", 14 | "variables. This preprocessor is assembled in a pipeline with\n", 15 | "`LogisticRegression`. The performance of the pipeline can be evaluated as\n", 16 | "usual by cross-validation and then compared to the score obtained when using\n", 17 | "`OneHotEncoding` or to some other baseline score.\n", 18 | "\n", 19 | "Because `OrdinalEncoder` can raise errors if it sees an unknown category at\n", 20 | "prediction time, we need to pre-compute the list of all possible categories\n", 21 | "ahead of time:\n", 22 | "\n", 23 | "```python\n", 24 | "categories = [data[column].unique()\n", 25 | "              for column in data[categorical_columns]]\n", 26 | "OrdinalEncoder(categories=categories)\n", 27 | "```" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "import pandas as pd\n", 37 | "\n", 38 | "df = pd.read_csv(\n", 39 | "    \"https://www.openml.org/data/get_csv/1595261/adult-census.csv\")\n", 40 | "\n", 41 | "# Or use the local copy:\n", 42 | "# df = pd.read_csv('../datasets/adult-census.csv')" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "target_name = \"class\"\n", 52 | "target = df[target_name].to_numpy()\n", 53 | "data = df.drop(columns=[target_name, \"fnlwgt\"])\n", 54 | "categorical_columns = [\n", 55 | "    c for c in data.columns if data[c].dtype.kind not in [\"i\", \"f\"]]\n", 56 | "data_categorical = data[categorical_columns]" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "categories = [\n", 66 | "    data[column].unique() for column in data[categorical_columns]]\n", 67 | "\n", 68 | "categories" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": null, 74 | "metadata": {}, 75 | "outputs": [], 76 | "source": [ 77 | "from sklearn.model_selection import cross_val_score\n", 78 | "from sklearn.pipeline import make_pipeline\n", 79 | "from sklearn.preprocessing import OrdinalEncoder\n", 80 | "from sklearn.linear_model import LogisticRegression\n", 81 | "\n", 82 | "model = make_pipeline(\n", 83 | "    OrdinalEncoder(categories=categories),\n", 84 | "    LogisticRegression(solver='lbfgs', max_iter=1000))\n", 85 | "scores = cross_val_score(model, data_categorical, target)\n", 86 | "print(f\"The different scores obtained are: \\n{scores}\")" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": null, 92 | "metadata": {}, 93 | "outputs": [], 94 | "source": [ 95 | "print(f\"The accuracy is: {scores.mean():.3f} +- {scores.std():.3f}\")" 96 | ] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": {}, 101 | "source": [ 102 | "Using an arbitrary mapping from string labels to integers as done here causes the linear model to make bad assumptions on the relative ordering of categories.\n", 103 | "\n", 104 | "This prevents the model from learning anything predictive, and the cross-validated score is even lower than the baseline we obtained by ignoring the input data and just always predicting the most frequent class:" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "metadata": {}, 111 | "outputs": [], 112 | "source": [ 113 | "from sklearn.dummy import 
DummyClassifier\n", 114 | "\n", 115 | "scores = cross_val_score(DummyClassifier(strategy=\"most_frequent\"),\n", 116 | " data_categorical, target)\n", 117 | "print(f\"The different scores obtained are: \\n{scores}\")\n", 118 | "print(f\"The accuracy is: {scores.mean():.3f} +- {scores.std():.3f}\")" 119 | ] 120 | }, 121 | { 122 | "cell_type": "markdown", 123 | "metadata": {}, 124 | "source": [ 125 | "By comparison, a categorical encoding that does not assume any ordering in the\n", 126 | "categories can lead to a significantly higher score:" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": null, 132 | "metadata": {}, 133 | "outputs": [], 134 | "source": [ 135 | "from sklearn.preprocessing import OneHotEncoder\n", 136 | "\n", 137 | "model = make_pipeline(\n", 138 | " OneHotEncoder(handle_unknown=\"ignore\"),\n", 139 | " LogisticRegression(solver='lbfgs', max_iter=1000))\n", 140 | "scores = cross_val_score(model, data_categorical, target)\n", 141 | "print(f\"The different scores obtained are: \\n{scores}\")\n", 142 | "print(f\"The accuracy is: {scores.mean():.3f} +- {scores.std():.3f}\")" 143 | ] 144 | } 145 | ], 146 | "metadata": { 147 | "jupytext": { 148 | "formats": "python_scripts//py:percent,notebooks//ipynb" 149 | }, 150 | "kernelspec": { 151 | "display_name": "Python 3", 152 | "language": "python", 153 | "name": "python3" 154 | } 155 | }, 156 | "nbformat": 4, 157 | "nbformat_minor": 2 158 | } 159 | -------------------------------------------------------------------------------- /Day_1_Scientific_Python/numpy_with_answers/numpys/numpy_intro.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Introduction to numpy" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "__numpy array__
\n", 15 | "NumPy array is a data container. It is similar to Python lists, but it’s specialised for working on numerical data. NumPy is at the center of scientific Python ecosystem and it is a work-horse of many scientific libraries including scikit-learn, scikit-image, matplotlib, SciPy.\n", 16 | "\n", 17 | "In general you should use this library if you want to do fancy things with **numbers**, especially if you have **matrices** or **arrays.**
\n", 18 | "\n", 19 | "To use NumPy we need to start python interpreter and import numpy package:" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": null, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "import numpy as np" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "Let's create a simple numpy array" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "x = np.array([2, 1, 5])\n", 45 | "print(x)" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "### Lists vs Numpy arrays\n", 53 | "\n", 54 | "The Python core library provides Lists. A list is the Python equivalent of an array, but it is resizeable and can contain elements of different types.\n", 55 | "\n", 56 | "Pros of an array:\n", 57 | " - **Size** - Numpy data structures take up less space\n", 58 | " - **Performance** - faster than lists\n", 59 | " - **Functionality** - SciPy and NumPy have optimized functions such as linear algebra operations built in." 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": null, 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "L = range(1000)\n", 69 | "%timeit [i**2 for i in L]" 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": {}, 75 | "source": [ 76 | "__np.arange__ works like Python built-in range, but it returns an array; " 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": null, 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [ 85 | "np.arange(5)" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": null, 91 | "metadata": {}, 92 | "outputs": [], 93 | "source": [ 94 | "a = np.arange(1000)\n", 95 | "%timeit a**2" 96 | ] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": {}, 101 | "source": [ 102 | "### Memory layout\n", 103 | "\n", 104 | "NumPy array is just a memory block with extra information how to interpret its contents. 
\n", 105 | "\n", 106 | "To construct an array with pre-defined elements we can also use one of the built-in helper functions:" 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "metadata": {}, 112 | "source": [ 113 | "__np.ones__ and __np.zeros__ return arrays of 0s or 1s; " 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": null, 119 | "metadata": {}, 120 | "outputs": [], 121 | "source": [ 122 | "np.ones(5)" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": null, 128 | "metadata": {}, 129 | "outputs": [], 130 | "source": [ 131 | "np.zeros(5)" 132 | ] 133 | }, 134 | { 135 | "cell_type": "markdown", 136 | "metadata": {}, 137 | "source": [ 138 | "__np.random.rand__ creates an array of random numbers from an interval [0, 1]:" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": null, 144 | "metadata": {}, 145 | "outputs": [], 146 | "source": [ 147 | "np.random.rand(5)" 148 | ] 149 | }, 150 | { 151 | "cell_type": "markdown", 152 | "metadata": {}, 153 | "source": [ 154 | "We can also construct a two- or more dimensional arrays:" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": null, 160 | "metadata": {}, 161 | "outputs": [], 162 | "source": [ 163 | "np.array([[1, 2], [5, 6]])" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": null, 169 | "metadata": {}, 170 | "outputs": [], 171 | "source": [ 172 | "np.ones((2, 2))" 173 | ] 174 | }, 175 | { 176 | "cell_type": "markdown", 177 | "metadata": {}, 178 | "source": [ 179 | "Alternatively, a n-dimensional array can be obtained by reshaping a 1-D array:" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": null, 185 | "metadata": {}, 186 | "outputs": [], 187 | "source": [ 188 | "a = np.arange(12)\n", 189 | "a.reshape((4,3))" 190 | ] 191 | }, 192 | { 193 | "cell_type": "markdown", 194 | "metadata": {}, 195 | "source": [ 196 | "[Previous: Index](../01-numpy-introduction.ipynb)
[Next: Working with a dataset](dataset_intro.ipynb)" 197 | ] 198 | } 199 | ], 200 | "metadata": { 201 | "kernelspec": { 202 | "display_name": "Python 3", 203 | "language": "python", 204 | "name": "python3" 205 | }, 206 | "language_info": { 207 | "codemirror_mode": { 208 | "name": "ipython", 209 | "version": 3 210 | }, 211 | "file_extension": ".py", 212 | "mimetype": "text/x-python", 213 | "name": "python", 214 | "nbconvert_exporter": "python", 215 | "pygments_lexer": "ipython3", 216 | "version": "3.7.2" 217 | } 218 | }, 219 | "nbformat": 4, 220 | "nbformat_minor": 2 221 | } 222 | -------------------------------------------------------------------------------- /Day_1_Scientific_Python/numpys/numpy_intro.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Introduction to numpy" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "__numpy array__
\n", 15 | "NumPy array is a data container. It is similar to Python lists, but it’s specialised for working on numerical data. NumPy is at the center of scientific Python ecosystem and it is a work-horse of many scientific libraries including scikit-learn, scikit-image, matplotlib, SciPy.\n", 16 | "\n", 17 | "In general you should use this library if you want to do fancy things with **numbers**, especially if you have **matrices** or **arrays.**
\n", 18 | "\n", 19 | "To use NumPy we need to start python interpreter and import numpy package:" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": null, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "import numpy as np" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "Let's create a simple numpy array" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "x = np.array([2, 1, 5])\n", 45 | "print(x)" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "### Lists vs Numpy arrays\n", 53 | "\n", 54 | "The Python core library provides Lists. A list is the Python equivalent of an array, but it is resizeable and can contain elements of different types.\n", 55 | "\n", 56 | "Pros of an array:\n", 57 | " - **Size** - Numpy data structures take up less space\n", 58 | " - **Performance** - faster than lists\n", 59 | " - **Functionality** - SciPy and NumPy have optimized functions such as linear algebra operations built in." 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": null, 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "L = range(1000)\n", 69 | "%timeit [i**2 for i in L]" 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": {}, 75 | "source": [ 76 | "__np.arange__ works like Python built-in range, but it returns an array; " 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": null, 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [ 85 | "np.arange(5)" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": null, 91 | "metadata": { 92 | "scrolled": true 93 | }, 94 | "outputs": [], 95 | "source": [ 96 | "a = np.arange(1000)\n", 97 | "%timeit a**2" 98 | ] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "metadata": {}, 103 | "source": [ 104 | "### Memory layout\n", 105 | "\n", 106 | "NumPy array is just a memory block with extra information how to interpret its contents. 
\n", 107 | "\n", 108 | "To construct an array with pre-defined elements we can also use other built-in helper functions:" 109 | ] 110 | }, 111 | { 112 | "cell_type": "markdown", 113 | "metadata": {}, 114 | "source": [ 115 | "__np.ones__ and __np.zeros__ return arrays of 0s or 1s; " 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": null, 121 | "metadata": {}, 122 | "outputs": [], 123 | "source": [ 124 | "np.ones(5)" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": null, 130 | "metadata": {}, 131 | "outputs": [], 132 | "source": [ 133 | "np.zeros(5)" 134 | ] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "metadata": {}, 139 | "source": [ 140 | "__np.random.rand__ creates an array of random numbers from an interval [0, 1]:" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "metadata": {}, 147 | "outputs": [], 148 | "source": [ 149 | "np.random.rand(5)" 150 | ] 151 | }, 152 | { 153 | "cell_type": "markdown", 154 | "metadata": {}, 155 | "source": [ 156 | "We can also construct a two- or more dimensional arrays:" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": null, 162 | "metadata": {}, 163 | "outputs": [], 164 | "source": [ 165 | "np.array([[1, 2], [5, 6]])" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": null, 171 | "metadata": {}, 172 | "outputs": [], 173 | "source": [ 174 | "np.ones((2, 2))" 175 | ] 176 | }, 177 | { 178 | "cell_type": "markdown", 179 | "metadata": {}, 180 | "source": [ 181 | "Alternatively, a n-dimensional array can be obtained by reshaping a 1-D array:" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": null, 187 | "metadata": {}, 188 | "outputs": [], 189 | "source": [ 190 | "a = np.arange(12)\n", 191 | "a.reshape((4,3))" 192 | ] 193 | }, 194 | { 195 | "cell_type": "markdown", 196 | "metadata": {}, 197 | "source": [ 198 | "[Previous: Index](../01-numpy-introduction.ipynb)
[Next: Working with a dataset](dataset_intro.ipynb)" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": null, 204 | "metadata": {}, 205 | "outputs": [], 206 | "source": [] 207 | } 208 | ], 209 | "metadata": { 210 | "kernelspec": { 211 | "display_name": "Python 3", 212 | "language": "python", 213 | "name": "python3" 214 | }, 215 | "language_info": { 216 | "codemirror_mode": { 217 | "name": "ipython", 218 | "version": 3 219 | }, 220 | "file_extension": ".py", 221 | "mimetype": "text/x-python", 222 | "name": "python", 223 | "nbconvert_exporter": "python", 224 | "pygments_lexer": "ipython3", 225 | "version": "3.7.2" 226 | } 227 | }, 228 | "nbformat": 4, 229 | "nbformat_minor": 2 230 | } 231 | -------------------------------------------------------------------------------- /figures/polynomial_overfit_0.svg: -------------------------------------------------------------------------------- 1 | 2 | 4 | 5 | 6 | 7 | 10 | 11 | 12 | 13 | 19 | 20 | 21 | 22 | 23 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | -------------------------------------------------------------------------------- /Day_1_Scientific_Python/numpys/operations.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Operations\n", 8 | "\n", 9 | "Arrays also know how to perform common mathematical operations on their values. The simplest operations with data are arithmetic: addition, subtraction, multiplication, and division. When you do such operations on arrays, the operation is done element-by-element.
" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import numpy as np" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "data = np.loadtxt(fname='../data/inflammation-01.csv', delimiter=',')" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "doubledata = data + data\n", 37 | "print(doubledata)" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": {}, 43 | "source": [ 44 | "Operations by scalar:" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "tripledata = data * 3\n", 54 | "print(tripledata)" 55 | ] 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "metadata": {}, 60 | "source": [ 61 | "One of the advantages of NumPy is that it allows to apply functions (called ufuncs) to all elements of an array without the need of `for` loops:" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": null, 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "np.sin(data[0,:])" 71 | ] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "metadata": {}, 76 | "source": [ 77 | "This is not only convenient but also more efficient than iterating through the elements using for loops." 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": null, 83 | "metadata": {}, 84 | "outputs": [], 85 | "source": [ 86 | "expdata = np.exp(data)\n", 87 | "print(expdata)" 88 | ] 89 | }, 90 | { 91 | "cell_type": "markdown", 92 | "metadata": {}, 93 | "source": [ 94 | "**Warning** Standard Python installation also includes the `math` library, but it does not play nicely with NumPy array and it may give different results than numpy function, so avoid using it with NumPy arrays.\n", 95 | "\n", 96 | "Some functions (such as mean, max, etc.) aggregate the data return arrays of less dimensions or scalars:" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": null, 102 | "metadata": {}, 103 | "outputs": [], 104 | "source": [ 105 | "sumdata = np.sum(data)\n", 106 | "print('sum data: {}'.format(sumdata))" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": null, 112 | "metadata": {}, 113 | "outputs": [], 114 | "source": [ 115 | "meandata = np.mean(data)\n", 116 | "print(meandata)" 117 | ] 118 | }, 119 | { 120 | "cell_type": "markdown", 121 | "metadata": {}, 122 | "source": [ 123 | "It's also possbile to average over a single axis:" 124 | ] 125 | }, 126 | { 127 | "cell_type": "markdown", 128 | "metadata": {}, 129 | "source": [ 130 | "## Axis\n", 131 | "\"drawing\"" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": null, 137 | "metadata": {}, 138 | "outputs": [], 139 | "source": [ 140 | "np.mean(data, 0)" 141 | ] 142 | }, 143 | { 144 | "cell_type": "markdown", 145 | "metadata": {}, 146 | "source": [ 147 | "## $\\color{green}{\\text{Excercise}}$\n", 148 | "\n", 149 | "Create array `a` with random elements of shape 1000 x 3.\n", 150 | "Select the second and third column (index 1 and 2) and calculate the mean for each of the columns (i.e. 
your answer should be an array with two elements)" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": null, 156 | "metadata": {}, 157 | "outputs": [], 158 | "source": [] 159 | }, 160 | { 161 | "cell_type": "markdown", 162 | "metadata": {}, 163 | "source": [ 164 | "## $\\color{green}{\\text{Excercise}}$\n", 165 | "\n", 166 | "Generate a 10 x 3 array of random numbers (using np.random.rand). From each row, find the column index of the element closest to 0.75. Make use of np.abs and np.argmin. The result should be a one-dimensional array of integers from 0 to 2." 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": null, 172 | "metadata": {}, 173 | "outputs": [], 174 | "source": [] 175 | }, 176 | { 177 | "cell_type": "markdown", 178 | "metadata": {}, 179 | "source": [ 180 | "## $\\color{green}{\\text{Excercise}}$ operations\n", 181 | "\n", 182 | "Average the inflammation data over the first ten patients (rows) and plot them across time (columns). Then repeat it for the next ten patients and so on. Try putting all averages on a single plot" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": null, 188 | "metadata": {}, 189 | "outputs": [], 190 | "source": [ 191 | "\n", 192 | " " 193 | ] 194 | }, 195 | { 196 | "cell_type": "markdown", 197 | "metadata": {}, 198 | "source": [ 199 | "[Previous: Slices](slices.ipynb)
[Next: Stacking](stacking.ipynb) " 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": null, 205 | "metadata": {}, 206 | "outputs": [], 207 | "source": [] 208 | } 209 | ], 210 | "metadata": { 211 | "kernelspec": { 212 | "display_name": "Python 3", 213 | "language": "python", 214 | "name": "python3" 215 | }, 216 | "language_info": { 217 | "codemirror_mode": { 218 | "name": "ipython", 219 | "version": 3 220 | }, 221 | "file_extension": ".py", 222 | "mimetype": "text/x-python", 223 | "name": "python", 224 | "nbconvert_exporter": "python", 225 | "pygments_lexer": "ipython3", 226 | "version": "3.7.2" 227 | } 228 | }, 229 | "nbformat": 4, 230 | "nbformat_minor": 2 231 | } 232 | -------------------------------------------------------------------------------- /img/webfont-ubuntu-mono-400-700-400italic.css: -------------------------------------------------------------------------------- 1 | /* cyrillic-ext */ 2 | @font-face { 3 | font-family: 'Ubuntu Mono'; 4 | font-style: normal; 5 | font-weight: 400; 6 | src: local('Ubuntu Mono'), local('UbuntuMono-Regular'), url(https://fonts.gstatic.com/s/ubuntumono/v6/ViZhet7Ak-LRXZMXzuAfkTTOQ_MqJVwkKsUn0wKzc2I.woff2) format('woff2'); 7 | unicode-range: U+0460-052F, U+20B4, U+2DE0-2DFF, U+A640-A69F; 8 | } 9 | /* cyrillic */ 10 | @font-face { 11 | font-family: 'Ubuntu Mono'; 12 | font-style: normal; 13 | font-weight: 400; 14 | src: local('Ubuntu Mono'), local('UbuntuMono-Regular'), url(https://fonts.gstatic.com/s/ubuntumono/v6/ViZhet7Ak-LRXZMXzuAfkTUj_cnvWIuuBMVgbX098Mw.woff2) format('woff2'); 15 | unicode-range: U+0400-045F, U+0490-0491, U+04B0-04B1, U+2116; 16 | } 17 | /* greek-ext */ 18 | @font-face { 19 | font-family: 'Ubuntu Mono'; 20 | font-style: normal; 21 | font-weight: 400; 22 | src: local('Ubuntu Mono'), local('UbuntuMono-Regular'), url(https://fonts.gstatic.com/s/ubuntumono/v6/ViZhet7Ak-LRXZMXzuAfkUbcKLIaa1LC45dFaAfauRA.woff2) format('woff2'); 23 | unicode-range: U+1F00-1FFF; 24 | } 25 | /* greek */ 26 | @font-face { 27 | font-family: 'Ubuntu Mono'; 28 | font-style: normal; 29 | font-weight: 400; 30 | src: local('Ubuntu Mono'), local('UbuntuMono-Regular'), url(https://fonts.gstatic.com/s/ubuntumono/v6/ViZhet7Ak-LRXZMXzuAfkWo_sUJ8uO4YLWRInS22T3Y.woff2) format('woff2'); 31 | unicode-range: U+0370-03FF; 32 | } 33 | /* latin-ext */ 34 | @font-face { 35 | font-family: 'Ubuntu Mono'; 36 | font-style: normal; 37 | font-weight: 400; 38 | src: local('Ubuntu Mono'), local('UbuntuMono-Regular'), url(https://fonts.gstatic.com/s/ubuntumono/v6/ViZhet7Ak-LRXZMXzuAfkSYE0-AqJ3nfInTTiDXDjU4.woff2) format('woff2'); 39 | unicode-range: U+0100-024F, U+1E00-1EFF, U+20A0-20AB, U+20AD-20CF, U+2C60-2C7F, U+A720-A7FF; 40 | } 41 | /* latin */ 42 | @font-face { 43 | font-family: 'Ubuntu Mono'; 44 | font-style: normal; 45 | font-weight: 400; 46 | src: local('Ubuntu Mono'), local('UbuntuMono-Regular'), url(https://fonts.gstatic.com/s/ubuntumono/v6/ViZhet7Ak-LRXZMXzuAfkY4P5ICox8Kq3LLUNMylGO4.woff2) format('woff2'); 47 | unicode-range: U+0000-00FF, U+0131, U+0152-0153, U+02C6, U+02DA, U+02DC, U+2000-206F, U+2074, U+20AC, U+2212, U+2215, U+E0FF, U+EFFD, U+F000; 48 | } 49 | /* cyrillic-ext */ 50 | @font-face { 51 | font-family: 'Ubuntu Mono'; 52 | font-style: normal; 53 | font-weight: 700; 54 | src: local('Ubuntu Mono Bold'), local('UbuntuMono-Bold'), url(https://fonts.gstatic.com/s/ubuntumono/v6/ceqTZGKHipo8pJj4molytp6iIh_FvlUHQwED9Yt5Kbw.woff2) format('woff2'); 55 | unicode-range: U+0460-052F, U+20B4, U+2DE0-2DFF, U+A640-A69F; 56 | } 57 | /* cyrillic */ 58 
| @font-face { 59 | font-family: 'Ubuntu Mono'; 60 | font-style: normal; 61 | font-weight: 700; 62 | src: local('Ubuntu Mono Bold'), local('UbuntuMono-Bold'), url(https://fonts.gstatic.com/s/ubuntumono/v6/ceqTZGKHipo8pJj4molyti_vZmeiCMnoWNN9rHBYaTc.woff2) format('woff2'); 63 | unicode-range: U+0400-045F, U+0490-0491, U+04B0-04B1, U+2116; 64 | } 65 | /* greek-ext */ 66 | @font-face { 67 | font-family: 'Ubuntu Mono'; 68 | font-style: normal; 69 | font-weight: 700; 70 | src: local('Ubuntu Mono Bold'), local('UbuntuMono-Bold'), url(https://fonts.gstatic.com/s/ubuntumono/v6/ceqTZGKHipo8pJj4molytiFaMxiho_5XQnyRZzQsrZs.woff2) format('woff2'); 71 | unicode-range: U+1F00-1FFF; 72 | } 73 | /* greek */ 74 | @font-face { 75 | font-family: 'Ubuntu Mono'; 76 | font-style: normal; 77 | font-weight: 700; 78 | src: local('Ubuntu Mono Bold'), local('UbuntuMono-Bold'), url(https://fonts.gstatic.com/s/ubuntumono/v6/ceqTZGKHipo8pJj4molytgalQocB-__pDVGhF3uS2Ks.woff2) format('woff2'); 79 | unicode-range: U+0370-03FF; 80 | } 81 | /* latin-ext */ 82 | @font-face { 83 | font-family: 'Ubuntu Mono'; 84 | font-style: normal; 85 | font-weight: 700; 86 | src: local('Ubuntu Mono Bold'), local('UbuntuMono-Bold'), url(https://fonts.gstatic.com/s/ubuntumono/v6/ceqTZGKHipo8pJj4molytujkDdvhIIFj_YMdgqpnSB0.woff2) format('woff2'); 87 | unicode-range: U+0100-024F, U+1E00-1EFF, U+20A0-20AB, U+20AD-20CF, U+2C60-2C7F, U+A720-A7FF; 88 | } 89 | /* latin */ 90 | @font-face { 91 | font-family: 'Ubuntu Mono'; 92 | font-style: normal; 93 | font-weight: 700; 94 | src: local('Ubuntu Mono Bold'), local('UbuntuMono-Bold'), url(https://fonts.gstatic.com/s/ubuntumono/v6/ceqTZGKHipo8pJj4molytolIZu-HDpmDIZMigmsroc4.woff2) format('woff2'); 95 | unicode-range: U+0000-00FF, U+0131, U+0152-0153, U+02C6, U+02DA, U+02DC, U+2000-206F, U+2074, U+20AC, U+2212, U+2215, U+E0FF, U+EFFD, U+F000; 96 | } 97 | /* cyrillic-ext */ 98 | @font-face { 99 | font-family: 'Ubuntu Mono'; 100 | font-style: italic; 101 | font-weight: 400; 102 | src: local('Ubuntu Mono Italic'), local('UbuntuMono-Italic'), url(https://fonts.gstatic.com/s/ubuntumono/v6/KAKuHXAHZOeECOWAHsRKAxNcqx07xvyppV96iFRdwiM.woff2) format('woff2'); 103 | unicode-range: U+0460-052F, U+20B4, U+2DE0-2DFF, U+A640-A69F; 104 | } 105 | /* cyrillic */ 106 | @font-face { 107 | font-family: 'Ubuntu Mono'; 108 | font-style: italic; 109 | font-weight: 400; 110 | src: local('Ubuntu Mono Italic'), local('UbuntuMono-Italic'), url(https://fonts.gstatic.com/s/ubuntumono/v6/KAKuHXAHZOeECOWAHsRKA-fhZE2STYI3KzBGzrJG_ik.woff2) format('woff2'); 111 | unicode-range: U+0400-045F, U+0490-0491, U+04B0-04B1, U+2116; 112 | } 113 | /* greek-ext */ 114 | @font-face { 115 | font-family: 'Ubuntu Mono'; 116 | font-style: italic; 117 | font-weight: 400; 118 | src: local('Ubuntu Mono Italic'), local('UbuntuMono-Italic'), url(https://fonts.gstatic.com/s/ubuntumono/v6/KAKuHXAHZOeECOWAHsRKA26cj8HaeL2jS4NIBPr3RFo.woff2) format('woff2'); 119 | unicode-range: U+1F00-1FFF; 120 | } 121 | /* greek */ 122 | @font-face { 123 | font-family: 'Ubuntu Mono'; 124 | font-style: italic; 125 | font-weight: 400; 126 | src: local('Ubuntu Mono Italic'), local('UbuntuMono-Italic'), url(https://fonts.gstatic.com/s/ubuntumono/v6/KAKuHXAHZOeECOWAHsRKA9cKKn5Xt5n-nnvkqIBMZms.woff2) format('woff2'); 127 | unicode-range: U+0370-03FF; 128 | } 129 | /* latin-ext */ 130 | @font-face { 131 | font-family: 'Ubuntu Mono'; 132 | font-style: italic; 133 | font-weight: 400; 134 | src: local('Ubuntu Mono Italic'), local('UbuntuMono-Italic'), 
url(https://fonts.gstatic.com/s/ubuntumono/v6/KAKuHXAHZOeECOWAHsRKA0_0lycXMw8PhobHtu2Qgco.woff2) format('woff2'); 135 | unicode-range: U+0100-024F, U+1E00-1EFF, U+20A0-20AB, U+20AD-20CF, U+2C60-2C7F, U+A720-A7FF; 136 | } 137 | /* latin */ 138 | @font-face { 139 | font-family: 'Ubuntu Mono'; 140 | font-style: italic; 141 | font-weight: 400; 142 | src: local('Ubuntu Mono Italic'), local('UbuntuMono-Italic'), url(https://fonts.gstatic.com/s/ubuntumono/v6/KAKuHXAHZOeECOWAHsRKA8u2Q0OS-KeTAWjgkS85mDg.woff2) format('woff2'); 143 | unicode-range: U+0000-00FF, U+0131, U+0152-0153, U+02C6, U+02DA, U+02DC, U+2000-206F, U+2074, U+20AC, U+2212, U+2215, U+E0FF, U+EFFD, U+F000; 144 | } 145 | -------------------------------------------------------------------------------- /Day_1_Scientific_Python/numpy_with_answers/numpys/operations.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Operations\n", 8 | "\n", 9 | "Arrays also know how to perform common mathematical operations on their values. The simplest operations with data are arithmetic: addition, subtraction, multiplication, and division. When you do such operations on arrays, the operation is done element-by-element.
" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import numpy as np" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "data = np.loadtxt(fname='../data/inflammation-01.csv', delimiter=',')" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "doubledata = data + data\n", 37 | "print(doubledata)" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": {}, 43 | "source": [ 44 | "Operations by scalar:" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "tripledata = data * 3\n", 54 | "print(tripledata)" 55 | ] 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "metadata": {}, 60 | "source": [ 61 | "One of the advantages of NumPy is that it allows to apply functions (called ufuncs) to all elements of an array without the need of `for` loops:" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": null, 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "np.sin(data[0,:])" 71 | ] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "metadata": {}, 76 | "source": [ 77 | "This is not only convenient but also more efficient than iterating through the elements using for loops." 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": null, 83 | "metadata": {}, 84 | "outputs": [], 85 | "source": [ 86 | "expdata = np.exp(data)\n", 87 | "print(expdata)" 88 | ] 89 | }, 90 | { 91 | "cell_type": "markdown", 92 | "metadata": {}, 93 | "source": [ 94 | "**Warning** Standard Python installation also includes the `math` library, but it does not play nicely with NumPy array and it may give different results than numpy function, so avoid using it with NumPy arrays.\n", 95 | "\n", 96 | "Some functions (such as mean, max, etc.) aggregate the data return arrays of less dimensions or scalars:" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": null, 102 | "metadata": {}, 103 | "outputs": [], 104 | "source": [ 105 | "sumdata = np.sum(data)\n", 106 | "print('sum data: {}'.format(sumdata))" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": null, 112 | "metadata": {}, 113 | "outputs": [], 114 | "source": [ 115 | "meandata = np.mean(data)\n", 116 | "print(meandata)" 117 | ] 118 | }, 119 | { 120 | "cell_type": "markdown", 121 | "metadata": {}, 122 | "source": [ 123 | "It's also possbile to average over a single axis:" 124 | ] 125 | }, 126 | { 127 | "cell_type": "markdown", 128 | "metadata": {}, 129 | "source": [ 130 | "Axis\n", 131 | "\"drawing\"" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": null, 137 | "metadata": {}, 138 | "outputs": [], 139 | "source": [ 140 | "np.mean(data, 0)" 141 | ] 142 | }, 143 | { 144 | "cell_type": "markdown", 145 | "metadata": {}, 146 | "source": [ 147 | "## $\\color{green}{\\text{Excercise}}$\n", 148 | "\n", 149 | "Create array `a` with random elements of shape 1000 x 3.\n", 150 | "Select the second and third column (index 1 and 2) and calculate the mean for each of the columns (i.e. 
your answer should be an array with two elements)" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": null, 156 | "metadata": {}, 157 | "outputs": [], 158 | "source": [ 159 | "a = np.random.randn(1000,3)\n", 160 | "print(a.shape)\n", 161 | "\n", 162 | "a_select = a[:,[1,2]]\n", 163 | "print(a_select.shape)\n", 164 | "\n", 165 | "mean_select = a_select.mean(0)\n", 166 | "print(mean_select)" 167 | ] 168 | }, 169 | { 170 | "cell_type": "markdown", 171 | "metadata": {}, 172 | "source": [ 173 | "## $\\color{green}{\\text{Excercise}}$\n", 174 | "\n", 175 | "Generate a 10 x 3 array of random numbers (using np.random.rand). From each row, find the column index of the element closest to 0.75. Make use of np.abs and np.argmin. The result should be a one-dimensional array of integers from 0 to 2." 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": null, 181 | "metadata": {}, 182 | "outputs": [], 183 | "source": [ 184 | "rand_array = np.random.rand(10,3)\n", 185 | "rand_array2 = rand_array - 0.75\n", 186 | "closest = np.argmin(np.abs(rand_array2),1)\n", 187 | "print(rand_array)\n", 188 | "print(closest)" 189 | ] 190 | }, 191 | { 192 | "cell_type": "markdown", 193 | "metadata": {}, 194 | "source": [ 195 | "## $\\color{green}{\\text{Excercise}}$ operations\n", 196 | "\n", 197 | "Average the inflammation data over the first ten patients (rows) and plot them across time (columns). Then repeat it for the next ten patients and so on. Try putting all averages on a single plot" 198 | ] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "execution_count": null, 203 | "metadata": {}, 204 | "outputs": [], 205 | "source": [ 206 | "import matplotlib.pylab as plt\n", 207 | "%matplotlib inline\n", 208 | "\n", 209 | "pat_0 = 0\n", 210 | "pat_last = 10\n", 211 | "while pat_last <= len(data):\n", 212 | " ave_inflammation = np.mean(data[pat_0:pat_last], axis=0)\n", 213 | " plt.plot(ave_inflammation)\n", 214 | " pat_0 = pat_last\n", 215 | " pat_last += 10\n", 216 | " " 217 | ] 218 | }, 219 | { 220 | "cell_type": "markdown", 221 | "metadata": {}, 222 | "source": [ 223 | "[Previous: Slices](slices.ipynb)
[Next: Stacking](stacking.ipynb) " 224 | ] 225 | } 226 | ], 227 | "metadata": { 228 | "kernelspec": { 229 | "display_name": "Python 3", 230 | "language": "python", 231 | "name": "python3" 232 | }, 233 | "language_info": { 234 | "codemirror_mode": { 235 | "name": "ipython", 236 | "version": 3 237 | }, 238 | "file_extension": ".py", 239 | "mimetype": "text/x-python", 240 | "name": "python", 241 | "nbconvert_exporter": "python", 242 | "pygments_lexer": "ipython3", 243 | "version": "3.7.2" 244 | } 245 | }, 246 | "nbformat": 4, 247 | "nbformat_minor": 2 248 | } 249 | -------------------------------------------------------------------------------- /Day_1_Scientific_Python/numpys/dataset_intro.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Working with a dataset" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "We will use first dataset called `inflammation-01.csv` from the data provided by Software Carpentry: [Analyzing Patient Data](https://swcarpentry.github.io/python-novice-inflammation-2.7/).
\n", 15 | "You should be able to find it in your `data/` directory. " 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "import numpy as np" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "`numpy.loadtxt` has two parameters: the name of the file we want to read and the delimiter that separates values on a line. These both need to be character strings." 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "data = np.loadtxt(fname='../data/inflammation-01.csv', delimiter=',')" 41 | ] 42 | }, 43 | { 44 | "cell_type": "markdown", 45 | "metadata": {}, 46 | "source": [ 47 | "We are studying inflammation in patients who have been given a new treatment for arthritis, and need to analyze the first dozen data sets of their daily inflammation. The data sets are stored in comma-separated values (CSV) format: each row holds information for a single patient, and the columns represent successive days:" 48 | ] 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "metadata": {}, 53 | "source": [ 54 | "### Explore array" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "print(data)" 64 | ] 65 | }, 66 | { 67 | "cell_type": "markdown", 68 | "metadata": {}, 69 | "source": [ 70 | "In contrast to lists NumPy arrays can store elements of pre-determined type only.\n", 71 | "The type function will only tell you that a variable is a NumPy array but won’t tell you the type of thing inside the array. We can find out the type of the data contained in the NumPy array." 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "print(data.dtype)" 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "metadata": {}, 86 | "source": [ 87 | "and the shape of the array" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": null, 93 | "metadata": {}, 94 | "outputs": [], 95 | "source": [ 96 | "print(data.shape)" 97 | ] 98 | }, 99 | { 100 | "cell_type": "markdown", 101 | "metadata": {}, 102 | "source": [ 103 | "meaning that data array variable contains 60 rows and 40 columns" 104 | ] 105 | }, 106 | { 107 | "cell_type": "markdown", 108 | "metadata": {}, 109 | "source": [ 110 | "## Plotting the data" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": null, 116 | "metadata": {}, 117 | "outputs": [], 118 | "source": [ 119 | "import matplotlib.pyplot as plt\n", 120 | "plt.imshow(data)\n", 121 | "plt.show()" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": null, 127 | "metadata": {}, 128 | "outputs": [], 129 | "source": [ 130 | "%matplotlib inline" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": null, 136 | "metadata": {}, 137 | "outputs": [], 138 | "source": [ 139 | "plt.imshow(data)" 140 | ] 141 | }, 142 | { 143 | "cell_type": "markdown", 144 | "metadata": {}, 145 | "source": [ 146 | "## Indexing" 147 | ] 148 | }, 149 | { 150 | "cell_type": "markdown", 151 | "metadata": {}, 152 | "source": [ 153 | "If we want to get a single number from the array, we must provide an index in square brackets after the variable name.
\n", 154 | "Note that the NumPy arrays are zero-indexed:" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": null, 160 | "metadata": {}, 161 | "outputs": [], 162 | "source": [ 163 | "data[0, 0]" 164 | ] 165 | }, 166 | { 167 | "cell_type": "markdown", 168 | "metadata": {}, 169 | "source": [ 170 | "It means that that the third element in the first row has an index of [0, 2]:" 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": null, 176 | "metadata": {}, 177 | "outputs": [], 178 | "source": [ 179 | "data[0, 2]" 180 | ] 181 | }, 182 | { 183 | "cell_type": "markdown", 184 | "metadata": {}, 185 | "source": [ 186 | "We can also assign the element with a new value:" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": null, 192 | "metadata": {}, 193 | "outputs": [], 194 | "source": [ 195 | "data[0, 2] = 100.\n", 196 | "print(data[0, 2])" 197 | ] 198 | }, 199 | { 200 | "cell_type": "markdown", 201 | "metadata": {}, 202 | "source": [ 203 | "NumPy (and Python in general) checks the bounds of the array:" 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": null, 209 | "metadata": {}, 210 | "outputs": [], 211 | "source": [ 212 | "print(data.shape)" 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": null, 218 | "metadata": {}, 219 | "outputs": [], 220 | "source": [ 221 | "#data[60, 0]" 222 | ] 223 | }, 224 | { 225 | "cell_type": "markdown", 226 | "metadata": {}, 227 | "source": [ 228 | "Finally, we can ask for several elements at once:" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": null, 234 | "metadata": {}, 235 | "outputs": [], 236 | "source": [ 237 | "data[0, [0, 10]]" 238 | ] 239 | }, 240 | { 241 | "cell_type": "markdown", 242 | "metadata": {}, 243 | "source": [ 244 | "## $\\color{green}{\\text{Excercise}}$\n", 245 | "\n", 246 | "Create the following array, call it `a`:\n", 247 | "\n", 248 | "a =
\n", 249 | "```\n", 250 | "2 7 12 0\n", 251 | "3 9 3 4\n", 252 | "4 0 1 3\n", 253 | "```\n", 254 | "\n", 255 | "use `a` to assign 4, 1 and 3 from the third row to array `b` \n" 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": null, 261 | "metadata": {}, 262 | "outputs": [], 263 | "source": [] 264 | }, 265 | { 266 | "cell_type": "markdown", 267 | "metadata": {}, 268 | "source": [ 269 | "[Previous: Numpy intro](numpy_intro.ipynb)
[Filtering data](filtering_data.ipynb)" 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": null, 275 | "metadata": {}, 276 | "outputs": [], 277 | "source": [] 278 | } 279 | ], 280 | "metadata": { 281 | "kernelspec": { 282 | "display_name": "Python 3", 283 | "language": "python", 284 | "name": "python3" 285 | }, 286 | "language_info": { 287 | "codemirror_mode": { 288 | "name": "ipython", 289 | "version": 3 290 | }, 291 | "file_extension": ".py", 292 | "mimetype": "text/x-python", 293 | "name": "python", 294 | "nbconvert_exporter": "python", 295 | "pygments_lexer": "ipython3", 296 | "version": "3.7.2" 297 | } 298 | }, 299 | "nbformat": 4, 300 | "nbformat_minor": 2 301 | } 302 | -------------------------------------------------------------------------------- /Day_1_Scientific_Python/numpy_with_answers/numpys/dataset_intro.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Working with a dataset" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "We will use first dataset called `inflammation-01.csv` from the data provided by Software Carpentry: [Analyzing Patient Data](https://swcarpentry.github.io/python-novice-inflammation-2.7/).
\n", 15 | "You should be able to find it in your `data/` directory. " 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "import numpy as np" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "`numpy.loadtxt` has two parameters: the name of the file we want to read and the delimiter that separates values on a line. These both need to be character strings." 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "data = np.loadtxt(fname='../data/inflammation-01.csv', delimiter=',')" 41 | ] 42 | }, 43 | { 44 | "cell_type": "markdown", 45 | "metadata": {}, 46 | "source": [ 47 | "We are studying inflammation in patients who have been given a new treatment for arthritis, and need to analyze the first dozen data sets of their daily inflammation. The data sets are stored in comma-separated values (CSV) format: each row holds information for a single patient, and the columns represent successive days:" 48 | ] 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "metadata": {}, 53 | "source": [ 54 | "### Explore array" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "print(data)" 64 | ] 65 | }, 66 | { 67 | "cell_type": "markdown", 68 | "metadata": {}, 69 | "source": [ 70 | "In contrast to lists NumPy arrays can store elements of pre-determined type only.\n", 71 | "The type function will only tell you that a variable is a NumPy array but won’t tell you the type of thing inside the array. We can find out the type of the data contained in the NumPy array." 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "print(data.dtype)" 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "metadata": {}, 86 | "source": [ 87 | "and the shape of the array" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": null, 93 | "metadata": {}, 94 | "outputs": [], 95 | "source": [ 96 | "print(data.shape)" 97 | ] 98 | }, 99 | { 100 | "cell_type": "markdown", 101 | "metadata": {}, 102 | "source": [ 103 | "meaning that data array variable contains 60 rows and 40 columns" 104 | ] 105 | }, 106 | { 107 | "cell_type": "markdown", 108 | "metadata": {}, 109 | "source": [ 110 | "## Plotting the data" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": null, 116 | "metadata": {}, 117 | "outputs": [], 118 | "source": [ 119 | "import matplotlib.pyplot as plt\n", 120 | "plt.imshow(data)\n", 121 | "plt.show()" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": null, 127 | "metadata": {}, 128 | "outputs": [], 129 | "source": [ 130 | "%matplotlib inline" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": null, 136 | "metadata": {}, 137 | "outputs": [], 138 | "source": [ 139 | "plt.imshow(data)" 140 | ] 141 | }, 142 | { 143 | "cell_type": "markdown", 144 | "metadata": {}, 145 | "source": [ 146 | "## Indexing" 147 | ] 148 | }, 149 | { 150 | "cell_type": "markdown", 151 | "metadata": {}, 152 | "source": [ 153 | "If we want to get a single number from the array, we must provide an index in square brackets after the variable name.
\n", 154 | "Note that the NumPy arrays are zero-indexed:" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": null, 160 | "metadata": {}, 161 | "outputs": [], 162 | "source": [ 163 | "data[0, 0]" 164 | ] 165 | }, 166 | { 167 | "cell_type": "markdown", 168 | "metadata": {}, 169 | "source": [ 170 | "It means that that the third element in the first row has an index of [0, 2]:" 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": null, 176 | "metadata": {}, 177 | "outputs": [], 178 | "source": [ 179 | "data[0, 2]" 180 | ] 181 | }, 182 | { 183 | "cell_type": "markdown", 184 | "metadata": {}, 185 | "source": [ 186 | "We can also assign the element with a new value:" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": null, 192 | "metadata": {}, 193 | "outputs": [], 194 | "source": [ 195 | "data[0, 2] = 100.\n", 196 | "print(data[0, 2])" 197 | ] 198 | }, 199 | { 200 | "cell_type": "markdown", 201 | "metadata": {}, 202 | "source": [ 203 | "NumPy (and Python in general) checks the bounds of the array:" 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": null, 209 | "metadata": {}, 210 | "outputs": [], 211 | "source": [ 212 | "print(data.shape)" 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": null, 218 | "metadata": {}, 219 | "outputs": [], 220 | "source": [ 221 | "#data[60, 0]" 222 | ] 223 | }, 224 | { 225 | "cell_type": "markdown", 226 | "metadata": {}, 227 | "source": [ 228 | "Finally, we can ask for several elements at once:" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": null, 234 | "metadata": {}, 235 | "outputs": [], 236 | "source": [ 237 | "data[0, [0, 10]]" 238 | ] 239 | }, 240 | { 241 | "cell_type": "markdown", 242 | "metadata": {}, 243 | "source": [ 244 | "## $\\color{green}{\\text{Excercise}}$\n", 245 | "\n", 246 | "Create the following array, call it `a`:\n", 247 | "\n", 248 | "a =
\n", 249 | "```\n", 250 | "2 7 12 0\n", 251 | "3 9 3 4\n", 252 | "4 0 1 3\n", 253 | "```\n", 254 | "\n", 255 | "use `a` to assign 4, 1 and 3 from the third row to array `b` \n" 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": null, 261 | "metadata": {}, 262 | "outputs": [], 263 | "source": [ 264 | "a = np.array([[2, 7, 12, 0],[3, 9, 3, 4],[4, 0, 1, 3]])\n", 265 | "print(a)" 266 | ] 267 | }, 268 | { 269 | "cell_type": "code", 270 | "execution_count": null, 271 | "metadata": {}, 272 | "outputs": [], 273 | "source": [ 274 | "b = a[2, [0,2,3]]\n", 275 | "print(b)\n", 276 | "type(b)" 277 | ] 278 | }, 279 | { 280 | "cell_type": "markdown", 281 | "metadata": {}, 282 | "source": [ 283 | "[Previous: Numpy intro](numpy_intro.ipynb)
[Filtering data](filtering_data.ipynb)" 284 | ] 285 | } 286 | ], 287 | "metadata": { 288 | "kernelspec": { 289 | "display_name": "Python 3", 290 | "language": "python", 291 | "name": "python3" 292 | }, 293 | "language_info": { 294 | "codemirror_mode": { 295 | "name": "ipython", 296 | "version": 3 297 | }, 298 | "file_extension": ".py", 299 | "mimetype": "text/x-python", 300 | "name": "python", 301 | "nbconvert_exporter": "python", 302 | "pygments_lexer": "ipython3", 303 | "version": "3.7.2" 304 | } 305 | }, 306 | "nbformat": 4, 307 | "nbformat_minor": 2 308 | } 309 | -------------------------------------------------------------------------------- /Day_2_Machine_Learning_Python/03_basic_preprocessing_categorical_variables_exercise_02_solution.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Solution for Exercise 03\n", 8 | "\n", 9 | "The goal of this exercise is to evaluate the impact of feature preprocessing on a pipeline that uses a decision-tree-based classifier instead of logistic regression.\n", 10 | "\n", 11 | "- The first question is to empirically evaluate whether scaling numerical feature is helpful or not;\n", 12 | "\n", 13 | "- The second question is to evaluate whether it is empirically better (both from a computational and a statistical perspective) to use integer coded or one-hot encoded categories.\n", 14 | "\n", 15 | "\n", 16 | "Hint: `HistGradientBoostingClassifier` does not yet support sparse input data. You might want to use\n", 17 | "`OneHotEncoder(handle_unknown=\"ignore\", sparse=False)` to force the use a dense representation as a workaround." 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": null, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "import pandas as pd\n", 27 | "from sklearn.model_selection import cross_val_score\n", 28 | "from sklearn.pipeline import make_pipeline\n", 29 | "from sklearn.compose import ColumnTransformer\n", 30 | "from sklearn.preprocessing import OrdinalEncoder\n", 31 | "from sklearn.experimental import enable_hist_gradient_boosting\n", 32 | "from sklearn.ensemble import HistGradientBoostingClassifier\n", 33 | "\n", 34 | "df = pd.read_csv(\n", 35 | " \"https://www.openml.org/data/get_csv/1595261/adult-census.csv\")\n", 36 | "\n", 37 | "# Or use the local copy:\n", 38 | "# df = pd.read_csv('../datasets/adult-census.csv')" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "target_name = \"class\"\n", 48 | "target = df[target_name].to_numpy()\n", 49 | "data = df.drop(columns=[target_name, \"fnlwgt\"])" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "numerical_columns = [\n", 59 | " c for c in data.columns if data[c].dtype.kind in [\"i\", \"f\"]]\n", 60 | "categorical_columns = [\n", 61 | " c for c in data.columns if data[c].dtype.kind not in [\"i\", \"f\"]]\n", 62 | "\n", 63 | "categories = [\n", 64 | " data[column].unique() for column in data[categorical_columns]]" 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "metadata": {}, 70 | "source": [ 71 | "## Reference pipeline (no numerical scaling and integer-coded categories)\n", 72 | "\n", 73 | "First let's time the pipeline we used in the main notebook to serve as a reference:" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 
| "execution_count": null, 79 | "metadata": {}, 80 | "outputs": [], 81 | "source": [ 82 | "%%time\n", 83 | "\n", 84 | "preprocessor = ColumnTransformer([\n", 85 | " ('categorical', OrdinalEncoder(categories=categories),\n", 86 | " categorical_columns),], remainder=\"passthrough\")\n", 87 | "\n", 88 | "model = make_pipeline(preprocessor, HistGradientBoostingClassifier())\n", 89 | "scores = cross_val_score(model, data, target)\n", 90 | "print(f\"The different scores obtained are: \\n{scores}\")\n", 91 | "print(f\"The accuracy is: {scores.mean():.3f} +- {scores.std():.3f}\")" 92 | ] 93 | }, 94 | { 95 | "cell_type": "markdown", 96 | "metadata": {}, 97 | "source": [ 98 | "## Scaling numerical features" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": null, 104 | "metadata": {}, 105 | "outputs": [], 106 | "source": [ 107 | "%%time\n", 108 | "from sklearn.preprocessing import StandardScaler\n", 109 | "\n", 110 | "preprocessor = ColumnTransformer([\n", 111 | " ('numerical', StandardScaler(), numerical_columns),\n", 112 | " ('categorical', OrdinalEncoder(categories=categories),\n", 113 | " categorical_columns),])\n", 114 | "\n", 115 | "model = make_pipeline(preprocessor, HistGradientBoostingClassifier())\n", 116 | "scores = cross_val_score(model, data, target)\n", 117 | "print(f\"The different scores obtained are: \\n{scores}\")\n", 118 | "print(f\"The accuracy is: {scores.mean():.3f} +- {scores.std():.3f}\")" 119 | ] 120 | }, 121 | { 122 | "cell_type": "markdown", 123 | "metadata": {}, 124 | "source": [ 125 | "### Analysis\n", 126 | "\n", 127 | "We can observe that both the accuracy and the training time are approximately the same as the reference pipeline (any time difference you might observe is not significant).\n", 128 | "\n", 129 | "Scaling numerical features is indeed useless for most decision tree models in general and for `HistGradientBoostingClassifier` in particular." 130 | ] 131 | }, 132 | { 133 | "cell_type": "markdown", 134 | "metadata": {}, 135 | "source": [ 136 | "## One-hot encoding of categorical variables\n", 137 | "\n", 138 | "For linear models, we have observed that integer coding of categorical\n", 139 | "variables can be very detrimental. 
However for\n", 140 | "`HistGradientBoostingClassifier` models, it does not seem to be the\n", 141 | "case as the cross-validation of the reference pipeline with\n", 142 | "`OrdinalEncoder` is good.\n", 143 | "\n", 144 | "Let's see if we can get an even better accuracy with `OneHotEncoding`:" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": null, 150 | "metadata": {}, 151 | "outputs": [], 152 | "source": [ 153 | "%%time\n", 154 | "from sklearn.preprocessing import OneHotEncoder\n", 155 | "\n", 156 | "preprocessor = ColumnTransformer([\n", 157 | " ('categorical',\n", 158 | " OneHotEncoder(handle_unknown=\"ignore\", sparse=False),\n", 159 | " categorical_columns),], remainder=\"passthrough\")\n", 160 | "\n", 161 | "model = make_pipeline(preprocessor, HistGradientBoostingClassifier())\n", 162 | "scores = cross_val_score(model, data, target)\n", 163 | "print(f\"The different scores obtained are: \\n{scores}\")\n", 164 | "print(f\"The accuracy is: {scores.mean():.3f} +- {scores.std():.3f}\")" 165 | ] 166 | }, 167 | { 168 | "cell_type": "markdown", 169 | "metadata": {}, 170 | "source": [ 171 | "### Analysis\n", 172 | "\n", 173 | "From an accuracy point of view, the result is almost exactly the same.\n", 174 | "The reason is that `HistGradientBoostingClassifier` is expressive\n", 175 | "and robust enough to deal with misleading ordering of integer coded\n", 176 | "categories (which was not the case for linear models).\n", 177 | "\n", 178 | "However from a computation point of view, the training time is\n", 179 | "significantly longer: this is caused by the fact that `OneHotEncoder`\n", 180 | "generates approximately 10 times more features than `OrdinalEncoder`.\n", 181 | "\n", 182 | "Note that the current implementation `HistGradientBoostingClassifier`\n", 183 | "is still incomplete, and once sparse representation are handled\n", 184 | "correctly, training time might improve with such kinds of encodings.\n", 185 | "\n", 186 | "The main take away message is that arbitrary integer coding of\n", 187 | "categories is perfectly fine for `HistGradientBoostingClassifier`\n", 188 | "and yields fast training times." 189 | ] 190 | } 191 | ], 192 | "metadata": { 193 | "jupytext": { 194 | "formats": "python_scripts//py:percent,notebooks//ipynb" 195 | }, 196 | "kernelspec": { 197 | "display_name": "Python 3", 198 | "language": "python", 199 | "name": "python3" 200 | } 201 | }, 202 | "nbformat": 4, 203 | "nbformat_minor": 2 204 | } 205 | --------------------------------------------------------------------------------