├── Day_1_Scientific_Python
├── pandas
│   └── _solutions
│   │   ├── pandas_01_data_structures3.py
│   │   ├── pandas_01_data_structures2.py
│   │   ├── pandas_01_data_structures4.py
│   │   ├── pandas_03a_selecting_data10.py
│   │   ├── pandas_02_basic_operations1.py
│   │   ├── pandas_02_basic_operations5.py
│   │   ├── pandas_04_time_series_data1.py
│   │   ├── pandas_02_basic_operations4.py
│   │   ├── pandas_02_basic_operations6.py
│   │   ├── pandas_02_basic_operations9.py
│   │   ├── pandas_03a_selecting_data5.py
│   │   ├── pandas_01_data_structures5.py
│   │   ├── pandas_02_basic_operations7.py
│   │   ├── pandas_02_basic_operations8.py
│   │   ├── pandas_03a_selecting_data18.py
│   │   ├── pandas_03a_selecting_data4.py
│   │   ├── pandas_04_time_series_data2.py
│   │   ├── pandas_07_reshaping_data3.py
│   │   ├── pandas_02_basic_operations2.py
│   │   ├── pandas_03a_selecting_data11.py
│   │   ├── pandas_03b_indexing6.py
│   │   ├── pandas_03b_indexing7.py
│   │   ├── pandas_06_groupby_operations1.py
│   │   ├── pandas_01_data_structures1.py
│   │   ├── pandas_01_data_structures6.py
│   │   ├── pandas_03a_selecting_data1.py
│   │   ├── pandas_03a_selecting_data12.py
│   │   ├── pandas_03a_selecting_data15.py
│   │   ├── pandas_03a_selecting_data17.py
│   │   ├── pandas_03a_selecting_data19.py
│   │   ├── pandas_03a_selecting_data9.py
│   │   ├── pandas_04_time_series_data3.py
│   │   ├── pandas_04_time_series_data5.py
│   │   ├── pandas_06_groupby_operations18.py
│   │   ├── pandas_06_groupby_operations4.py
│   │   ├── pandas_02_basic_operations3.py
│   │   ├── pandas_03a_selecting_data16.py
│   │   ├── pandas_03a_selecting_data3.py
│   │   ├── pandas_03a_selecting_data6.py
│   │   ├── pandas_06_groupby_operations12.py
│   │   ├── pandas_06_groupby_operations7.py
│   │   ├── pandas_02_basic_operations10.py
│   │   ├── pandas_03a_selecting_data8.py
│   │   ├── pandas_04_time_series_data7.py
│   │   ├── pandas_06_groupby_operations29.py
│   │   ├── pandas_04_time_series_data4.py
│   │   ├── pandas_06_groupby_operations21.py
│   │   ├── pandas_07_reshaping_data5.py
│   │   ├── pandas_03a_selecting_data13.py
│   │   ├── pandas_03a_selecting_data2.py
│   │   ├── pandas_03a_selecting_data7.py
│   │   ├── pandas_03b_indexing2.py
│   │   ├── pandas_07_reshaping_data8.py
│   │   ├── pandas_03a_selecting_data14.py
│   │   ├── pandas_03b_indexing4.py
│   │   ├── pandas_03b_indexing5.py
│   │   ├── pandas_05_combining_datasets.py
│   │   ├── pandas_06_groupby_operations8.py
│   │   ├── pandas_03b_indexing1.py
│   │   ├── pandas_07_reshaping_data9.py
│   │   ├── pandas_04_time_series_data10.py
│   │   ├── pandas_04_time_series_data8.py
│   │   ├── pandas_06_groupby_operations15.py
│   │   ├── pandas_06_groupby_operations16.py
│   │   ├── pandas_06_groupby_operations19.py
│   │   ├── pandas_06_groupby_operations2.py
│   │   ├── pandas_06_groupby_operations28.py
│   │   ├── pandas_06_groupby_operations6.py
│   │   ├── pandas_06_groupby_operations3.py
│   │   ├── pandas_07_reshaping_data12.py
│   │   ├── pandas_03b_indexing3.py
│   │   ├── pandas_06_groupby_operations20.py
│   │   ├── pandas_06_groupby_operations22.py
│   │   ├── pandas_06_groupby_operations13.py
│   │   ├── pandas_06_groupby_operations14.py
│   │   ├── pandas_07_reshaping_data1.py
│   │   ├── pandas_07_reshaping_data4.py
│   │   ├── pandas_07_reshaping_data6.py
│   │   ├── pandas_04_time_series_data9.py
│   │   ├── pandas_06_groupby_operations23.py
│   │   ├── pandas_06_groupby_operations26.py
│   │   ├── pandas_06_groupby_operations30.py
│   │   ├── pandas_06_groupby_operations31.py
│   │   ├── pandas_06_groupby_operations24.py
│   │   ├── pandas_06_groupby_operations11.py
│   │   ├── pandas_06_groupby_operations17.py
│   │   ├── pandas_06_groupby_operations5.py
│   │   ├── pandas_03a_selecting_data20.py
│   │   ├── pandas_03a_selecting_data21.py
│   │   ├── pandas_07_reshaping_data10.py
│   │   ├── pandas_06_groupby_operations9.py
│   │   ├── pandas_06_groupby_operations25.py
│   │   ├── pandas_06_groupby_operations27.py
│   │   ├── pandas_07_reshaping_data11.py
│   │   ├── pandas_06_groupby_operations10.py
│   │   ├── pandas_04_time_series_data6.py
│   │   ├── pandas_07_reshaping_data7.py
│   │   └── pandas_07_reshaping_data2.py
├── images
│   ├── axis.png
│   ├── broadcasting.png
│   ├── kmeans_illustration.png
│   └── tidyr-spread-gather.gif
├── data
│   ├── spectra.mat
│   ├── kmeans_data.csv
│   ├── brain_size.csv
│   └── inflammation-01.csv
├── img
│   ├── dataframe.png
│   ├── pivot_excel.png
│   ├── splitApplyCombine.png
│   └── logoUPSayPlusCDS_990.png
├── README.md
├── numpy_with_answers
│   ├── numpys
│   │   ├── broadcasting.ipynb
│   │   ├── stacking.ipynb
│   │   ├── savez.ipynb
│   │   ├── test_yourself.ipynb
│   │   ├── fancy_indexing.ipynb
│   │   ├── filtering_data.ipynb
│   │   ├── boolean_mask.ipynb
│   │   ├── numpy_intro.ipynb
│   │   ├── operations.ipynb
│   │   └── dataset_intro.ipynb
│   └── 01-numpy-introduction.ipynb
├── numpys
│   ├── broadcasting.ipynb
│   ├── stacking.ipynb
│   ├── test_yourself.ipynb
│   ├── savez.ipynb
│   ├── fancy_indexing.ipynb
│   ├── filtering_data.ipynb
│   ├── boolean_mask.ipynb
│   ├── numpy_intro.ipynb
│   ├── operations.ipynb
│   └── dataset_intro.ipynb
└── 01-numpy-introduction.ipynb
├── figures
├── README.md
├── style_figs.py
├── plot_iris_visualization.py
├── plot_splines.py
└── polynomial_overfit_0.svg
├── img
├── postit.jpg
├── git
│   ├── coding.png
│   ├── writing.png
│   ├── commit_1.png
│   ├── commit_2.png
│   ├── commit_3.png
│   └── git-transport.png
├── sphinx-logo.png
├── splitApplyCombine.png
├── logoUPSayPlusCDS_990.png
├── slides.css
├── webfont-ubuntu-400-300-100.css
└── webfont-ubuntu-mono-400-700-400italic.css
├── datasets
└── README.md
├── requirements.txt
├── Day_2_Machine_Learning_Python
├── figures
│   ├── simple-decision-tree-adult-census.png
│   └── plot-simple-decision-tree-adult-census.py
├── 02_basic_preprocessing_exercise_01.ipynb
├── 03_basic_preprocessing_categorical_variables_exercise_01.ipynb
├── 02_basic_preprocessing_exercise_01_solution.ipynb
├── 04_basic_parameters_tuning_exercise_01.ipynb
├── 04_basic_parameters_tuning_exercise_02.ipynb
├── 04_basic_parameters_tuning_exercise_01_solution.ipynb
├── 03_basic_preprocessing_categorical_variables_exercise_02.ipynb
├── 03_basic_preprocessing_categorical_variables_exercise_01_solution.ipynb
└── 03_basic_preprocessing_categorical_variables_exercise_02_solution.ipynb
├── environment.yml
├── LICENSE
├── .gitignore
├── check_env.py
├── index.html
└── README.md
/Day_1_Scientific_Python/pandas/_solutions/pandas_01_data_structures3.py:
--------------------------------------------------------------------------------
1 | len(df)
--------------------------------------------------------------------------------
/Day_1_Scientific_Python/pandas/_solutions/pandas_01_data_structures2.py:
--------------------------------------------------------------------------------
1 | df.head()
--------------------------------------------------------------------------------
/Day_1_Scientific_Python/pandas/_solutions/pandas_01_data_structures4.py:
--------------------------------------------------------------------------------
1 | df['Age']
--------------------------------------------------------------------------------
/Day_1_Scientific_Python/pandas/_solutions/pandas_03a_selecting_data10.py:
--------------------------------------------------------------------------------
1 | len(titles)
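2 | # equivalently: titles.shape[0]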
-------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_02_basic_operations1.py: -------------------------------------------------------------------------------- 1 | df['Age'].mean() -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_02_basic_operations5.py: -------------------------------------------------------------------------------- 1 | df['Fare'].max() -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_04_time_series_data1.py: -------------------------------------------------------------------------------- 1 | data['2012':] -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_02_basic_operations4.py: -------------------------------------------------------------------------------- 1 | df['Survived'].mean() -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_02_basic_operations6.py: -------------------------------------------------------------------------------- 1 | df['Fare'].median() -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_02_basic_operations9.py: -------------------------------------------------------------------------------- 1 | np.log(df['Fare']) -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_03a_selecting_data5.py: -------------------------------------------------------------------------------- 1 | (df['Age'] > 70).sum() -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_01_data_structures5.py: -------------------------------------------------------------------------------- 1 | df['Fare'].plot(kind='box') -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_02_basic_operations7.py: -------------------------------------------------------------------------------- 1 | df['Fare'].quantile(0.75) -------------------------------------------------------------------------------- /figures/README.md: -------------------------------------------------------------------------------- 1 | This directory contains didactic figures and scripts that generate them. 
2 | -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_02_basic_operations8.py: -------------------------------------------------------------------------------- 1 | df['Fare'] / df['Fare'].mean() -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_03a_selecting_data18.py: -------------------------------------------------------------------------------- 1 | inception['n'].isnull().sum() -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_03a_selecting_data4.py: -------------------------------------------------------------------------------- 1 | len(df.loc[df['Age'] > 70, :]) -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_04_time_series_data2.py: -------------------------------------------------------------------------------- 1 | data[data.index.month == 1] -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_07_reshaping_data3.py: -------------------------------------------------------------------------------- 1 | df['Underaged'] = df['Age'] <= 18 -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_02_basic_operations2.py: -------------------------------------------------------------------------------- 1 | df['Age'].hist() #bins=30, log=True -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_03a_selecting_data11.py: -------------------------------------------------------------------------------- 1 | titles.sort_values('year').head(2) -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_03b_indexing6.py: -------------------------------------------------------------------------------- 1 | df.loc[df['Sex'] == 'male', 'Age'].mean() -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_03b_indexing7.py: -------------------------------------------------------------------------------- 1 | df.loc[df['Sex'] == 'female', 'Age'].mean() -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_06_groupby_operations1.py: -------------------------------------------------------------------------------- 1 | df.groupby('Sex')['Age'].mean() -------------------------------------------------------------------------------- /img/postit.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/paris-saclay-cds/data-science-workshop-2019/HEAD/img/postit.jpg -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_01_data_structures1.py: -------------------------------------------------------------------------------- 1 | df = pd.read_csv("../data/titanic.csv") -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_01_data_structures6.py: -------------------------------------------------------------------------------- 1 | 
df.sort_values(by='Age', ascending=False) -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_03a_selecting_data1.py: -------------------------------------------------------------------------------- 1 | males = df.loc[df['Sex'] == 'male', :] -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_03a_selecting_data12.py: -------------------------------------------------------------------------------- 1 | len(titles[titles['title'] == 'Hamlet']) -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_03a_selecting_data15.py: -------------------------------------------------------------------------------- 1 | len(titles[titles['year'] // 10 == 195]) -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_03a_selecting_data17.py: -------------------------------------------------------------------------------- 1 | len(inception[inception['n'].isnull()]) -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_03a_selecting_data19.py: -------------------------------------------------------------------------------- 1 | len(inception[inception['n'].notnull()]) -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_03a_selecting_data9.py: -------------------------------------------------------------------------------- 1 | df.loc[df['Surname'].str.len() > 15, :] -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_04_time_series_data3.py: -------------------------------------------------------------------------------- 1 | data[data.index.month.isin([4, 5, 6])] -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_04_time_series_data5.py: -------------------------------------------------------------------------------- 1 | data.resample('M').std().plot() # 'A' -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_06_groupby_operations18.py: -------------------------------------------------------------------------------- 1 | cast.character.value_counts().head(11) -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_06_groupby_operations4.py: -------------------------------------------------------------------------------- 1 | df.groupby('Sex')['Survived'].mean() -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_02_basic_operations3.py: -------------------------------------------------------------------------------- 1 | df['Survived'].sum() / len(df['Survived']) -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_03a_selecting_data16.py: -------------------------------------------------------------------------------- 1 | inception = cast[cast['title'] == 'Inception'] -------------------------------------------------------------------------------- 
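Note: the selecting and groupby solutions in this directory operate on 'titles' and 'cast' DataFrames that the exercise notebooks load before these snippets run. A minimal sketch of that setup, assuming the IMDb-derived CSV files used in the course (the file paths here are illustrative, not taken from this repository):

import pandas as pd

# one row per film, with 'title' and 'year' columns
titles = pd.read_csv('data/titles.csv')

# one row per credit, with 'title', 'year', 'name', 'type' ('actor' or 'actress'),
# 'character', and the billing position 'n' (NaN when unranked)
cast = pd.read_csv('data/cast.csv')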
/Day_1_Scientific_Python/pandas/_solutions/pandas_03a_selecting_data3.py: -------------------------------------------------------------------------------- 1 | df.loc[df['Sex'] == 'female', 'Age'].mean() -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_03a_selecting_data6.py: -------------------------------------------------------------------------------- 1 | df.loc[(df['Age'] > 30) & (df['Age'] <= 40), :] -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_06_groupby_operations12.py: -------------------------------------------------------------------------------- 1 | cast1990['name'].value_counts().head(10) -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_06_groupby_operations7.py: -------------------------------------------------------------------------------- 1 | titles['decade'] = titles['year'] // 10 * 10 -------------------------------------------------------------------------------- /img/git/coding.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/paris-saclay-cds/data-science-workshop-2019/HEAD/img/git/coding.png -------------------------------------------------------------------------------- /img/git/writing.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/paris-saclay-cds/data-science-workshop-2019/HEAD/img/git/writing.png -------------------------------------------------------------------------------- /img/sphinx-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/paris-saclay-cds/data-science-workshop-2019/HEAD/img/sphinx-logo.png -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_02_basic_operations10.py: -------------------------------------------------------------------------------- 1 | df['Fare_log'] = np.log(df['Fare']) 2 | df.head() -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_03a_selecting_data8.py: -------------------------------------------------------------------------------- 1 | df.loc[df['Surname'].str.startswith('Williams'), :] -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_04_time_series_data7.py: -------------------------------------------------------------------------------- 1 | subset.resample('M').agg(['mean', 'median']).plot() -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_06_groupby_operations29.py: -------------------------------------------------------------------------------- 1 | t = titles 2 | t.year.value_counts().head(3) -------------------------------------------------------------------------------- /img/git/commit_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/paris-saclay-cds/data-science-workshop-2019/HEAD/img/git/commit_1.png -------------------------------------------------------------------------------- /img/git/commit_2.png: 
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/paris-saclay-cds/data-science-workshop-2019/HEAD/img/git/commit_2.png
--------------------------------------------------------------------------------
/img/git/commit_3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/paris-saclay-cds/data-science-workshop-2019/HEAD/img/git/commit_3.png
--------------------------------------------------------------------------------
/Day_1_Scientific_Python/pandas/_solutions/pandas_04_time_series_data4.py:
--------------------------------------------------------------------------------
1 | data[(data.index.hour > 8) & (data.index.hour < 20)]
--------------------------------------------------------------------------------
/Day_1_Scientific_Python/pandas/_solutions/pandas_06_groupby_operations21.py:
--------------------------------------------------------------------------------
1 | cast[cast.year == 2010].name.value_counts().head(10)
--------------------------------------------------------------------------------
/Day_1_Scientific_Python/pandas/_solutions/pandas_07_reshaping_data5.py:
--------------------------------------------------------------------------------
1 | df.groupby(['Pclass', 'Sex'])['Survived'].mean().unstack()
--------------------------------------------------------------------------------
/Day_1_Scientific_Python/pandas/_solutions/pandas_03a_selecting_data13.py:
--------------------------------------------------------------------------------
1 | titles[titles.title == 'Treasure Island'].sort_values('year')
--------------------------------------------------------------------------------
/Day_1_Scientific_Python/pandas/_solutions/pandas_03a_selecting_data2.py:
--------------------------------------------------------------------------------
1 | males.loc[:,'Age'].mean()
2 | # or
3 | males['Age'].mean()
--------------------------------------------------------------------------------
/Day_1_Scientific_Python/pandas/_solutions/pandas_03a_selecting_data7.py:
--------------------------------------------------------------------------------
1 | df['Surname'] = df['Name'].apply(lambda x: x.split(',')[0])
--------------------------------------------------------------------------------
/Day_1_Scientific_Python/pandas/_solutions/pandas_03b_indexing2.py:
--------------------------------------------------------------------------------
1 | countries.loc[countries['density'] > 300, ['capital', 'population']]
--------------------------------------------------------------------------------
/Day_1_Scientific_Python/pandas/_solutions/pandas_07_reshaping_data8.py:
--------------------------------------------------------------------------------
1 | pd.crosstab(index=cast['year'], columns=cast['type']).plot()
--------------------------------------------------------------------------------
/img/git/git-transport.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/paris-saclay-cds/data-science-workshop-2019/HEAD/img/git/git-transport.png
--------------------------------------------------------------------------------
/img/splitApplyCombine.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/paris-saclay-cds/data-science-workshop-2019/HEAD/img/splitApplyCombine.png
--------------------------------------------------------------------------------
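Note: the indexing solutions (pandas_03b_indexing*.py) refer to a small 'countries' DataFrame built earlier in the corresponding notebook. A sketch of its assumed shape follows; the column names and the country index come from the solutions themselves, while the example rows are illustrative:

import pandas as pd

countries = pd.DataFrame({
    'country': ['Belgium', 'France', 'Germany', 'Netherlands', 'United Kingdom'],
    'population': [11.3, 64.3, 81.3, 16.9, 64.9],    # in millions, see pandas_03b_indexing1.py
    'area': [30510, 671308, 357050, 41526, 244820],  # in km^2
    'capital': ['Brussels', 'Paris', 'Berlin', 'Amsterdam', 'London'],
}).set_index('country')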
/Day_1_Scientific_Python/pandas/_solutions/pandas_03a_selecting_data14.py: -------------------------------------------------------------------------------- 1 | len(titles[(titles['year'] >= 1950) & (titles['year'] <= 1959)]) -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_03b_indexing4.py: -------------------------------------------------------------------------------- 1 | countries.loc['United Kingdom', 'capital'] = 'Cambridge' 2 | countries -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_03b_indexing5.py: -------------------------------------------------------------------------------- 1 | countries[(countries['density'] > 100) & (countries['density'] < 300)] -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_05_combining_datasets.py: -------------------------------------------------------------------------------- 1 | pd.merge(countries, country_economics, on='country', how='right') -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_06_groupby_operations8.py: -------------------------------------------------------------------------------- 1 | titles.groupby('decade').size().plot(kind='bar', color='green') -------------------------------------------------------------------------------- /img/logoUPSayPlusCDS_990.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/paris-saclay-cds/data-science-workshop-2019/HEAD/img/logoUPSayPlusCDS_990.png -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_03b_indexing1.py: -------------------------------------------------------------------------------- 1 | countries['density'] = countries['population']*1000000 / countries['area'] -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_07_reshaping_data9.py: -------------------------------------------------------------------------------- 1 | pd.crosstab(index=cast['year'], columns=cast['type']).plot(kind='area') -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_04_time_series_data10.py: -------------------------------------------------------------------------------- 1 | fig, ax = plt.subplots() 2 | data['2013'].mean().plot(kind='barh', ax=ax) -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_04_time_series_data8.py: -------------------------------------------------------------------------------- 1 | daily = data['LS06_348'].resample('D').mean() # daily averages calculated -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_06_groupby_operations15.py: -------------------------------------------------------------------------------- 1 | title_longest = titles['title'].str.len().nlargest(10) 2 | title_longest -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_06_groupby_operations16.py: 
-------------------------------------------------------------------------------- 1 | pd.options.display.max_colwidth = 210 2 | titles.loc[title_longest.index] -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_06_groupby_operations19.py: -------------------------------------------------------------------------------- 1 | cast[cast.name == 'Brad Pitt'].year.value_counts().sort_index().plot() -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_06_groupby_operations2.py: -------------------------------------------------------------------------------- 1 | # df['Survived'].sum() / len(df['Survived']) 2 | df['Survived'].mean() -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_06_groupby_operations28.py: -------------------------------------------------------------------------------- 1 | ratios_decade[:, 'actor'].plot() 2 | ratios_decade[:, 'actress'].plot() -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_06_groupby_operations6.py: -------------------------------------------------------------------------------- 1 | df.groupby('AgeClass')['Fare'].mean().plot(kind='bar', rot=0, color="C0") -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_06_groupby_operations3.py: -------------------------------------------------------------------------------- 1 | df25 = df[df['Age'] < 25] 2 | df25['Survived'].sum() / len(df25['Survived']) -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_07_reshaping_data12.py: -------------------------------------------------------------------------------- 1 | d = c.Superman - c.Batman 2 | print('Superman years:') 3 | print(len(d[d > 0.0])) -------------------------------------------------------------------------------- /Day_1_Scientific_Python/images/axis.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/paris-saclay-cds/data-science-workshop-2019/HEAD/Day_1_Scientific_Python/images/axis.png -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_03b_indexing3.py: -------------------------------------------------------------------------------- 1 | countries['density_ratio'] = countries['density'] / countries['density'].mean() 2 | countries -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_06_groupby_operations20.py: -------------------------------------------------------------------------------- 1 | titles[titles['title'].str.startswith('The Life')]['title'].value_counts().head(10) -------------------------------------------------------------------------------- /datasets/README.md: -------------------------------------------------------------------------------- 1 | `cps_85_wages.csv` is available at https://www.openml.org/d/534 2 | `adult-census.csv` is available at https://www.openml.org/d/15950 3 | -------------------------------------------------------------------------------- /Day_1_Scientific_Python/data/spectra.mat: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/paris-saclay-cds/data-science-workshop-2019/HEAD/Day_1_Scientific_Python/data/spectra.mat -------------------------------------------------------------------------------- /Day_1_Scientific_Python/img/dataframe.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/paris-saclay-cds/data-science-workshop-2019/HEAD/Day_1_Scientific_Python/img/dataframe.png -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_06_groupby_operations22.py: -------------------------------------------------------------------------------- 1 | pink = cast[cast['title'] == 'The Pink Panther'] 2 | pink.groupby(['year'])[['n']].max() -------------------------------------------------------------------------------- /Day_1_Scientific_Python/img/pivot_excel.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/paris-saclay-cds/data-science-workshop-2019/HEAD/Day_1_Scientific_Python/img/pivot_excel.png -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_06_groupby_operations13.py: -------------------------------------------------------------------------------- 1 | hamlets = titles[titles['title'].str.contains('Hamlet')] 2 | hamlets['title'].value_counts() -------------------------------------------------------------------------------- /Day_1_Scientific_Python/images/broadcasting.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/paris-saclay-cds/data-science-workshop-2019/HEAD/Day_1_Scientific_Python/images/broadcasting.png -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_06_groupby_operations14.py: -------------------------------------------------------------------------------- 1 | hamlets = titles[titles['title'].str.startswith('Hamlet')] 2 | hamlets['title'].value_counts() -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_07_reshaping_data1.py: -------------------------------------------------------------------------------- 1 | df.pivot_table(index='Pclass', columns='Sex', 2 | values='Survived', aggfunc='mean') -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_07_reshaping_data4.py: -------------------------------------------------------------------------------- 1 | df.pivot_table(index='Underaged', columns='Sex', 2 | values='Fare', aggfunc='mean') -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_07_reshaping_data6.py: -------------------------------------------------------------------------------- 1 | grouped = cast.groupby(['year', 'type']).size() 2 | table = grouped.unstack('type') 3 | table.plot() -------------------------------------------------------------------------------- /Day_1_Scientific_Python/img/splitApplyCombine.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/paris-saclay-cds/data-science-workshop-2019/HEAD/Day_1_Scientific_Python/img/splitApplyCombine.png -------------------------------------------------------------------------------- /Day_1_Scientific_Python/img/logoUPSayPlusCDS_990.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/paris-saclay-cds/data-science-workshop-2019/HEAD/Day_1_Scientific_Python/img/logoUPSayPlusCDS_990.png -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_04_time_series_data9.py: -------------------------------------------------------------------------------- 1 | daily.resample('M').agg(['min', 'max']).plot() # monthly minimum and maximum values of these daily averages -------------------------------------------------------------------------------- /Day_1_Scientific_Python/images/kmeans_illustration.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/paris-saclay-cds/data-science-workshop-2019/HEAD/Day_1_Scientific_Python/images/kmeans_illustration.png -------------------------------------------------------------------------------- /Day_1_Scientific_Python/images/tidyr-spread-gather.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/paris-saclay-cds/data-science-workshop-2019/HEAD/Day_1_Scientific_Python/images/tidyr-spread-gather.gif -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_06_groupby_operations23.py: -------------------------------------------------------------------------------- 1 | oz = cast[cast['name'] == 'Frank Oz'] 2 | oz_roles = oz.groupby(['year', 'title']).size() 3 | oz_roles[oz_roles > 1] -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_06_groupby_operations26.py: -------------------------------------------------------------------------------- 1 | leading = cast[cast['n'] == 1] 2 | sums_decade = leading.groupby([cast['year'] // 10 * 10, 'type']).size() 3 | sums_decade -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_06_groupby_operations30.py: -------------------------------------------------------------------------------- 1 | cast1950 = cast[cast['year'] // 10 == 195] 2 | cast1950 = cast1950[cast1950['n'] == 1] 3 | cast1950['type'].value_counts() -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_06_groupby_operations31.py: -------------------------------------------------------------------------------- 1 | cast2000 = cast[cast['year'] // 10 == 200] 2 | cast2000 = cast2000[cast2000['n'] == 1] 3 | cast2000['type'].value_counts() -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | scipy 3 | joblib 4 | scikit-learn 5 | pandas 6 | pandas-profiling 7 | ipython 8 | jupyter 9 | pillow 10 | matplotlib 11 | mplleaflet 12 | seaborn 13 | plotly 14 | -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_06_groupby_operations24.py: 
--------------------------------------------------------------------------------
1 | oz = cast[cast['name'] == 'Frank Oz']
2 | oz_roles = oz.groupby(['character']).size()
3 | oz_roles[oz_roles > 1].sort_values()
--------------------------------------------------------------------------------
/Day_1_Scientific_Python/pandas/_solutions/pandas_06_groupby_operations11.py:
--------------------------------------------------------------------------------
1 | cast1990 = cast[cast['year'] >= 1990]
2 | cast1990 = cast1990[cast1990['n'] == 1]
3 | cast1990.groupby('name').size().nlargest(10)
--------------------------------------------------------------------------------
/Day_1_Scientific_Python/pandas/_solutions/pandas_06_groupby_operations17.py:
--------------------------------------------------------------------------------
1 | cast1950 = cast[cast['year'] // 10 == 195]
2 | cast1950 = cast1950[cast1950['n'] == 1]
3 | cast1950.groupby(['year', 'type']).size()
--------------------------------------------------------------------------------
/Day_1_Scientific_Python/pandas/_solutions/pandas_06_groupby_operations5.py:
--------------------------------------------------------------------------------
1 | df.groupby('Pclass')['Survived'].mean().plot(kind='bar', color="C0")  # and what if you compared the total number of survivors?
--------------------------------------------------------------------------------
/Day_1_Scientific_Python/pandas/_solutions/pandas_03a_selecting_data20.py:
--------------------------------------------------------------------------------
1 | titanic = cast[(cast['title'] == 'Titanic') & (cast['year'] == 1997)]
2 | titanic = titanic[titanic['n'].notnull()]
3 | titanic.sort_values('n')
--------------------------------------------------------------------------------
/Day_1_Scientific_Python/pandas/_solutions/pandas_03a_selecting_data21.py:
--------------------------------------------------------------------------------
1 | brad = cast[cast['name'] == 'Brad Pitt']
2 | brad = brad[brad['year'] // 10 == 199]
3 | brad = brad[brad['n'] == 2]
4 | brad.sort_values('year')
--------------------------------------------------------------------------------
/Day_1_Scientific_Python/pandas/_solutions/pandas_07_reshaping_data10.py:
--------------------------------------------------------------------------------
1 | grouped = cast.groupby(['year', 'type']).size()
2 | table = grouped.unstack('type')
3 | (table['actor'] / (table['actor'] + table['actress'])).plot(ylim=[0,1])
--------------------------------------------------------------------------------
/Day_2_Machine_Learning_Python/figures/simple-decision-tree-adult-census.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/paris-saclay-cds/data-science-workshop-2019/HEAD/Day_2_Machine_Learning_Python/figures/simple-decision-tree-adult-census.png
--------------------------------------------------------------------------------
/Day_1_Scientific_Python/pandas/_solutions/pandas_06_groupby_operations9.py:
--------------------------------------------------------------------------------
1 | titles['decade'] = titles['year'] // 10 * 10
2 | hamlet = titles[titles['title'] == 'Hamlet']
3 | hamlet.groupby('decade').size().plot(kind='bar', color="orange")
--------------------------------------------------------------------------------
/Day_1_Scientific_Python/pandas/_solutions/pandas_06_groupby_operations25.py:
--------------------------------------------------------------------------------
1 | cast['n_total'] = cast.groupby('title')['n'].transform('max')  # transform will return an element for each row, so the max value is given to the whole group
2 | cast.head()
--------------------------------------------------------------------------------
/Day_1_Scientific_Python/pandas/_solutions/pandas_06_groupby_operations27.py:
--------------------------------------------------------------------------------
1 | #sums_decade.groupby(level='year').transform(lambda x: x / x.sum())
2 | ratios_decade = sums_decade / sums_decade.groupby(level='year').transform('sum')
3 | ratios_decade
--------------------------------------------------------------------------------
/Day_1_Scientific_Python/pandas/_solutions/pandas_07_reshaping_data11.py:
--------------------------------------------------------------------------------
1 | c = cast
2 | c = c[(c.character == 'Superman') | (c.character == 'Batman')]
3 | c = c.groupby(['year', 'character']).size()
4 | c = c.unstack()
5 | c = c.fillna(0)
6 | c.head()
--------------------------------------------------------------------------------
/Day_1_Scientific_Python/pandas/_solutions/pandas_06_groupby_operations10.py:
--------------------------------------------------------------------------------
1 | titles['decade'] = titles['year'] // 10 * 10
2 | hamlet = titles[titles['title'].str.contains('Hamlet')]
3 | hamlet.groupby('decade').size().plot(kind='bar', color="lightblue")
--------------------------------------------------------------------------------
/Day_1_Scientific_Python/pandas/_solutions/pandas_04_time_series_data6.py:
--------------------------------------------------------------------------------
1 | subset = data['2011':'2012']['L06_347']
2 | fig, ax = plt.subplots()
3 | subset.resample('M').mean().plot(ax=ax)
4 | subset.resample('M').median().plot(ax=ax)
5 | ax.legend(["mean", "median"])
--------------------------------------------------------------------------------
/Day_1_Scientific_Python/pandas/_solutions/pandas_07_reshaping_data7.py:
--------------------------------------------------------------------------------
1 | cast.pivot_table(index='year', columns='type', values="character", aggfunc='count').plot()
2 | # for the 'values' argument, take a column with no NaN values so that all rows are effectively counted; at this stage comes the aha moment about the crosstab function(!)
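3 | # equivalent, and with no NaN caveat: pd.crosstab(cast['year'], cast['type']).plot()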
-------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: python-workshop 2 | 3 | dependencies: 4 | - python=3.7 5 | - numpy 6 | - scipy 7 | - joblib 8 | - scikit-learn 9 | - pandas 10 | - conda-forge::pandas-profiling 11 | - ipython 12 | - jupyter 13 | - pillow 14 | - matplotlib 15 | - mplleaflet 16 | - seaborn 17 | - plotly 18 | -------------------------------------------------------------------------------- /Day_1_Scientific_Python/pandas/_solutions/pandas_07_reshaping_data2.py: -------------------------------------------------------------------------------- 1 | fig, ax1 = plt.subplots() 2 | df.pivot_table(index='Pclass', columns='Sex', 3 | values='Survived', aggfunc='mean').plot(kind='bar', 4 | rot=0, 5 | ax=ax1) 6 | ax1.set_ylabel('Survival ratio') -------------------------------------------------------------------------------- /Day_1_Scientific_Python/README.md: -------------------------------------------------------------------------------- 1 | # Day 1 - Scientific programming with Python 2 | 3 | Goal: introducing the most important packages for scientific computing and data analysis in Python. 4 | 5 | Overview: 6 | 7 | 1. Introduction to numpy: [01-numpy-introduction.ipynb](01-numpy-introduction.ipynb) 8 | 9 | 2. Introduction to pandas: [02-pandas_introduction.ipynb](02-pandas_introduction.ipynb) 10 | 11 | 3. Short overview of matplotlib and seaborn: [03-matplotib_seaborn.ipynb](03-matplotib_seaborn.ipynb) 12 | 13 | -------------------------------------------------------------------------------- /figures/style_figs.py: -------------------------------------------------------------------------------- 1 | """ 2 | Simple styling used for matplotlib figures 3 | """ 4 | 5 | from matplotlib import pyplot as plt 6 | 7 | # Configuration settings to help visibility on small screen / prints 8 | plt.rcParams['xtick.labelsize'] = 20 9 | plt.rcParams['ytick.labelsize'] = 20 10 | plt.rcParams['figure.titlesize'] = 15 11 | plt.rcParams['font.size'] = 20 12 | plt.rcParams['axes.labelsize'] = 20 13 | plt.rcParams['axes.facecolor'] = 'none' 14 | plt.rcParams['legend.fontsize'] = 18 15 | plt.rcParams['lines.linewidth'] = 3 16 | plt.rcParams['figure.figsize'] = [.8 * 6.4, .8 * 4.8] 17 | plt.rcParams['legend.frameon'] = False 18 | plt.rcParams['legend.columnspacing'] = 1.8 19 | plt.rcParams['legend.handlelength'] = 1.5 20 | plt.rcParams['legend.handletextpad'] = 0.5 21 | 22 | # Utility functions 23 | def light_axis(): 24 | "Hide the top and right spines" 25 | ax = plt.gca() 26 | for s in ('top', 'right'): 27 | ax.spines[s].set_visible(False) 28 | plt.xticks(()) 29 | plt.yticks(()) 30 | plt.subplots_adjust(left=.01, bottom=.01, top=.99, right=.99) 31 | 32 | def no_axis(): 33 | plt.axis('off') 34 | plt.subplots_adjust(left=.0, bottom=.0, top=1, right=1) 35 | -------------------------------------------------------------------------------- /figures/plot_iris_visualization.py: -------------------------------------------------------------------------------- 1 | """ 2 | Some simple visualizations on the iris data. 
3 | """ 4 | 5 | import numpy as np 6 | from sklearn import datasets 7 | from matplotlib import pyplot as plt 8 | import style_figs 9 | 10 | iris = datasets.load_iris() 11 | 12 | # Plot the histograms of each class for each feature 13 | 14 | 15 | X = iris.data 16 | y = iris.target 17 | for x, feature_name in zip(X.T, iris.feature_names): 18 | plt.figure(figsize=(2.5, 2)) 19 | patches = list() 20 | for this_y, target_name in enumerate(iris.target_names): 21 | patch = plt.hist(x[y == this_y], 22 | bins=np.linspace(x.min(), x.max(), 20), 23 | label=target_name) 24 | patches.append(patch[-1][0]) 25 | style_figs.light_axis() 26 | feature_name = feature_name.replace(' ', '_') 27 | feature_name = feature_name.replace('(', '') 28 | feature_name = feature_name.replace(')', '') 29 | plt.savefig('iris_{}_hist.svg'.format(feature_name)) 30 | 31 | plt.figure(figsize=(6, .25)) 32 | plt.legend(patches, iris.target_names, ncol=3, loc=(0, -.37), 33 | borderaxespad=0) 34 | style_figs.no_axis() 35 | plt.savefig('legend_irises.svg') 36 | 37 | 38 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2017, Paris-Saclay Center for Data Science 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | * Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
30 | -------------------------------------------------------------------------------- /Day_1_Scientific_Python/data/kmeans_data.csv: -------------------------------------------------------------------------------- 1 | 2.766997449176169521e+00 3.006059008441898772e+00 2 | 2.088094777054783080e+00 2.903810986407362904e+00 3 | 2.822686541285641670e+00 3.149696392939078660e+00 4 | 2.960355618458414551e+00 1.927877097616605750e+00 5 | 2.659122371155123066e+00 3.006991014034997356e+00 6 | 2.032954596475295084e+00 3.081060886926234588e+00 7 | 2.854008323349864984e+00 1.999801525369903965e+00 8 | 1.859387078586314690e+00 2.932274234602143093e+00 9 | 2.979922522654017136e+00 1.866821022670417829e+00 10 | 2.995763163325959599e+00 1.985595460871006912e+00 11 | 2.010735721721112146e+00 2.908542147932652089e+00 12 | 2.843601373873463789e+00 1.955819928799183538e+00 13 | 3.014684940949362346e+00 2.071817254798781871e+00 14 | 2.955848758498268669e+00 2.002549532855308456e+00 15 | 2.953853347117966432e+00 2.014228889781465970e+00 16 | 2.723478874518370674e+00 2.909772625778544786e+00 17 | 1.822563743484528320e+00 2.925210296677877242e+00 18 | 2.914468938893320260e+00 3.054336780969086451e+00 19 | 2.746014094374167325e+00 3.104808300944346566e+00 20 | 1.898162333714168204e+00 2.857695041511004952e+00 21 | 2.706409191152936433e+00 2.957686421588001213e+00 22 | 2.858956467111453126e+00 3.033124170697728328e+00 23 | 2.939364448465704438e+00 1.980821208728706928e+00 24 | 2.717811191556968708e+00 3.082535431495139644e+00 25 | 2.919293342210746545e+00 3.080578453448903353e+00 26 | 1.981070710113886468e+00 3.050562435643691561e+00 27 | 2.891639482190973887e+00 2.023183666995666652e+00 28 | 1.929576687174035410e+00 2.985977074479872595e+00 29 | 2.140504867514827492e+00 2.921581307550503936e+00 30 | 1.975460027120392370e+00 3.035061741609020647e+00 31 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest
32 | *.spec
33 | 
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 | 
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 | 
49 | # Translations
50 | *.mo
51 | *.pot
52 | 
53 | # Django stuff:
54 | *.log
55 | local_settings.py
56 | 
57 | # Flask stuff:
58 | instance/
59 | .webassets-cache
60 | 
61 | # Scrapy stuff:
62 | .scrapy
63 | 
64 | # Sphinx documentation
65 | docs/_build/
66 | 
67 | # PyBuilder
68 | target/
69 | 
70 | # Jupyter Notebook
71 | .ipynb_checkpoints
72 | 
73 | # pyenv
74 | .python-version
75 | 
76 | # celery beat schedule file
77 | celerybeat-schedule
78 | 
79 | # SageMath parsed files
80 | *.sage.py
81 | 
82 | # dotenv
83 | .env
84 | 
85 | # virtualenv
86 | .venv
87 | venv/
88 | ENV/
89 | 
90 | # Spyder project settings
91 | .spyderproject
92 | .spyproject
93 | 
94 | # Rope project settings
95 | .ropeproject
96 | 
97 | # mkdocs documentation
98 | /site
99 | 
100 | # mypy
101 | .mypy_cache/
--------------------------------------------------------------------------------
/Day_1_Scientific_Python/numpy_with_answers/numpys/broadcasting.ipynb:
--------------------------------------------------------------------------------
1 | {
2 |  "cells": [
3 |   {
4 |    "cell_type": "markdown",
5 |    "metadata": {},
6 |    "source": [
7 |     "## Broadcasting"
8 |    ]
9 |   },
10 |   {
11 |    "cell_type": "markdown",
12 |    "metadata": {},
13 |    "source": [
14 |     "It’s possible to do operations on arrays of different sizes. In some cases NumPy can transform these arrays automatically so that they all have the same size: this conversion is called broadcasting."
15 |    ]
16 |   },
17 |   {
18 |    "cell_type": "markdown",
19 |    "metadata": {},
20 |    "source": [
21 |     "\"drawing\""
22 |    ]
23 |   },
24 |   {
25 |    "cell_type": "markdown",
26 |    "metadata": {},
27 |    "source": [
28 |     "You can find the full tutorial on broadcasting: [Broadcasting](https://paris-swc.github.io/advanced-numpy-lesson/03-broadcasting.html)
\n", 29 | "And explanation on what it is: [Broadcasting](https://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)\n", 30 | "\n" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": {}, 36 | "source": [ 37 | "[Previous: Savez() and load()](savez.ipynb)
[Next: Test yourself](test_yourself.ipynb)"
38 |    ]
39 |   }
40 |  ],
41 |  "metadata": {
42 |   "kernelspec": {
43 |    "display_name": "Python 3",
44 |    "language": "python",
45 |    "name": "python3"
46 |   },
47 |   "language_info": {
48 |    "codemirror_mode": {
49 |     "name": "ipython",
50 |     "version": 3
51 |    },
52 |    "file_extension": ".py",
53 |    "mimetype": "text/x-python",
54 |    "name": "python",
55 |    "nbconvert_exporter": "python",
56 |    "pygments_lexer": "ipython3",
57 |    "version": "3.7.2"
58 |   }
59 |  },
60 |  "nbformat": 4,
61 |  "nbformat_minor": 2
62 | }
--------------------------------------------------------------------------------
/Day_1_Scientific_Python/numpys/broadcasting.ipynb:
--------------------------------------------------------------------------------
1 | {
2 |  "cells": [
3 |   {
4 |    "cell_type": "markdown",
5 |    "metadata": {},
6 |    "source": [
7 |     "## Broadcasting"
8 |    ]
9 |   },
10 |   {
11 |    "cell_type": "markdown",
12 |    "metadata": {},
13 |    "source": [
14 |     "It’s possible to do operations on arrays of different sizes. In some cases NumPy can transform these arrays automatically so that they all have the same size: this conversion is called broadcasting."
15 |    ]
16 |   },
17 |   {
18 |    "cell_type": "markdown",
19 |    "metadata": {},
20 |    "source": [
21 |     "\"drawing\""
22 |    ]
23 |   },
24 |   {
25 |    "cell_type": "markdown",
26 |    "metadata": {},
27 |    "source": [
28 |     "You can find the full tutorial on broadcasting: [Broadcasting](https://paris-swc.github.io/advanced-numpy-lesson/03-broadcasting.html)
\n", 29 | "And explanation on what it is: [Broadcasting](https://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)\n", 30 | "\n" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": {}, 36 | "source": [ 37 | "[Previous: Savez() and load()](savez.ipynb)
[Next: Test yourself](test_yourself.ipynb)" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": null, 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [] 46 | } 47 | ], 48 | "metadata": { 49 | "kernelspec": { 50 | "display_name": "Python 3", 51 | "language": "python", 52 | "name": "python3" 53 | }, 54 | "language_info": { 55 | "codemirror_mode": { 56 | "name": "ipython", 57 | "version": 3 58 | }, 59 | "file_extension": ".py", 60 | "mimetype": "text/x-python", 61 | "name": "python", 62 | "nbconvert_exporter": "python", 63 | "pygments_lexer": "ipython3", 64 | "version": "3.7.2" 65 | } 66 | }, 67 | "nbformat": 4, 68 | "nbformat_minor": 2 69 | } 70 | -------------------------------------------------------------------------------- /Day_1_Scientific_Python/data/brain_size.csv: -------------------------------------------------------------------------------- 1 | "";"Gender";"FSIQ";"VIQ";"PIQ";"Weight";"Height";"MRI_Count" 2 | "1";"Female";133;132;124;"118";"64.5";816932 3 | "2";"Male";140;150;124;".";"72.5";1001121 4 | "3";"Male";139;123;150;"143";"73.3";1038437 5 | "4";"Male";133;129;128;"172";"68.8";965353 6 | "5";"Female";137;132;134;"147";"65.0";951545 7 | "6";"Female";99;90;110;"146";"69.0";928799 8 | "7";"Female";138;136;131;"138";"64.5";991305 9 | "8";"Female";92;90;98;"175";"66.0";854258 10 | "9";"Male";89;93;84;"134";"66.3";904858 11 | "10";"Male";133;114;147;"172";"68.8";955466 12 | "11";"Female";132;129;124;"118";"64.5";833868 13 | "12";"Male";141;150;128;"151";"70.0";1079549 14 | "13";"Male";135;129;124;"155";"69.0";924059 15 | "14";"Female";140;120;147;"155";"70.5";856472 16 | "15";"Female";96;100;90;"146";"66.0";878897 17 | "16";"Female";83;71;96;"135";"68.0";865363 18 | "17";"Female";132;132;120;"127";"68.5";852244 19 | "18";"Male";100;96;102;"178";"73.5";945088 20 | "19";"Female";101;112;84;"136";"66.3";808020 21 | "20";"Male";80;77;86;"180";"70.0";889083 22 | "21";"Male";83;83;86;".";".";892420 23 | "22";"Male";97;107;84;"186";"76.5";905940 24 | "23";"Female";135;129;134;"122";"62.0";790619 25 | "24";"Male";139;145;128;"132";"68.0";955003 26 | "25";"Female";91;86;102;"114";"63.0";831772 27 | "26";"Male";141;145;131;"171";"72.0";935494 28 | "27";"Female";85;90;84;"140";"68.0";798612 29 | "28";"Male";103;96;110;"187";"77.0";1062462 30 | "29";"Female";77;83;72;"106";"63.0";793549 31 | "30";"Female";130;126;124;"159";"66.5";866662 32 | "31";"Female";133;126;132;"127";"62.5";857782 33 | "32";"Male";144;145;137;"191";"67.0";949589 34 | "33";"Male";103;96;110;"192";"75.5";997925 35 | "34";"Male";90;96;86;"181";"69.0";879987 36 | "35";"Female";83;90;81;"143";"66.5";834344 37 | "36";"Female";133;129;128;"153";"66.5";948066 38 | "37";"Male";140;150;124;"144";"70.5";949395 39 | "38";"Female";88;86;94;"139";"64.5";893983 40 | "39";"Male";81;90;74;"148";"74.0";930016 41 | "40";"Male";89;91;89;"179";"75.5";935863 42 | -------------------------------------------------------------------------------- /Day_2_Machine_Learning_Python/02_basic_preprocessing_exercise_01.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Exercise 01\n", 8 | "\n", 9 | "The goal of is to compare the performance of our classifier (81% accuracy) to some baseline classifiers that would ignore the input data and instead make constant predictions.\n", 10 | "\n", 11 | "The online [documentation for 
DummyClassifier](https://scikit-learn.org/stable/modules/model_evaluation.html#dummy-estimators) gives instructions on how to use it." 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "import pandas as pd\n", 21 | "\n", 22 | "df = pd.read_csv(\n", 23 | "    \"https://www.openml.org/data/get_csv/1595261/adult-census.csv\")" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": null, 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "target_name = \"class\"\n", 33 | "target = df[target_name].to_numpy()\n", 34 | "data = df.drop(columns=[target_name, \"fnlwgt\"])\n", 35 | "numerical_columns = [\n", 36 | "    c for c in data.columns if data[c].dtype.kind in [\"i\", \"f\"]]\n", 37 | "data_numeric = data[numerical_columns]" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": null, 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "from sklearn.model_selection import cross_val_score\n", 47 | "from sklearn.dummy import DummyClassifier\n", 48 | "\n", 49 | "# TODO: write me!" 50 | ] 51 | } 52 | ], 53 | "metadata": { 54 | "jupytext": { 55 | "formats": "python_scripts//py:percent,notebooks//ipynb" 56 | }, 57 | "kernelspec": { 58 | "display_name": "Python 3", 59 | "language": "python", 60 | "name": "python3" 61 | } 62 | }, 63 | "nbformat": 4, 64 | "nbformat_minor": 2 65 | } 66 | -------------------------------------------------------------------------------- /check_env.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from distutils.version import LooseVersion as Version 3 | import sys 4 | 5 | OK = '\x1b[42m[ OK ]\x1b[0m' 6 | FAIL = "\x1b[41m[FAIL]\x1b[0m" 7 | 8 | try: 9 | import importlib 10 | except ImportError: 11 | print(FAIL, "Python version 3.6 or above is required," 12 | " but %s is installed." % sys.version) 13 | 14 | 15 | def import_version(pkg, min_ver, fail_msg=""): 16 | mod = None 17 | try: 18 | mod = importlib.import_module(pkg) 19 | if pkg in {'PIL'}: 20 | try: 21 | ver = mod.VERSION 22 | except AttributeError: 23 | try: 24 | ver = mod.PILLOW_VERSION 25 | except: 26 | raise 27 | else: 28 | ver = mod.__version__ 29 | if Version(ver) < min_ver: 30 | print(FAIL, "%s version %s or higher required, but %s installed." 31 | % (pkg, min_ver, ver))  # use the local 'pkg'; 'lib' was a leaked loop variable 32 | else: 33 | print(OK, '%s version %s' % (pkg, ver)) 34 | except ImportError: 35 | print(FAIL, '%s not installed. %s' % (pkg, fail_msg)) 36 | return mod 37 | 38 | 39 | # first check the python version 40 | print('Using python in', sys.prefix) 41 | print(sys.version) 42 | pyversion = Version(sys.version) 43 | if pyversion >= "3": 44 | if pyversion < "3.6": 45 | print(FAIL, "Python version 3.6 or above is required," 46 | " but %s is installed." % sys.version) 47 | elif pyversion >= "2": 48 | print(FAIL, "Python version 3.6 or above is required," 49 | " but %s is installed." 
% sys.version) 50 | else: 51 | print(FAIL, "Unknown Python version: %s" % sys.version) 52 | 53 | print() 54 | requirements = {'numpy': "1.16", 'scipy': "1.2", 'matplotlib': "3.0", 55 | 'IPython': "3.0", 'sklearn': "0.21", 'pandas': "0.24", 56 | 'PIL': "1.1.7", 'notebook': "5.7", 'plotly': "4.3", 57 | 'pandas_profiling': "2.3"} 58 | 59 | # now the dependencies 60 | for lib, required_version in list(requirements.items()): 61 | import_version(lib, required_version) 62 | -------------------------------------------------------------------------------- /Day_1_Scientific_Python/numpys/stacking.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Stacking\n", 8 | "\n", 9 | "Arrays can be concatenated and stacked on top of one another, using NumPy’s vstack and hstack functions for vertical and horizontal stacking, respectively." 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import numpy as np" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "A = np.array([[1,2,3], [4,5,6], [7, 8, 9]])\n", 28 | "print('A = ')\n", 29 | "print(A)\n", 30 | "\n", 31 | "B = np.hstack([A, A])\n", 32 | "print('B = ')\n", 33 | "print(B)\n", 34 | "\n", 35 | "C = np.vstack([A, A])\n", 36 | "print('C = ')\n", 37 | "print(C)" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": {}, 43 | "source": [ 44 | "## $\color{green}{\text{Exercise}}$ stacking\n", 45 | "Write some additional code that slices the first and last columns of A, and stacks them into a 3x2 array. Make sure to print the results to verify your solution.
\n", 46 | "__Tip__: A ‘gotcha’ with array indexing is that singleton dimensions are dropped by default. That means A[:, 0] is a one dimensional array, which won’t stack as desired. To preserve singleton dimensions, the index itself can be a slice or array. For example, A[:, :1] returns a two dimensional array with one singleton dimension (i.e. a column vector)." 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "metadata": {}, 59 | "source": [ 60 | "[Previous: Operations](operations.ipynb)
[Next: K-means clustering](k_means.ipynb)" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [] 69 | } 70 | ], 71 | "metadata": { 72 | "kernelspec": { 73 | "display_name": "Python 3", 74 | "language": "python", 75 | "name": "python3" 76 | }, 77 | "language_info": { 78 | "codemirror_mode": { 79 | "name": "ipython", 80 | "version": 3 81 | }, 82 | "file_extension": ".py", 83 | "mimetype": "text/x-python", 84 | "name": "python", 85 | "nbconvert_exporter": "python", 86 | "pygments_lexer": "ipython3", 87 | "version": "3.7.2" 88 | } 89 | }, 90 | "nbformat": 4, 91 | "nbformat_minor": 2 92 | } 93 | -------------------------------------------------------------------------------- /Day_1_Scientific_Python/numpy_with_answers/numpys/stacking.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Stacking\n", 8 | "\n", 9 | "Arrays can be concatenated and stacked on top of one another, using NumPy’s vstack and hstack functions for vertical and horizontal stacking, respectively." 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import numpy as np" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "A = np.array([[1,2,3], [4,5,6], [7, 8, 9]])\n", 28 | "print('A = ')\n", 29 | "print(A)\n", 30 | "\n", 31 | "B = np.hstack([A, A])\n", 32 | "print('B = ')\n", 33 | "print(B)\n", 34 | "\n", 35 | "C = np.vstack([A, A])\n", 36 | "print('C = ')\n", 37 | "print(C)" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": {}, 43 | "source": [ 44 | "## $\\color{green}{\\text{Excercise}}$ stacking\n", 45 | "Write some additional code that slices the first and last columns of A, and stacks them into a 3x2 array. Make sure to print the results to verify your solution.
\n", 46 | "__Tip__: A ‘gotcha’ with array indexing is that singleton dimensions are dropped by default. That means A[:, 0] is a one dimensional array, which won’t stack as desired. To preserve singleton dimensions, the index itself can be a slice or array. For example, A[:, :1] returns a two dimensional array with one singleton dimension (i.e. a column vector)." 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "D = np.hstack((A[:, :1], A[:, -1:]))\n", 56 | "print('D = ')\n", 57 | "print(D)\n", 58 | "print(D.shape)" 59 | ] 60 | }, 61 | { 62 | "cell_type": "markdown", 63 | "metadata": {}, 64 | "source": [ 65 | "[Previous: Operations](operations.ipynb)
[Next: K-means clustering](k_means.ipynb)" 66 | ] 67 | } 68 | ], 69 | "metadata": { 70 | "kernelspec": { 71 | "display_name": "Python 3", 72 | "language": "python", 73 | "name": "python3" 74 | }, 75 | "language_info": { 76 | "codemirror_mode": { 77 | "name": "ipython", 78 | "version": 3 79 | }, 80 | "file_extension": ".py", 81 | "mimetype": "text/x-python", 82 | "name": "python", 83 | "nbconvert_exporter": "python", 84 | "pygments_lexer": "ipython3", 85 | "version": "3.7.2" 86 | } 87 | }, 88 | "nbformat": 4, 89 | "nbformat_minor": 2 90 | } 91 | -------------------------------------------------------------------------------- /Day_2_Machine_Learning_Python/03_basic_preprocessing_categorical_variables_exercise_01.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Exercise 02\n", 8 | "\n", 9 | "The goal of this exercise is to evalutate the impact of using an arbitrary\n", 10 | "integer encoding for categorical variables along with a linear\n", 11 | "classification model such as Logistic Regression.\n", 12 | "\n", 13 | "To do so, let's try to use `OrdinalEncoder` to preprocess the categorical\n", 14 | "variables. This preprocessor is assembled in a pipeline with\n", 15 | "`LogisticRegression`. The performance of the pipeline can be evaluated as\n", 16 | "usual by cross-validation and then compared to the score obtained when using\n", 17 | "`OneHotEncoding` or to some other baseline score.\n", 18 | "\n", 19 | "Because `OrdinalEncoder` can raise errors if it sees an unknown category at\n", 20 | "prediction time, we need to pre-compute the list of all possible categories\n", 21 | "ahead of time:\n", 22 | "\n", 23 | "```python\n", 24 | "categories = [data[column].unique()\n", 25 | " for column in data[categorical_columns]]\n", 26 | "OrdinalEncoder(categories=categories)\n", 27 | "```" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "import pandas as pd\n", 37 | "\n", 38 | "df = pd.read_csv(\n", 39 | " \"https://www.openml.org/data/get_csv/1595261/adult-census.csv\")\n", 40 | "\n", 41 | "# Or use the local copy:\n", 42 | "# df = pd.read_csv('../datasets/adult-census.csv')" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "target_name = \"class\"\n", 52 | "target = df[target_name].to_numpy()\n", 53 | "data = df.drop(columns=[target_name, \"fnlwgt\"])\n", 54 | "categorical_columns = [\n", 55 | " c for c in data.columns if data[c].dtype.kind not in [\"i\", \"f\"]]\n", 56 | "data_categorical = data[categorical_columns]" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "from sklearn.model_selection import cross_val_score\n", 66 | "from sklearn.pipeline import make_pipeline\n", 67 | "from sklearn.preprocessing import OrdinalEncoder\n", 68 | "from sklearn.linear_model import LogisticRegression\n", 69 | "\n", 70 | "# TODO: write me!" 
71 | ] 72 | } 73 | ], 74 | "metadata": { 75 | "jupytext": { 76 | "formats": "python_scripts//py:percent,notebooks//ipynb" 77 | }, 78 | "kernelspec": { 79 | "display_name": "Python 3", 80 | "language": "python", 81 | "name": "python3" 82 | } 83 | }, 84 | "nbformat": 4, 85 | "nbformat_minor": 2 86 | } 87 | -------------------------------------------------------------------------------- /figures/plot_splines.py: -------------------------------------------------------------------------------- 1 | """ 2 | Simple example of overfit with splines 3 | """ 4 | import numpy as np 5 | from matplotlib import pyplot as plt 6 | import style_figs 7 | 8 | from sklearn import datasets, linear_model 9 | 10 | # Load the diabetes dataset 11 | diabetes = datasets.load_diabetes() 12 | 13 | 14 | # Use only one feature 15 | diabetes_X = diabetes.data[:, np.newaxis] 16 | diabetes_X_temp = diabetes_X[:, :, 2] 17 | 18 | # Split the data into training/testing sets 19 | diabetes_X_train = diabetes_X_temp[:-200:3] 20 | diabetes_X_test = diabetes_X_temp[-200:].T 21 | 22 | # Split the targets into training/testing sets 23 | diabetes_y_train = diabetes.target[:-200:3] 24 | diabetes_y_test = diabetes.target[-200:] 25 | 26 | # Sort the data and remove duplicates (for interpolation) 27 | order = np.argsort(diabetes_X_train.ravel()) 28 | X_train = diabetes_X_train.ravel()[order] 29 | y_train = diabetes_y_train[order] 30 | # Avoid duplicates 31 | y_train_ = list() 32 | for this_x in np.unique(X_train): 33 | y_train_.append(np.mean(y_train[X_train == this_x])) 34 | X_train = np.unique(X_train) 35 | 36 | y_train = np.array(y_train_) 37 | 38 | # Create linear regression object 39 | regr = linear_model.LinearRegression() 40 | 41 | # Train the model using the training sets 42 | regr.fit(X_train.reshape((-1, 1)), y_train) 43 | 44 | 45 | plt.figure(1, figsize=(.8*4, .8*3), facecolor='none') 46 | # Plot with test data 47 | plt.clf() 48 | ax = plt.axes([.1, .1, .9, .9]) 49 | 50 | plt.scatter(X_train, y_train, color='k', s=9) 51 | 52 | plt.plot([-.08, .12], regr.predict([[-.08, ], [.12, ]]), 53 | linewidth=3) 54 | 55 | plt.axis('tight') 56 | ymin, ymax = plt.ylim() 57 | style_figs.light_axis() 58 | plt.ylabel('y', size=16, weight=600) 59 | plt.xlabel('x', size=16, weight=600) 60 | 61 | plt.savefig('ols_simple.svg', facecolor='none', edgecolor='none') 62 | 63 | plt.scatter(diabetes_X_test, diabetes_y_test, color='C1', s=9) 64 | plt.ylim(ymin, ymax) 65 | plt.xlim(-.08, .12) 66 | 67 | plt.savefig('ols_test.svg', facecolor='none', edgecolor='none') 68 | 69 | 70 | # Plot cubic splines 71 | plt.clf() 72 | ax = plt.axes([.1, .1, .9, .9]) 73 | 74 | from scipy import interpolate 75 | f = interpolate.interp1d(X_train, y_train, 76 | kind="quadratic", 77 | bounds_error=False, fill_value="extrapolate") 78 | plt.scatter(X_train, y_train, color='k', s=9, zorder=20) 79 | x_spline = np.linspace(-.08, .12, 600) 80 | y_spline = f(x_spline) 81 | plt.plot(x_spline, y_spline, linewidth=3) 82 | 83 | plt.axis('tight') 84 | plt.xlim(-.08, .12) 85 | plt.ylim(ymin, ymax) 86 | 87 | style_figs.light_axis() 88 | 89 | plt.ylabel('y', size=16, weight=600) 90 | plt.xlabel('x', size=16, weight=600) 91 | 92 | 93 | plt.savefig('splines_cubic.svg', facecolor='none', edgecolor='none') 94 | 95 | 96 | plt.scatter(diabetes_X_test, diabetes_y_test, color='C1', s=9) 97 | plt.savefig('splines_test.svg', facecolor='none', edgecolor='none') 98 | 99 | plt.show() 100 | 101 | -------------------------------------------------------------------------------- /img/slides.css: 
-------------------------------------------------------------------------------- 1 | @import url(webfont-ubuntu-400-300-100.css); 2 | @import url(webfont-ubuntu-mono-400-700-400italic.css); 3 | 4 | body { 5 | font-family: 'Ubuntu'; 6 | font-weight: normal; 7 | } 8 | 9 | h1, h2, h3, h4, h5, h6 { 10 | font-family: 'Ubuntu'; 11 | font-weight: 300; 12 | margin-top: 0; 13 | } 14 | h1 { 15 | margin-top: 0.5em; 16 | } 17 | h2 { 18 | font-size: 140%; 19 | line-height: 150%; 20 | } 21 | h3 { 22 | font-size: 120%; 23 | line-height: 140%; 24 | } 25 | 26 | 27 | 28 | li { 29 | font-size: 120%; 30 | line-height: 160%; 31 | } 32 | 33 | p { 34 | font-size: 120%; 35 | line-height: 140%; 36 | } 37 | 38 | .singleimg .middlebelowheader { 39 | text-align: center; 40 | } 41 | 42 | .singleimg img { 43 | max-width: 90%; 44 | max-height: 600px; 45 | /*border: 2px solid #ddd;*/ 46 | } 47 | table { 48 | margin: 0 auto 0.8em; 49 | border-collapse: collapse; 50 | } 51 | td, th { 52 | border: 1px solid #ddd; 53 | padding: 0.3em 0.5em; 54 | } 55 | 56 | .bgheader h1 { 57 | background-color: rgba(0, 0, 0, 0.9); 58 | opacity: 50%; 59 | padding: 0.5em; 60 | color: white; 61 | border-radius: .5em; 62 | } 63 | .middlebelowheader { 64 | /* This fixed size height was found to work well with the slide 65 | scaling mechanism of remark.js: 66 | */ 67 | height: 500px; 68 | display: table-cell; 69 | vertical-align: middle; 70 | } 71 | .widespace h2 { 72 | line-height: 200%; 73 | } 74 | .big .remark-code { 75 | font-size: 200%; 76 | } 77 | .remark-code, .remark-inline-code { 78 | font-family: 'Ubuntu Mono'; 79 | } 80 | 81 | .medium .remark-code { 82 | font-size: 120%; 83 | } 84 | 85 | .mmedium .remark-code { 86 | font-size: 99%; 87 | } 88 | 89 | .affiliations img { 90 | /*height: 100px;*/ 91 | margin: 2em; 92 | margin-right: 0.5em; 93 | margin-left:0.5em; 94 | } 95 | 96 | .hidden { 97 | visibility: hidden; 98 | } 99 | 100 | .small { 101 | font-size: 90%; 102 | } 103 | 104 | .credits { 105 | font-style: italic; 106 | font-size: 70%; 107 | } 108 | 109 | .bunchoflogos img { 110 | max-height: 100px; 111 | padding: 1em; 112 | } 113 | 114 | .bunchoflogos p { 115 | text-align: center; 116 | width: 750px; 117 | } 118 | 119 | a:visited { 120 | color: blue; 121 | } 122 | 123 | .inverse a:visited { 124 | color: Maroon; 125 | } 126 | 127 | .inverse { 128 | background: #272822; 129 | color: #777872; 130 | text-shadow: 0 0 20px #333; 131 | } 132 | .inverse h1, .inverse h2 { 133 | color: #f3f3f3; 134 | } 135 | 136 | code { 137 | background: #e7e8e2; 138 | border-radius: 5px; 139 | } 140 | .pull-left { 141 | float: left; 142 | width: 47%; 143 | } 144 | .pull-right { 145 | float: right; 146 | width: 47%; 147 | } 148 | .pull-right ~ p { 149 | clear: both; 150 | } 151 | 152 | @page { 153 | size: 1024px 768px; 154 | margin: 0; 155 | } 156 | 157 | @media print { 158 | .remark-slide-scaler { 159 | width: 100% !important; 160 | height: 100% !important; 161 | transform: scale(1) !important; 162 | top: 0 !important; 163 | left: 0 !important; 164 | } 165 | } 166 | -------------------------------------------------------------------------------- /index.html: -------------------------------------------------------------------------------- [HTML markup lost during extraction; only the page title survives: "Python workshop - Paris-Saclay Center for Data Science"] -------------------------------------------------------------------------------- /Day_1_Scientific_Python/numpys/test_yourself.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## $\color{green}{\text{Exercise}}$ Rectification\n", 8 | "\n", 9 | "Rectify an array (replace negative elements with zeros) of random numbers from a normal distribution (generated with np.random.randn) using boolean indexing." 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "## $\color{green}{\text{Exercise}}$ Sub-arrays\n", 17 | "\n", 18 | "Let \n", 19 | "`x = np.array([1, 5, 10])`.\n", 20 | "\n", 21 | "Which of the following will show `[1, 10]`:\n", 22 | "\n", 23 | "a) x[::2]\n", 24 | "\n", 25 | "b) x[[1, 3]]\n", 26 | "\n", 27 | "c) x[[0, 2]]\n", 28 | "\n", 29 | "d) x[0, 2]\n", 30 | "\n", 31 | "e) x[[1, -1]]\n", 32 | "\n", 33 | "f) x[[False, True, False]]\n", 34 | "\n", 35 | "For each statement predict whether it returns a copy or a view." 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "## $\color{green}{\text{Exercise}}$ Random elements\n", 43 | "\n", 44 | "Using fancy indexing, randomly select 10 elements (with repetition) from a random array of 100 elements (Hint: you can use np.random.randint(max_int, size=n) to generate n random numbers from 0 inclusive to max_int exclusive)" 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "metadata": {}, 50 | "source": [ 51 | "## $\color{green}{\text{Exercise}}$ Drawing random integers without repetition\n", 52 | "\n", 53 | "Generate a random sequence of 10 integers from 1 to 100 without repetition (Hint: you may want to use np.random.rand and np.argsort)." 54 | ] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "metadata": {}, 59 | "source": [ 60 | "## $\color{green}{\text{Exercise}}$\n", 61 | "\n", 62 | "Generate a 10 x 3 array of random numbers (using np.random.rand). From each row, find the column index of the element closest to 0.75. Make use of np.abs and np.argmin. The result should be a one-dimensional array of integers from 0 to 2." 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": null, 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": {}, 75 | "source": [ 76 | "[Previous: Broadcasting](broadcasting.ipynb)
[Back to index](../01-numpy-introduction.ipynb)" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": null, 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [] 85 | } 86 | ], 87 | "metadata": { 88 | "kernelspec": { 89 | "display_name": "Python 3", 90 | "language": "python", 91 | "name": "python3" 92 | }, 93 | "language_info": { 94 | "codemirror_mode": { 95 | "name": "ipython", 96 | "version": 3 97 | }, 98 | "file_extension": ".py", 99 | "mimetype": "text/x-python", 100 | "name": "python", 101 | "nbconvert_exporter": "python", 102 | "pygments_lexer": "ipython3", 103 | "version": "3.7.2" 104 | } 105 | }, 106 | "nbformat": 4, 107 | "nbformat_minor": 2 108 | } 109 | -------------------------------------------------------------------------------- /Day_1_Scientific_Python/numpys/savez.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Saving and loading the data in .npz format\n", 8 | "The .npz file format is a zipped archive of files named after the variables they contain. The archive is not compressed and each file in the archive contains one variable in .npy format" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "metadata": {}, 14 | "source": [ 15 | "use __np.savez(filename, args)__ to save the data" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "import numpy as np\n", 25 | "import matplotlib.pylab as plt\n", 26 | "\n", 27 | "%matplotlib inline\n", 28 | "\n", 29 | "data = np.loadtxt(fname='../data/inflammation-01.csv', delimiter=',')" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": null, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "filename = 'datainfo.npz'\n", 39 | "np.savez(filename, data=data, mean_daily=np.mean(data,0))" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": {}, 45 | "source": [ 46 | "use __np.load()__ to load it :" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "patient = np.load(filename)" 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": {}, 61 | "source": [ 62 | "to check the keys in the loaded data" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": null, 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [ 71 | "list(patient)" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "patient['mean_daily']" 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "metadata": {}, 86 | "source": [ 87 | "We can plot this data using matplotlib.pyplot.plt:" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": null, 93 | "metadata": {}, 94 | "outputs": [], 95 | "source": [ 96 | "plt.plot(patient['mean_daily'])" 97 | ] 98 | }, 99 | { 100 | "cell_type": "markdown", 101 | "metadata": {}, 102 | "source": [ 103 | "[Previous: K-means clustering](k_means.ipynb)
[Next: Fancy indexing](fancy_indexing.ipynb)" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": null, 109 | "metadata": {}, 110 | "outputs": [], 111 | "source": [] 112 | } 113 | ], 114 | "metadata": { 115 | "kernelspec": { 116 | "display_name": "Python 3", 117 | "language": "python", 118 | "name": "python3" 119 | }, 120 | "language_info": { 121 | "codemirror_mode": { 122 | "name": "ipython", 123 | "version": 3 124 | }, 125 | "file_extension": ".py", 126 | "mimetype": "text/x-python", 127 | "name": "python", 128 | "nbconvert_exporter": "python", 129 | "pygments_lexer": "ipython3", 130 | "version": "3.7.2" 131 | } 132 | }, 133 | "nbformat": 4, 134 | "nbformat_minor": 2 135 | } 136 | -------------------------------------------------------------------------------- /Day_1_Scientific_Python/numpy_with_answers/numpys/savez.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Saving and loading the data in .npz format\n", 8 | "The .npz file format is a zipped archive of files named after the variables they contain. The archive is not compressed and each file in the archive contains one variable in .npy format" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "metadata": {}, 14 | "source": [ 15 | "use __np.savez(filename, args)__ to save the data" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "import numpy as np\n", 25 | "import matplotlib.pylab as plt\n", 26 | "\n", 27 | "%matplotlib inline\n", 28 | "\n", 29 | "data = np.loadtxt(fname='../data/inflammation-01.csv', delimiter=',')" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": null, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "filename = 'datainfo.npz'\n", 39 | "np.savez(filename, data=data, mean_daily=np.mean(data,0))" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": {}, 45 | "source": [ 46 | "use __np.load()__ to load it :" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "patient = np.load(filename)" 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": {}, 61 | "source": [ 62 | "to check the keys in the loaded data" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": null, 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [ 71 | "list(patient)" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "patient['mean_daily']" 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "metadata": {}, 86 | "source": [ 87 | "We can plot this data using matplotlib.pyplot.plt:" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": null, 93 | "metadata": {}, 94 | "outputs": [], 95 | "source": [ 96 | "plt.plot(patient['mean_daily'])" 97 | ] 98 | }, 99 | { 100 | "cell_type": "markdown", 101 | "metadata": {}, 102 | "source": [ 103 | "[Previous: K-means clustering](k_means.ipynb)
[Next: Fancy indexing](fancy_indexing.ipynb)" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": null, 109 | "metadata": {}, 110 | "outputs": [], 111 | "source": [] 112 | } 113 | ], 114 | "metadata": { 115 | "kernelspec": { 116 | "display_name": "Python 3", 117 | "language": "python", 118 | "name": "python3" 119 | }, 120 | "language_info": { 121 | "codemirror_mode": { 122 | "name": "ipython", 123 | "version": 3 124 | }, 125 | "file_extension": ".py", 126 | "mimetype": "text/x-python", 127 | "name": "python", 128 | "nbconvert_exporter": "python", 129 | "pygments_lexer": "ipython3", 130 | "version": "3.7.2" 131 | } 132 | }, 133 | "nbformat": 4, 134 | "nbformat_minor": 2 135 | } 136 | -------------------------------------------------------------------------------- /Day_1_Scientific_Python/numpy_with_answers/numpys/test_yourself.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## $\\color{green}{\\text{Excercise}}$ Rectification\n", 8 | "\n", 9 | "Rectify an array (replace negative elements with zeros) of random numbers from normal distribution (generated with np.random.randn) using boolean indexing." 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "## $\\color{green}{\\text{Excercise}}$ Sub-arrays\n", 17 | "\n", 18 | "Let \n", 19 | "`x = np.array([1, 5, 10])`.\n", 20 | "\n", 21 | "Which of the following will show `[1, 10]`:\n", 22 | "\n", 23 | "a) x[::2]\n", 24 | "\n", 25 | "b) x[[1, 3]]\n", 26 | "\n", 27 | "c) x[[0, 2]]\n", 28 | "\n", 29 | "d) x[0, 2]\n", 30 | "\n", 31 | "e) x[[1, -1]]\n", 32 | "\n", 33 | "f) x[[False, True, False]]\n", 34 | "\n", 35 | "For each statement predict whether it returns a copy or a view." 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "## $\\color{green}{\\text{Excercise}}$ Random elements\n", 43 | "\n", 44 | "Using fancy indexing select randomly with repetition 10 elements from a random array of 100 elements (Hint: you can use np.random.randint(max_int, size=n) to generate n random numbers from 0 to max_int)" 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "metadata": {}, 50 | "source": [ 51 | "## $\\color{green}{\\text{Excercise}}$ Drawing random integers without repetition\n", 52 | "\n", 53 | "Generate a random sequence of 10 integers from 1 to 100 without repetition (Hint: you may want to use np.random.rand and np.argsort)." 54 | ] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "metadata": {}, 59 | "source": [ 60 | "## $\\color{green}{\\text{Excercise}}$\n", 61 | "\n", 62 | "Generate a 10 x 3 array of random numbers (using np.random.rand). From each row, find the column index of the element closest to 0.75. Make use of np.abs and np.argmin. The result should be a one-dimensional array of integers from 0 to 2." 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": null, 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [ 71 | "import numpy as np\n", 72 | "\n", 73 | "rand_array = np.random.rand(10,3)\n", 74 | "rand_array2 = rand_array - 0.75\n", 75 | "closest = np.argmin(np.abs(rand_array2),1)\n", 76 | "print(rand_array)\n", 77 | "print(closest)" 78 | ] 79 | }, 80 | { 81 | "cell_type": "markdown", 82 | "metadata": {}, 83 | "source": [ 84 | "[Previous: Broadcasting](broadcasting.ipynb)
[Back to index](../01-numpy-introduction.ipynb)" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": null, 90 | "metadata": {}, 91 | "outputs": [], 92 | "source": [] 93 | } 94 | ], 95 | "metadata": { 96 | "kernelspec": { 97 | "display_name": "Python 3", 98 | "language": "python", 99 | "name": "python3" 100 | }, 101 | "language_info": { 102 | "codemirror_mode": { 103 | "name": "ipython", 104 | "version": 3 105 | }, 106 | "file_extension": ".py", 107 | "mimetype": "text/x-python", 108 | "name": "python", 109 | "nbconvert_exporter": "python", 110 | "pygments_lexer": "ipython3", 111 | "version": "3.7.2" 112 | } 113 | }, 114 | "nbformat": 4, 115 | "nbformat_minor": 2 116 | } 117 | -------------------------------------------------------------------------------- /Day_1_Scientific_Python/numpy_with_answers/numpys/fancy_indexing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Fancy indexing" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "Indexing can be done with a list or an array of integers. In this case the same index can be also repeated several times:" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "import numpy as np\n", 24 | "import matplotlib.pylab as plt" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": null, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "a = np.arange(0, 100, 10)\n", 34 | "a" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "a[[2, 3, 2, 4, 2]] " 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": {}, 49 | "source": [ 50 | "New values can be also assigned with this kind of indexing:" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": null, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "a[[9, 7]] = -100\n", 60 | "a" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": {}, 66 | "source": [ 67 | "When a new array is created by indexing with an array of integers, the new array has the same shape than the array of integers. " 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "a = np.arange(10)\n", 77 | "idx = np.array([[3, 4], [9, 7]])\n", 78 | "print('idx shape: {}'.format(idx.shape))\n", 79 | "a[idx]" 80 | ] 81 | }, 82 | { 83 | "cell_type": "markdown", 84 | "metadata": {}, 85 | "source": [ 86 | "Fancy indexing is often used to re-order or sort data. You can easily obtain the indices required to sort data using np.argsort:" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": null, 92 | "metadata": {}, 93 | "outputs": [], 94 | "source": [ 95 | "a = np.random.randint(10, size=5)\n", 96 | "a" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": null, 102 | "metadata": {}, 103 | "outputs": [], 104 | "source": [ 105 | "i = np.argsort(a)\n", 106 | "a[i]" 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "metadata": {}, 112 | "source": [ 113 | "Note that fancy indexing returns a copy and not a view." 114 | ] 115 | }, 116 | { 117 | "cell_type": "markdown", 118 | "metadata": {}, 119 | "source": [ 120 | "[Previous: Savez() and load()](savez.ipynb)
[Next: Broadcasting](broadcasting.ipynb)" 121 | ] 122 | } 123 | ], 124 | "metadata": { 125 | "kernelspec": { 126 | "display_name": "Python 3", 127 | "language": "python", 128 | "name": "python3" 129 | }, 130 | "language_info": { 131 | "codemirror_mode": { 132 | "name": "ipython", 133 | "version": 3 134 | }, 135 | "file_extension": ".py", 136 | "mimetype": "text/x-python", 137 | "name": "python", 138 | "nbconvert_exporter": "python", 139 | "pygments_lexer": "ipython3", 140 | "version": "3.7.2" 141 | } 142 | }, 143 | "nbformat": 4, 144 | "nbformat_minor": 2 145 | } 146 | -------------------------------------------------------------------------------- /Day_1_Scientific_Python/numpys/fancy_indexing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Fancy indexing" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "Indexing can be done with a list or an array of integers. In this case the same index can be also repeated several times:" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "import numpy as np\n", 24 | "import matplotlib.pylab as plt" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": null, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "a = np.arange(0, 100, 10)\n", 34 | "a" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "a[[2, 3, 2, 4, 2]] " 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": {}, 49 | "source": [ 50 | "New values can be also assigned with this kind of indexing:" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": null, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "a[[9, 7]] = -100\n", 60 | "a" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": {}, 66 | "source": [ 67 | "When a new array is created by indexing with an array of integers, the new array has the same shape than the array of integers. " 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "a = np.arange(10)\n", 77 | "idx = np.array([[3, 4], [9, 7]])\n", 78 | "print('idx shape: {}'.format(idx.shape))\n", 79 | "a[idx]" 80 | ] 81 | }, 82 | { 83 | "cell_type": "markdown", 84 | "metadata": {}, 85 | "source": [ 86 | "Fancy indexing is often used to re-order or sort data. You can easily obtain the indices required to sort data using np.argsort:" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": null, 92 | "metadata": {}, 93 | "outputs": [], 94 | "source": [ 95 | "a = np.random.randint(10, size=5)\n", 96 | "a" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": null, 102 | "metadata": {}, 103 | "outputs": [], 104 | "source": [ 105 | "i = np.argsort(a)\n", 106 | "a[i]" 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "metadata": {}, 112 | "source": [ 113 | "Note that fancy indexing returns a copy and not a view." 114 | ] 115 | }, 116 | { 117 | "cell_type": "markdown", 118 | "metadata": {}, 119 | "source": [ 120 | "[Previous: Savez() and load()](savez.ipynb)
[Next: Broadcasting](broadcasting.ipynb)" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": null, 126 | "metadata": {}, 127 | "outputs": [], 128 | "source": [] 129 | } 130 | ], 131 | "metadata": { 132 | "kernelspec": { 133 | "display_name": "Python 3", 134 | "language": "python", 135 | "name": "python3" 136 | }, 137 | "language_info": { 138 | "codemirror_mode": { 139 | "name": "ipython", 140 | "version": 3 141 | }, 142 | "file_extension": ".py", 143 | "mimetype": "text/x-python", 144 | "name": "python", 145 | "nbconvert_exporter": "python", 146 | "pygments_lexer": "ipython3", 147 | "version": "3.7.2" 148 | } 149 | }, 150 | "nbformat": 4, 151 | "nbformat_minor": 2 152 | } 153 | -------------------------------------------------------------------------------- /Day_2_Machine_Learning_Python/02_basic_preprocessing_exercise_01_solution.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Solution for Exercise 01\n", 8 | "\n", 9 | "The goal of is to compare the performance of our classifier to some baseline classifier that would ignore the input data and instead make constant predictions:" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import pandas as pd\n", 19 | "\n", 20 | "df = pd.read_csv(\n", 21 | " \"https://www.openml.org/data/get_csv/1595261/adult-census.csv\")" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": null, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "target_name = \"class\"\n", 31 | "target = df[target_name].to_numpy()\n", 32 | "data = df.drop(columns=[target_name, \"fnlwgt\"])\n", 33 | "numerical_columns = [\n", 34 | " c for c in data.columns if data[c].dtype.kind in [\"i\", \"f\"]]\n", 35 | "data_numeric = data[numerical_columns]" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "from sklearn.model_selection import cross_val_score\n", 45 | "from sklearn.dummy import DummyClassifier\n", 46 | "\n", 47 | "high_revenue_clf = DummyClassifier(strategy=\"constant\",\n", 48 | " constant=\" >50K\")\n", 49 | "scores = cross_val_score(high_revenue_clf, data_numeric, target)\n", 50 | "print(f\"{scores.mean():.3f} +/- {scores.std():.3f}\")" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": null, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "low_revenue_clf = DummyClassifier(strategy=\"constant\",\n", 60 | " constant=\" <=50K\")\n", 61 | "scores = cross_val_score(low_revenue_clf, data_numeric, target)\n", 62 | "print(f\"{scores.mean():.3f} +/- {scores.std():.3f}\")" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": null, 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [ 71 | "most_freq_revenue_clf = DummyClassifier(strategy=\"most_frequent\")\n", 72 | "scores = cross_val_score(most_freq_revenue_clf, data_numeric, target)\n", 73 | "print(f\"{scores.mean():.3f} +/- {scores.std():.3f}\")" 74 | ] 75 | }, 76 | { 77 | "cell_type": "markdown", 78 | "metadata": {}, 79 | "source": [ 80 | "So 81% accuracy is significantly better than 76% which is the score of a baseline model that would always predict the most frequent class which is the low revenue class: `\" <=50K\"`.\n", 81 | "\n", 82 | "In this dataset, we can see that the target classes are 
imbalanced: almost 3/4 of the records are people with a revenue below 50K:" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": null, 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [ 91 | "df[\"class\"].value_counts()" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [ 100 | "(target == \" <=50K\").mean()" 101 | ] 102 | } 103 | ], 104 | "metadata": { 105 | "jupytext": { 106 | "formats": "python_scripts//py:percent,notebooks//ipynb" 107 | }, 108 | "kernelspec": { 109 | "display_name": "Python 3", 110 | "language": "python", 111 | "name": "python3" 112 | } 113 | }, 114 | "nbformat": 4, 115 | "nbformat_minor": 2 116 | } 117 | -------------------------------------------------------------------------------- /Day_2_Machine_Learning_Python/04_basic_parameters_tuning_exercise_01.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Exercise 01\n", 8 | "The goal is to write an exhaustive search to find the best parameter\n", 9 | "combination maximizing the model performance." 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import pandas as pd\n", 19 | "\n", 20 | "from sklearn.model_selection import train_test_split\n", 21 | "from sklearn.preprocessing import OrdinalEncoder\n", 22 | "from sklearn.model_selection import RandomizedSearchCV\n", 23 | "from sklearn.compose import ColumnTransformer\n", 24 | "from sklearn.pipeline import Pipeline\n", 25 | "# This line is currently required to import HistGradientBoostingClassifier\n", 26 | "from sklearn.experimental import enable_hist_gradient_boosting\n", 27 | "from sklearn.ensemble import HistGradientBoostingClassifier\n", 28 | "\n", 29 | "from scipy.stats import expon, uniform\n", 30 | "from scipy.stats import randint\n", 31 | "\n", 32 | "df = pd.read_csv(\n", 33 | "    \"https://www.openml.org/data/get_csv/1595261/adult-census.csv\")\n", 34 | "# Or use the local copy:\n", 35 | "# df = pd.read_csv('../datasets/adult-census.csv')\n", 36 | "\n", 37 | "target_name = \"class\"\n", 38 | "target = df[target_name].to_numpy()\n", 39 | "data = df.drop(columns=target_name)\n", 40 | "\n", 41 | "df_train, df_test, target_train, target_test = train_test_split(\n", 42 | "    data, target, random_state=42)\n", 43 | "\n", 47 | "categorical_columns = [\n", 48 | "    'workclass', 'education', 'marital-status', 'occupation',\n", 49 | "    'relationship', 'race', 'native-country', 'sex']\n", 50 | "\n", 51 | "categories = [data[column].unique()\n", 52 | "              for column in data[categorical_columns]]\n", 53 | "\n", 54 | "categorical_preprocessor = OrdinalEncoder(categories=categories)\n", 55 | "\n", 56 | "preprocessor = ColumnTransformer(\n", 57 | "    [('cat-preprocessor', categorical_preprocessor, categorical_columns)],\n", 58 | "    remainder='passthrough', sparse_threshold=0)\n", 59 | "\n", 62 | "from sklearn.pipeline import make_pipeline\n", 63 | "\n", 64 | "model = make_pipeline(\n", 65 | "    preprocessor, HistGradientBoostingClassifier(random_state=42))" 66 | ] 67 | }, 68 | { 69 | "cell_type": 
"markdown", 70 | "metadata": {}, 71 | "source": [ 72 | "TODO: write your solution here\n", 73 | "\n", 74 | "Use the previously defined model (called `model`) and using two nested `for`\n", 75 | "loops, make a search of the best combinations of the `learning_rate` and\n", 76 | "`max_leaf_nodes` parameters. In this regard, you will need to train and test\n", 77 | "the model by setting the parameters. The evaluation of the model should be\n", 78 | "performed using `cross_val_score`. We can propose to define the following\n", 79 | "parameters search:\n", 80 | "- `learning_rate` for the values 0.01, 0.1, and 1;\n", 81 | "- `max_leaf_nodes` for the values 5, 25, 45." 82 | ] 83 | } 84 | ], 85 | "metadata": { 86 | "jupytext": { 87 | "formats": "python_scripts//py:percent,notebooks//ipynb" 88 | }, 89 | "kernelspec": { 90 | "display_name": "Python 3", 91 | "language": "python", 92 | "name": "python3" 93 | } 94 | }, 95 | "nbformat": 4, 96 | "nbformat_minor": 2 97 | } 98 | -------------------------------------------------------------------------------- /Day_1_Scientific_Python/numpy_with_answers/01-numpy-introduction.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Introduction to Data Science in Python\n", 8 | "\n", 9 | "### Numpy tutorial, March, 13th 2019\n", 10 | "\n", 11 | "Working efficiently with multi-dimensional arrays in NumPy\n", 12 | "\n", 13 | "Maria Teleńczuk
\n", 14 | "email: maria@telenczuk.pl" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "## Quick introduction to Jupyter notebook\n", 22 | "\n", 23 | "to run Jupyter notebook in your terminal type:
\n", 24 | " `jupyter notebook`\n", 25 | " \n", 26 | "**Esc** : takes you into command mode, there you can use:\n", 27 | " - __a__ : insert a new cell above
\n", 28 | " - __b__ : insert a new cell below
\n", 29 | " - **m** : change the current cell to Markdown
\n", 30 | " - **y** : change the current cell to code\n", 31 | "\n", 32 | "**Enter** : go back to edit mode\n", 33 | " \n", 34 | "**Shift + Enter** : execute the cell, move to the cell below\n", 35 | "\n", 36 | "__?__ : help" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [ 43 | "# Topics\n", 44 | "1. [Introduction to numpy](numpys/numpy_intro.ipynb)\n", 45 | "2. [Working with a dataset](numpys/dataset_intro.ipynb)\n", 46 | "3. [Filtering data](numpys/filtering_data.ipynb)\n", 47 | "4. [Slices](numpys/slices.ipynb)\n", 48 | "5. [Operations](numpys/operations.ipynb)\n", 49 | "6. [Stacking](numpys/stacking.ipynb) \n", 50 | "\n", 51 | "### Extra topics\n", 52 | "\n", 53 | "7. [K-means clustering](numpys/k_means.ipynb)\n", 54 | "8. [Savez() and load()](numpys/savez.ipynb)\n", 55 | "9. [Fancy indexing](numpys/fancy_indexing.ipynb)\n", 56 | "10. [Broadcasting](numpys/broadcasting.ipynb)\n", 57 | "11. [Test yourself](numpys/test_yourself.ipynb)" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": {}, 63 | "source": [ 64 | "## Based on material from:\n", 65 | " - Software Carpentry, [Python Novice inflammation](https://github.com/swcarpentry/python-novice-inflammation)\n", 66 | " - Paris Software Carpentry, [Advanced numpy lesson](https://paris-swc.github.io/advanced-numpy-lesson/)\n", 67 | " - Bartosz Teleńczuk, [Advanced Numpy tutorial](https://github.com/paris-saclay-cds/data-science-workshop-2019/blob/master/Day_1_Scientific_Python/01-numpy-introduction.ipynb)\n", 68 | " " 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": {}, 74 | "source": [ 75 | " ### You might also like:\n", 76 | " - Gaël Varoquaux, Emmanuelle Gouillart and Olav Vahtras (editors), [SciPy Lectures](http://scipy-lectures.org/)\n", 77 | " - NumPy community, [NumPy Docs](https://docs.scipy.org/doc/numpy/)\n", 78 | " - Juan Nuñez-Iglesias, [Lecture on Advanced NumPy patterns](https://github.com/jni/aspp2015)\n", 79 | " - Stéfan van der Walt, [Advanced NumPy tutorial](https://python.g-node.org/python-summerschool-2014/numpy.html)\n", 80 | " - Nicolas Rougier, [100 NumPy exercises](https://github.com/rougier/numpy-100)\n", 81 | " - Bartosz Teleńczuk, [Advanced NumPy lesson](https://github.com/paris-swc/advanced-numpy-lesson)" 82 | ] 83 | } 84 | ], 85 | "metadata": { 86 | "kernelspec": { 87 | "display_name": "Python 3", 88 | "language": "python", 89 | "name": "python3" 90 | }, 91 | "language_info": { 92 | "codemirror_mode": { 93 | "name": "ipython", 94 | "version": 3 95 | }, 96 | "file_extension": ".py", 97 | "mimetype": "text/x-python", 98 | "name": "python", 99 | "nbconvert_exporter": "python", 100 | "pygments_lexer": "ipython3", 101 | "version": "3.7.2" 102 | } 103 | }, 104 | "nbformat": 4, 105 | "nbformat_minor": 2 106 | } 107 | -------------------------------------------------------------------------------- /img/webfont-ubuntu-400-300-100.css: -------------------------------------------------------------------------------- 1 | /* cyrillic-ext */ 2 | @font-face { 3 | font-family: 'Ubuntu'; 4 | font-style: normal; 5 | font-weight: 300; 6 | src: local('Ubuntu Light'), local('Ubuntu-Light'), url(https://fonts.gstatic.com/s/ubuntu/v9/X_EdMnknKUltk57alVVbVxJtnKITppOI_IvcXXDNrsc.woff2) format('woff2'); 7 | unicode-range: U+0460-052F, U+20B4, U+2DE0-2DFF, U+A640-A69F; 8 | } 9 | /* cyrillic */ 10 | @font-face { 11 | font-family: 'Ubuntu'; 12 | font-style: normal; 13 | font-weight: 300; 14 | src: local('Ubuntu Light'), local('Ubuntu-Light'), 
url(https://fonts.gstatic.com/s/ubuntu/v9/nBF2d6Y3AbOwfkBM-9HcWBJtnKITppOI_IvcXXDNrsc.woff2) format('woff2'); 15 | unicode-range: U+0400-045F, U+0490-0491, U+04B0-04B1, U+2116; 16 | } 17 | /* greek-ext */ 18 | @font-face { 19 | font-family: 'Ubuntu'; 20 | font-style: normal; 21 | font-weight: 300; 22 | src: local('Ubuntu Light'), local('Ubuntu-Light'), url(https://fonts.gstatic.com/s/ubuntu/v9/CdlIlwqST01WNAKqZbtZkhJtnKITppOI_IvcXXDNrsc.woff2) format('woff2'); 23 | unicode-range: U+1F00-1FFF; 24 | } 25 | /* greek */ 26 | @font-face { 27 | font-family: 'Ubuntu'; 28 | font-style: normal; 29 | font-weight: 300; 30 | src: local('Ubuntu Light'), local('Ubuntu-Light'), url(https://fonts.gstatic.com/s/ubuntu/v9/7k0RmqCN8EFxqS6sChuRzRJtnKITppOI_IvcXXDNrsc.woff2) format('woff2'); 31 | unicode-range: U+0370-03FF; 32 | } 33 | /* latin-ext */ 34 | @font-face { 35 | font-family: 'Ubuntu'; 36 | font-style: normal; 37 | font-weight: 300; 38 | src: local('Ubuntu Light'), local('Ubuntu-Light'), url(https://fonts.gstatic.com/s/ubuntu/v9/WtcvfJHWXKxx4x0kuS1koRJtnKITppOI_IvcXXDNrsc.woff2) format('woff2'); 39 | unicode-range: U+0100-024F, U+1E00-1EFF, U+20A0-20AB, U+20AD-20CF, U+2C60-2C7F, U+A720-A7FF; 40 | } 41 | /* latin */ 42 | @font-face { 43 | font-family: 'Ubuntu'; 44 | font-style: normal; 45 | font-weight: 300; 46 | src: local('Ubuntu Light'), local('Ubuntu-Light'), url(https://fonts.gstatic.com/s/ubuntu/v9/_aijTyevf54tkVDLy-dlnFtXRa8TVwTICgirnJhmVJw.woff2) format('woff2'); 47 | unicode-range: U+0000-00FF, U+0131, U+0152-0153, U+02C6, U+02DA, U+02DC, U+2000-206F, U+2074, U+20AC, U+2212, U+2215, U+E0FF, U+EFFD, U+F000; 48 | } 49 | /* cyrillic-ext */ 50 | @font-face { 51 | font-family: 'Ubuntu'; 52 | font-style: normal; 53 | font-weight: 400; 54 | src: local('Ubuntu'), url(https://fonts.gstatic.com/s/ubuntu/v9/ODszJI8YqNw8V2xPulzjO_esZW2xOQ-xsNqO47m55DA.woff2) format('woff2'); 55 | unicode-range: U+0460-052F, U+20B4, U+2DE0-2DFF, U+A640-A69F; 56 | } 57 | /* cyrillic */ 58 | @font-face { 59 | font-family: 'Ubuntu'; 60 | font-style: normal; 61 | font-weight: 400; 62 | src: local('Ubuntu'), url(https://fonts.gstatic.com/s/ubuntu/v9/iQ9VJx1UMASKNiGywyyCXvesZW2xOQ-xsNqO47m55DA.woff2) format('woff2'); 63 | unicode-range: U+0400-045F, U+0490-0491, U+04B0-04B1, U+2116; 64 | } 65 | /* greek-ext */ 66 | @font-face { 67 | font-family: 'Ubuntu'; 68 | font-style: normal; 69 | font-weight: 400; 70 | src: local('Ubuntu'), url(https://fonts.gstatic.com/s/ubuntu/v9/WkvQmvwsfw_KKeau9SlQ2_esZW2xOQ-xsNqO47m55DA.woff2) format('woff2'); 71 | unicode-range: U+1F00-1FFF; 72 | } 73 | /* greek */ 74 | @font-face { 75 | font-family: 'Ubuntu'; 76 | font-style: normal; 77 | font-weight: 400; 78 | src: local('Ubuntu'), url(https://fonts.gstatic.com/s/ubuntu/v9/gYAtqXUikkQjyJA1SnpDLvesZW2xOQ-xsNqO47m55DA.woff2) format('woff2'); 79 | unicode-range: U+0370-03FF; 80 | } 81 | /* latin-ext */ 82 | @font-face { 83 | font-family: 'Ubuntu'; 84 | font-style: normal; 85 | font-weight: 400; 86 | src: local('Ubuntu'), url(https://fonts.gstatic.com/s/ubuntu/v9/Wu5Iuha-XnKDBvqRwQzAG_esZW2xOQ-xsNqO47m55DA.woff2) format('woff2'); 87 | unicode-range: U+0100-024F, U+1E00-1EFF, U+20A0-20AB, U+20AD-20CF, U+2C60-2C7F, U+A720-A7FF; 88 | } 89 | /* latin */ 90 | @font-face { 91 | font-family: 'Ubuntu'; 92 | font-style: normal; 93 | font-weight: 400; 94 | src: local('Ubuntu'), url(https://fonts.gstatic.com/s/ubuntu/v9/sDGTilo5QRsfWu6Yc11AXg.woff2) format('woff2'); 95 | unicode-range: U+0000-00FF, U+0131, U+0152-0153, U+02C6, U+02DA, U+02DC, U+2000-206F, 
U+2074, U+20AC, U+2212, U+2215, U+E0FF, U+EFFD, U+F000; 96 | } 97 | -------------------------------------------------------------------------------- /Day_2_Machine_Learning_Python/figures/plot-simple-decision-tree-adult-census.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import pandas as pd 4 | 5 | import matplotlib.pyplot as plt 6 | from matplotlib.pyplot import cm 7 | from matplotlib.colors import ListedColormap 8 | 9 | import seaborn as sns 10 | 11 | from sklearn.preprocessing import LabelEncoder 12 | from sklearn.tree import DecisionTreeClassifier 13 | 14 | adult_census = pd.read_csv( 15 | "https://www.openml.org/data/get_csv/1595261/adult-census.csv") 16 | 17 | target_column = 'class' 18 | 19 | numerical_columns = [ 20 | 'age', 'education-num', 'capital-gain', 'capital-loss', 21 | 'hours-per-week'] 22 | categorical_columns = [ 23 | 'workclass', 'education', 'marital-status', 'occupation', 24 | 'relationship', 'race', 'sex', 'native-country'] 25 | all_columns = numerical_columns + categorical_columns + [ 26 | target_column] 27 | 28 | adult_census = adult_census[all_columns] 29 | 30 | n_samples_to_plot = 5000 31 | columns = ['age', 'education-num', 'hours-per-week'] 32 | _ = sns.pairplot(data=adult_census[:n_samples_to_plot], vars=columns, 33 | hue=target_column, plot_kws={'alpha': 0.2}, 34 | height=4, diag_kind='hist') 35 | 36 | _ = sns.pairplot(data=adult_census[:n_samples_to_plot], x_vars='age', 37 | y_vars='hours-per-week', hue=target_column, 38 | markers=['o', 39 | 'v'], plot_kws={'alpha': 0.2}, height=12) 40 | 41 | top = cm.get_cmap('Oranges', 128) 42 | bottom = cm.get_cmap('Blues_r', 128) 43 | 44 | colors = np.vstack([bottom(np.linspace(0, 1, 128)), 45 | top(np.linspace(0, 1, 128))]) 46 | blue_orange_cmap = ListedColormap(colors, name='BlueOrange') 47 | 48 | 49 | def plot_tree_decision_function(tree, X, y, ax): 50 | """Plot the different decision rules found by a `DecisionTreeClassifier`. 51 | 52 | Parameters 53 | ---------- 54 | tree : DecisionTreeClassifier instance 55 | The decision tree to inspect. 56 | X : dataframe of shape (n_samples, n_features) 57 | The data used to train the `tree` estimator. 58 | y : ndarray of shape (n_samples,) 59 | The target used to train the `tree` estimator. 60 | ax : matplotlib axis 61 | The matplotlib axis where to plot the different decision rules. 
62 | """ 63 | import numpy as np 64 | from scipy import ndimage 65 | 66 | h = 0.02 67 | x_min, x_max = 0, 100 68 | y_min, y_max = 0, 100 69 | xx, yy = np.meshgrid(np.arange(x_min, x_max, h), 70 | np.arange(y_min, y_max, h)) 71 | 72 | Z = tree.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1] 73 | Z = Z.reshape(xx.shape) 74 | faces = tree.tree_.apply( 75 | np.c_[xx.ravel(), yy.ravel()].astype(np.float32)) 76 | faces = faces.reshape(xx.shape) 77 | border = ndimage.laplace(faces) != 0 78 | ax.scatter(X.iloc[:, 0], X.iloc[:, 1], 79 | c=np.array(['tab:blue', 80 | 'tab:orange'])[y], s=60, alpha=0.7, vmin=0, vmax=1) 81 | levels = np.linspace(0, 1, 101) 82 | contours = ax.contourf(xx, yy, Z, levels=levels, alpha=.4, cmap=blue_orange_cmap) 83 | ax.get_figure().colorbar(contours, ticks=np.linspace(0, 1, 11)) 84 | ax.scatter(xx[border], yy[border], marker='.', s=1) 85 | ax.set_xlabel(X.columns[0]) 86 | ax.set_ylabel(X.columns[1]) 87 | ax.set_xlim([x_min, x_max]) 88 | ax.set_ylim([y_min, y_max]) 89 | sns.despine(offset=10) 90 | 91 | 92 | # select a subset of data 93 | data_subset = adult_census[:n_samples_to_plot] 94 | X = data_subset[["age", "hours-per-week"]] 95 | y = LabelEncoder().fit_transform( 96 | data_subset[target_column].to_numpy()) 97 | 98 | max_leaf_nodes = 3 99 | tree = DecisionTreeClassifier(max_leaf_nodes=max_leaf_nodes, 100 | random_state=0) 101 | tree.fit(X, y) 102 | 103 | # plot the decision function learned by the tree 104 | fig, ax = plt.subplots() 105 | plot_tree_decision_function(tree, X, y, ax=ax) 106 | 107 | fig.savefig('simple-decision-tree-adult-census.png') 108 | -------------------------------------------------------------------------------- /Day_1_Scientific_Python/01-numpy-introduction.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Introduction to Data Science in Python\n", 8 | "\n", 9 | "### Numpy tutorial, November, 28th 2019\n", 10 | "\n", 11 | "Working efficiently with multi-dimensional arrays in NumPy\n", 12 | "\n", 13 | "Maria Teleńczuk
\n", 14 | "email: telenczukm at gmail.com" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "## Quick introduction to Jupyter notebook\n", 22 | "\n", 23 | "to run Jupyter notebook in your terminal type:
\n", 24 | " `jupyter notebook`\n", 25 | " \n", 26 | "alternatively you may want to run Jupyter lab which is more advanced product of jupyter:
\n", 27 | " `jupyter lab`\n", 28 | " \n", 29 | "**Esc** : takes you into command mode, there you can use:\n", 30 | " - __a__ : insert a new cell above
\n", 31 | " - __b__ : insert a new cell below
\n", 32 | " - **m** : change the current cell to Markdown
\n", 33 | " - **y** : change the current cell to code\n", 34 | "\n", 35 | "**Enter** : go back to edit mode\n", 36 | " \n", 37 | "**Shift + Enter** : execute the cell, move to the cell below\n", 38 | "\n", 39 | "__?__ : help" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": {}, 45 | "source": [ 46 | "# Topics\n", 47 | "1. [Introduction to numpy](numpys/numpy_intro.ipynb)\n", 48 | "2. [Working with a dataset](numpys/dataset_intro.ipynb)\n", 49 | "3. [Filtering data](numpys/filtering_data.ipynb)\n", 50 | "4. [Slices](numpys/slices.ipynb)\n", 51 | "5. [Operations](numpys/operations.ipynb)\n", 52 | "6. [Stacking](numpys/stacking.ipynb) \n", 53 | "\n", 54 | "### Extra topics\n", 55 | "\n", 56 | "7. [K-means clustering](numpys/k_means.ipynb)\n", 57 | "8. [Savez() and load()](numpys/savez.ipynb)\n", 58 | "9. [Fancy indexing](numpys/fancy_indexing.ipynb)\n", 59 | "10. [Broadcasting](numpys/broadcasting.ipynb)\n", 60 | "11. [Test yourself](numpys/test_yourself.ipynb)" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": {}, 66 | "source": [ 67 | "## Based on material from:\n", 68 | " - Software Carpentry, [Python Novice inflammation](https://github.com/swcarpentry/python-novice-inflammation)\n", 69 | " - Paris Software Carpentry, [Advanced numpy lesson](https://paris-swc.github.io/advanced-numpy-lesson/)\n", 70 | " - Bartosz Teleńczuk, [Advanced Numpy tutorial](https://github.com/paris-saclay-cds/data-science-workshop-2019/blob/master/Day_1_Scientific_Python/01-numpy-introduction.ipynb)\n", 71 | " " 72 | ] 73 | }, 74 | { 75 | "cell_type": "markdown", 76 | "metadata": {}, 77 | "source": [ 78 | " ### You might also like:\n", 79 | " - Gaël Varoquaux, Emmanuelle Gouillart and Olav Vahtras (editors), [SciPy Lectures](http://scipy-lectures.org/)\n", 80 | " - NumPy community, [NumPy Docs](https://docs.scipy.org/doc/numpy/)\n", 81 | " - Juan Nuñez-Iglesias, [Lecture on Advanced NumPy patterns](https://github.com/jni/aspp2015)\n", 82 | " - Stéfan van der Walt, [Advanced NumPy tutorial](https://python.g-node.org/python-summerschool-2014/numpy.html)\n", 83 | " - Nicolas Rougier, [100 NumPy exercises](https://github.com/rougier/numpy-100)\n", 84 | " - Bartosz Teleńczuk, [Advanced NumPy lesson](https://github.com/paris-swc/advanced-numpy-lesson)" 85 | ] 86 | } 87 | ], 88 | "metadata": { 89 | "kernelspec": { 90 | "display_name": "Python 3", 91 | "language": "python", 92 | "name": "python3" 93 | }, 94 | "language_info": { 95 | "codemirror_mode": { 96 | "name": "ipython", 97 | "version": 3 98 | }, 99 | "file_extension": ".py", 100 | "mimetype": "text/x-python", 101 | "name": "python", 102 | "nbconvert_exporter": "python", 103 | "pygments_lexer": "ipython3", 104 | "version": "3.7.2" 105 | } 106 | }, 107 | "nbformat": 4, 108 | "nbformat_minor": 2 109 | } 110 | -------------------------------------------------------------------------------- /Day_1_Scientific_Python/numpy_with_answers/numpys/filtering_data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Filtering data\n", 8 | "It's also possible to select elements (filter) based on a condition. 
" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": null, 14 | "metadata": {}, 15 | "outputs": [], 16 | "source": [ 17 | "import numpy as np\n", 18 | "data = np.loadtxt(fname='../data/inflammation-01.csv', delimiter=',')" 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "metadata": {}, 24 | "source": [ 25 | "Sometimes we may want to select array elements based on their values. For this case boolean mask is very useful. The mask is an array of the same length as the indexed array containg only False or True values:" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": null, 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "a = np.arange(4)\n", 35 | "print(a)" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "mask = np.array([False, True, True, False])" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "a[mask]" 54 | ] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "metadata": {}, 59 | "source": [ 60 | "In most cases the mask is constructed from the values of the array itself. For example, to select only odd numbers we could use the following mask:" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "odd = (a % 2) == 1\n", 70 | "odd" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": null, 76 | "metadata": {}, 77 | "outputs": [], 78 | "source": [ 79 | "a[odd]" 80 | ] 81 | }, 82 | { 83 | "cell_type": "markdown", 84 | "metadata": {}, 85 | "source": [ 86 | "This could be also done in a single step:" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": null, 92 | "metadata": {}, 93 | "outputs": [], 94 | "source": [ 95 | "a[(a % 2) == 1]" 96 | ] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": {}, 101 | "source": [ 102 | "## $\\color{green}{\\text{Excercise}}$ Filtering data\n", 103 | "In the `data` what do you have to do to select all measurments above 10 in the first patient (index 0)?" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": null, 109 | "metadata": {}, 110 | "outputs": [], 111 | "source": [ 112 | "patient0_data = data[0, :]\n", 113 | "patient0_data[patient0_data > 10]" 114 | ] 115 | }, 116 | { 117 | "cell_type": "markdown", 118 | "metadata": {}, 119 | "source": [ 120 | "We can also substitute the measurement with a new value:" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": null, 126 | "metadata": {}, 127 | "outputs": [], 128 | "source": [ 129 | "patient1_data = data[1, :]\n", 130 | "patient1_data[patient1_data > 10] = 10\n", 131 | "print(patient1_data)" 132 | ] 133 | }, 134 | { 135 | "cell_type": "markdown", 136 | "metadata": {}, 137 | "source": [ 138 | "[Previous: Working with a dataset](dataset_intro.ipynb)
[Next: Slices](slices.ipynb)" 139 | ] 140 | } 141 | ], 142 | "metadata": { 143 | "kernelspec": { 144 | "display_name": "Python 3", 145 | "language": "python", 146 | "name": "python3" 147 | }, 148 | "language_info": { 149 | "codemirror_mode": { 150 | "name": "ipython", 151 | "version": 3 152 | }, 153 | "file_extension": ".py", 154 | "mimetype": "text/x-python", 155 | "name": "python", 156 | "nbconvert_exporter": "python", 157 | "pygments_lexer": "ipython3", 158 | "version": "3.7.2" 159 | } 160 | }, 161 | "nbformat": 4, 162 | "nbformat_minor": 2 163 | } 164 | -------------------------------------------------------------------------------- /Day_1_Scientific_Python/numpys/filtering_data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Filtering data\n", 8 | "It's also possible to select elements (filter) based on a condition. " 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": null, 14 | "metadata": {}, 15 | "outputs": [], 16 | "source": [ 17 | "import numpy as np\n", 18 | "data = np.loadtxt(fname='../data/inflammation-01.csv', delimiter=',')" 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "metadata": {}, 24 | "source": [ 25 | "Sometimes we may want to select array elements based on their values. For this case a boolean mask is very useful. The mask is an array of the same length as the indexed array containing only False or True values:" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": null, 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "a = np.arange(4)\n", 35 | "print(a)" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "mask = np.array([False, True, True, False])" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "a[mask]" 54 | ] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "metadata": {}, 59 | "source": [ 60 | "In most cases the mask is constructed from the values of the array itself. For example, to select only odd numbers we could use the following mask:" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "odd = (a % 2) == 1\n", 70 | "odd" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": null, 76 | "metadata": {}, 77 | "outputs": [], 78 | "source": [ 79 | "a[odd]" 80 | ] 81 | }, 82 | { 83 | "cell_type": "markdown", 84 | "metadata": {}, 85 | "source": [ 86 | "This could also be done in a single step:" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": null, 92 | "metadata": {}, 93 | "outputs": [], 94 | "source": [ 95 | "a[(a % 2) == 1]" 96 | ] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": {}, 101 | "source": [ 102 | "## $\color{green}{\text{Exercise}}$ Filtering data\n", 103 | "In the `data`, what do you have to do to select all measurements above 10 in the first patient (index 0)?"
104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": null, 109 | "metadata": {}, 110 | "outputs": [], 111 | "source": [] 112 | }, 113 | { 114 | "cell_type": "markdown", 115 | "metadata": {}, 116 | "source": [ 117 | "We can also substitute the measurement with a new value:" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "metadata": {}, 124 | "outputs": [], 125 | "source": [ 126 | "patient1_data = data[1, :]\n", 127 | "patient1_data[patient1_data > 10] = 10\n", 128 | "print(patient1_data)" 129 | ] 130 | }, 131 | { 132 | "cell_type": "markdown", 133 | "metadata": {}, 134 | "source": [ 135 | "[Previous: Working with a dataset](dataset_intro.ipynb)
[Next: Slices](slices.ipynb)" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [] 144 | } 145 | ], 146 | "metadata": { 147 | "kernelspec": { 148 | "display_name": "Python 3", 149 | "language": "python", 150 | "name": "python3" 151 | }, 152 | "language_info": { 153 | "codemirror_mode": { 154 | "name": "ipython", 155 | "version": 3 156 | }, 157 | "file_extension": ".py", 158 | "mimetype": "text/x-python", 159 | "name": "python", 160 | "nbconvert_exporter": "python", 161 | "pygments_lexer": "ipython3", 162 | "version": "3.7.2" 163 | } 164 | }, 165 | "nbformat": 4, 166 | "nbformat_minor": 2 167 | } 168 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 2-Day Workshop - Introduction to Data Science in Python 2 | 3 | Materials for the Paris-Saclay Center for Data Science Python workshop 4 | 5 | Data science is gaining attention and is impacting many scientific fields and applications. Data science encompasses a large number of topics such as data mining, data wrangling, data visualisation, pattern recognition, or machine learning. 6 | 7 | This workshop intends to give an introduction to some of these topics using Python and the PyData ecosystem. It is not a course on deep learning. 8 | 9 | *Note: the material in this repo is WIP, not the finalized material.* 10 | 11 | You can run the notebooks in a binder: [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/paris-saclay-cds/data-science-workshop-2019/master) 12 | 13 | ## Program 14 | 15 | ### Day 1 - Data wrangling, exploration, and visualisation 16 | 17 | **Goal:** introduce the PyData ecosystem to manipulate, explore, and visualize data. 18 | 19 | * Introduction to the basics of numpy, pandas, and matplotlib. 20 | 21 | ### Day 2 - Machine learning 22 | 23 | **Goal:** introduce the basics of machine learning using the scikit-learn library. 24 | 25 | * Get familiar with general principles of machine learning; 26 | * Use these principles by using the scikit-learn library on some toy and real-world data examples. 27 | 28 | 29 | ## Getting started 30 | 31 | The course uses Python 3 and some data analysis packages such as NumPy, pandas, scikit-learn, matplotlib, and seaborn. To install the required libraries, we highly recommend Anaconda or Miniconda, or another Python distribution that includes the scientific libraries (this recommendation applies to all platforms: Windows, Linux, and Mac). 32 | 33 | ### Install Anaconda 34 | 35 | For first-time users and people not fully confident with the command line, we advise installing Anaconda by downloading and installing the Python 3.x version from the Anaconda website. Recent computers will require the 64-bit installer. 36 | 37 | For more detailed instructions to install Anaconda, check the [Windows](https://docs.anaconda.com/anaconda/install/windows/), [Mac](https://docs.anaconda.com/anaconda/install/mac-os/) or [Linux](https://docs.anaconda.com/anaconda/install/linux/) installation tutorial. 38 | 39 | **Note:** when you are already familiar with the command line and Python environments, you could opt to use Miniconda instead of Anaconda and download it from the Conda website.
The main difference is that Anaconda provides a graphical user interface (Anaconda Navigator) and a large collection of scientific packages when installing, whereas for Miniconda the user needs to install all packages using the command line. On the other hand, Miniconda requires less disk space. If you choose Miniconda, create the workshop environment using the `environment.yml` file: `conda env create -f environment.yml` 40 | 41 | ### Install/check of required packages 42 | 43 | This tutorial will require recent installations of 44 | 45 | - [NumPy](http://www.numpy.org) 46 | - [SciPy](http://www.scipy.org) 47 | - [matplotlib](http://matplotlib.org) 48 | - [pandas](http://pandas.pydata.org) 49 | - [pillow](https://python-pillow.org) 50 | - [scikit-learn](http://scikit-learn.org/stable/) 51 | - [seaborn](http://seaborn.pydata.org/) 52 | - [IPython](http://ipython.readthedocs.org/en/stable/) 53 | - [Jupyter notebook](http://jupyter.org) 54 | - [plotly](https://plot.ly/) 55 | - [pandas-profiling](https://pandas-profiling.github.io/pandas-profiling/docs/) 56 | 57 | 58 | The Jupyter installation is particularly important: you should be able to type 59 | 60 | ```bash 61 | jupyter notebook 62 | ``` 63 | 64 | in your terminal window and see the notebook panel load in your web browser. Try opening and running a notebook from the material to check that it works. Alternatively, you can use Jupyter Lab. 65 | 66 | After obtaining the material, we **strongly recommend** that you execute the environment check script with `python check_env.py`; it is located at the top level of this repository. 67 | 68 | We also recommend updating scikit-learn to the latest release to ensure the best compatibility with the teaching material. Please upgrade already-installed packages by executing 69 | 70 | ```bash 71 | conda update [package-name] 72 | ``` 73 | 74 | (the exact command depends on how you installed ``scikit-learn``). 75 | 76 | 77 | 78 | -------------------------------------------------------------------------------- /Day_2_Machine_Learning_Python/04_basic_parameters_tuning_exercise_02.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Exercise 02\n", 8 | "The goal is to find the best set of hyper-parameters which maximizes the\n", 9 | "performance on a training set." 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import numpy as np\n", 19 | "import pandas as pd\n", 20 | "\n", 21 | "df = pd.read_csv(\n", 22 | "    \"https://www.openml.org/data/get_csv/1595261/adult-census.csv\")\n", 23 | "# Or use the local copy:\n", 24 | "# df = pd.read_csv('../datasets/adult-census.csv')\n", 25 | "\n", 26 | "target_name = \"class\"\n", 27 | "target = df[target_name].to_numpy()\n", 28 | "data = df.drop(columns=target_name)\n", 29 | "\n", 30 | "from sklearn.model_selection import train_test_split\n", 31 | "\n", 32 | "df_train, df_test, target_train, target_test = train_test_split(\n", 33 | "    data, target, random_state=42)" 34 | ] 35 | }, 36 | { 37 | "cell_type": "markdown", 38 | "metadata": {}, 39 | "source": [ 40 | "TODO: create your machine learning pipeline\n", 41 | "\n", 42 | "You should:\n", 43 | "* preprocess the categorical columns using a `OneHotEncoder` and use a\n", 44 | "  `StandardScaler` to normalize the numerical data.\n", 45 | "* use a `LogisticRegression` as a predictive model (one possible assembly is sketched below).
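\n", "\n", "A minimal sketch of one such pipeline (not the official solution; it assumes you have built the lists `categorical_columns` and `numerical_columns` of column names):\n", "\n", "```python\n", "from sklearn.compose import ColumnTransformer\n", "from sklearn.pipeline import make_pipeline\n", "from sklearn.preprocessing import OneHotEncoder, StandardScaler\n", "from sklearn.linear_model import LogisticRegression\n", "\n", "preprocessor = ColumnTransformer([\n", "    # one-hot encode the categorical columns, scale the numerical ones\n", "    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns),\n", "    ('num', StandardScaler(), numerical_columns)])\n", "model = make_pipeline(preprocessor, LogisticRegression(max_iter=1000))\n", "model.fit(df_train, target_train)\n", "```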
46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": { 51 | "lines_to_next_cell": 0 52 | }, 53 | "source": [ 54 | "Start by defining the columns and the preprocessing pipelines to be applied\n", 55 | "on each column." 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "\n", 65 | "from sklearn.preprocessing import OneHotEncoder\n", 66 | "from sklearn.preprocessing import StandardScaler" 67 | ] 68 | }, 69 | { 70 | "cell_type": "markdown", 71 | "metadata": { 72 | "lines_to_next_cell": 0 73 | }, 74 | "source": [ 75 | "Subsequently, create a `ColumnTransformer` to redirect the specific columns\n", 76 | "to a preprocessing pipeline." 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": null, 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [ 85 | "\n", 86 | "from sklearn.compose import ColumnTransformer" 87 | ] 88 | }, 89 | { 90 | "cell_type": "markdown", 91 | "metadata": { 92 | "lines_to_next_cell": 0 93 | }, 94 | "source": [ 95 | "Finally, concatenate the preprocessing pipeline with a logistic regression." 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": null, 101 | "metadata": { 102 | "lines_to_next_cell": 2 103 | }, 104 | "outputs": [], 105 | "source": [ 106 | "\n", 107 | "from sklearn.pipeline import make_pipeline\n", 108 | "from sklearn.linear_model import LogisticRegression" 109 | ] 110 | }, 111 | { 112 | "cell_type": "markdown", 113 | "metadata": {}, 114 | "source": [ 115 | "TODO: make your random search\n", 116 | "\n", 117 | "Use a `RandomizedSearchCV` to find the best set of hyper-parameters by tuning\n", 118 | "the following parameters for the `LogisticRegression` model:\n", 119 | "- `C` with values ranging from 0.001 to 10. You can use a reciprocal\n", 120 | "  distribution (i.e. `scipy.stats.reciprocal`);\n", 121 | "- `solver` with possible values being `\"liblinear\"` and `\"lbfgs\"`;\n", 122 | "- `penalty` with possible values being `\"l2\"` and `\"l1\"`.\n", 123 | "In addition, try several preprocessing strategies with the `OneHotEncoder`\n", 124 | "by always (or not) dropping the first column when encoding the categorical\n", 125 | "data.\n", 126 | "\n", 127 | "Note: you can tolerate failures during a grid-search or a randomized-search\n", 128 | "by setting `error_score` to `np.nan`, for instance." 129 | ] 130 | } 131 | ], 132 | "metadata": { 133 | "jupytext": { 134 | "formats": "python_scripts//py:percent,notebooks//ipynb" 135 | }, 136 | "kernelspec": { 137 | "display_name": "Python 3", 138 | "language": "python", 139 | "name": "python3" 140 | } 141 | }, 142 | "nbformat": 4, 143 | "nbformat_minor": 2 144 | } 145 | -------------------------------------------------------------------------------- /Day_1_Scientific_Python/numpys/boolean_mask.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Boolean mask\n", 8 | "\n", 9 | "Sometimes we may want to select array elements based on their values. For this case a boolean mask is very useful.
The mask is an array of the same length as the indexed array containing only False or True values:" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import numpy as np" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 2, 24 | "metadata": {}, 25 | "outputs": [ 26 | { 27 | "name": "stdout", 28 | "output_type": "stream", 29 | "text": [ 30 | "[0 1 2 3]\n" 31 | ] 32 | } 33 | ], 34 | "source": [ 35 | "a = np.arange(4)\n", 36 | "print(a)" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 3, 42 | "metadata": {}, 43 | "outputs": [ 44 | { 45 | "data": { 46 | "text/plain": [ 47 | "array([1, 2])" 48 | ] 49 | }, 50 | "execution_count": 3, 51 | "metadata": {}, 52 | "output_type": "execute_result" 53 | } 54 | ], 55 | "source": [ 56 | "mask = np.array([False, True, True, False])\n", 57 | "a[mask]" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": {}, 63 | "source": [ 64 | "In most cases the mask is constructed from the values of the array itself. For example, to select only odd numbers we could use the following mask:" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 4, 70 | "metadata": {}, 71 | "outputs": [ 72 | { 73 | "data": { 74 | "text/plain": [ 75 | "array([False,  True, False,  True])" 76 | ] 77 | }, 78 | "execution_count": 4, 79 | "metadata": {}, 80 | "output_type": "execute_result" 81 | } 82 | ], 83 | "source": [ 84 | "odd = (a % 2) == 1\n", 85 | "odd" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 5, 91 | "metadata": {}, 92 | "outputs": [ 93 | { 94 | "data": { 95 | "text/plain": [ 96 | "array([1, 3])" 97 | ] 98 | }, 99 | "execution_count": 5, 100 | "metadata": {}, 101 | "output_type": "execute_result" 102 | } 103 | ], 104 | "source": [ 105 | "np.array([False, True, False, True], dtype=bool)\n", 106 | "a[odd]" 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "metadata": {}, 112 | "source": [ 113 | "This could also be done in a single step:" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 6, 119 | "metadata": {}, 120 | "outputs": [ 121 | { 122 | "data": { 123 | "text/plain": [ 124 | "array([1, 3])" 125 | ] 126 | }, 127 | "execution_count": 6, 128 | "metadata": {}, 129 | "output_type": "execute_result" 130 | } 131 | ], 132 | "source": [ 133 | "a[(a % 2) == 1]" 134 | ] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "metadata": {}, 139 | "source": [ 140 | "## $\color{green}{\text{Quiz}}$ view or copy\n", 141 | "What are the final values of a and b at the end of the following program? Explain why.\n", 142 | "\n", 143 | "`a = np.arange(5)\n", 144 | "b = a[a < 3]\n", 145 | "b[::2] = 0`\n", 146 | "\n", 147 | "\n", 148 | "a) a = [0, 1, 2, 3, 4], b = [0, 1, 2]
\n", 149 | "b) a = [0, 1, 0, 3, 4], b = [0, 1, 0]
\n", 150 | "c) a = [0, 0, 2, 3, 4], b = [0, 0, 2]
\n", 151 | "d) a = [0, 1, 2, 3, 4], b = [0, 1, 0]
\n", 152 | "e) a = [0, 1, 2, 3, 4], b = [0, 1, 0, 3, 0]
" 153 | ] 154 | }, 155 | { 156 | "cell_type": "markdown", 157 | "metadata": {}, 158 | "source": [ 159 | "[Previous: Operations](operations.ipynb)
[Next: Stacking](stacking.ipynb)" 160 | ] 161 | } 162 | ], 163 | "metadata": { 164 | "kernelspec": { 165 | "display_name": "Python 3", 166 | "language": "python", 167 | "name": "python3" 168 | }, 169 | "language_info": { 170 | "codemirror_mode": { 171 | "name": "ipython", 172 | "version": 3 173 | }, 174 | "file_extension": ".py", 175 | "mimetype": "text/x-python", 176 | "name": "python", 177 | "nbconvert_exporter": "python", 178 | "pygments_lexer": "ipython3", 179 | "version": "3.7.2" 180 | } 181 | }, 182 | "nbformat": 4, 183 | "nbformat_minor": 2 184 | } 185 | -------------------------------------------------------------------------------- /Day_1_Scientific_Python/numpy_with_answers/numpys/boolean_mask.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Boolean mask\n", 8 | "\n", 9 | "Sometimes we may want to select array elements based on their values. For this case a boolean mask is very useful. The mask is an array of the same length as the indexed array containing only False or True values:" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import numpy as np" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 2, 24 | "metadata": {}, 25 | "outputs": [ 26 | { 27 | "name": "stdout", 28 | "output_type": "stream", 29 | "text": [ 30 | "[0 1 2 3]\n" 31 | ] 32 | } 33 | ], 34 | "source": [ 35 | "a = np.arange(4)\n", 36 | "print(a)" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 3, 42 | "metadata": {}, 43 | "outputs": [ 44 | { 45 | "data": { 46 | "text/plain": [ 47 | "array([1, 2])" 48 | ] 49 | }, 50 | "execution_count": 3, 51 | "metadata": {}, 52 | "output_type": "execute_result" 53 | } 54 | ], 55 | "source": [ 56 | "mask = np.array([False, True, True, False])\n", 57 | "a[mask]" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": {}, 63 | "source": [ 64 | "In most cases the mask is constructed from the values of the array itself.
For example, to select only odd numbers we could use the following mask:" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 4, 70 | "metadata": {}, 71 | "outputs": [ 72 | { 73 | "data": { 74 | "text/plain": [ 75 | "array([False,  True, False,  True])" 76 | ] 77 | }, 78 | "execution_count": 4, 79 | "metadata": {}, 80 | "output_type": "execute_result" 81 | } 82 | ], 83 | "source": [ 84 | "odd = (a % 2) == 1\n", 85 | "odd" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 5, 91 | "metadata": {}, 92 | "outputs": [ 93 | { 94 | "data": { 95 | "text/plain": [ 96 | "array([1, 3])" 97 | ] 98 | }, 99 | "execution_count": 5, 100 | "metadata": {}, 101 | "output_type": "execute_result" 102 | } 103 | ], 104 | "source": [ 105 | "np.array([False, True, False, True], dtype=bool)\n", 106 | "a[odd]" 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "metadata": {}, 112 | "source": [ 113 | "This could also be done in a single step:" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 6, 119 | "metadata": {}, 120 | "outputs": [ 121 | { 122 | "data": { 123 | "text/plain": [ 124 | "array([1, 3])" 125 | ] 126 | }, 127 | "execution_count": 6, 128 | "metadata": {}, 129 | "output_type": "execute_result" 130 | } 131 | ], 132 | "source": [ 133 | "a[(a % 2) == 1]" 134 | ] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "metadata": {}, 139 | "source": [ 140 | "## $\color{green}{\text{Quiz}}$ view or copy\n", 141 | "What are the final values of a and b at the end of the following program? Explain why.\n", 142 | "\n", 143 | "`a = np.arange(5)\n", 144 | "b = a[a < 3]\n", 145 | "b[::2] = 0`\n", 146 | "\n", 147 | "\n", 148 | "a) a = [0, 1, 2, 3, 4], b = [0, 1, 2]
\n", 149 | "b) a = [0, 1, 0, 3, 4], b = [0, 1, 0]
\n", 150 | "c) a = [0, 0, 2, 3, 4], b = [0, 0, 2]
\n", 151 | "d) a = [0, 1, 2, 3, 4], b = [0, 1, 0]
\n", 152 | "e) a = [0, 1, 2, 3, 4], b = [0, 1, 0, 3, 0]
" 153 | ] 154 | }, 155 | { 156 | "cell_type": "markdown", 157 | "metadata": {}, 158 | "source": [ 159 | "[Previous: Operations](operations.ipynb)
[Next: Stacking](stacking.ipynb)" 160 | ] 161 | } 162 | ], 163 | "metadata": { 164 | "kernelspec": { 165 | "display_name": "Python 3", 166 | "language": "python", 167 | "name": "python3" 168 | }, 169 | "language_info": { 170 | "codemirror_mode": { 171 | "name": "ipython", 172 | "version": 3 173 | }, 174 | "file_extension": ".py", 175 | "mimetype": "text/x-python", 176 | "name": "python", 177 | "nbconvert_exporter": "python", 178 | "pygments_lexer": "ipython3", 179 | "version": "3.7.2" 180 | } 181 | }, 182 | "nbformat": 4, 183 | "nbformat_minor": 2 184 | } 185 | -------------------------------------------------------------------------------- /Day_2_Machine_Learning_Python/04_basic_parameters_tuning_exercise_01_solution.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Exercise 01\n", 8 | "The goal is to write an exhaustive search to find the best parameters\n", 9 | "combination maximizing the model performance" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import pandas as pd\n", 19 | "\n", 20 | "from sklearn.model_selection import train_test_split\n", 21 | "from sklearn.preprocessing import OrdinalEncoder\n", 22 | "from sklearn.model_selection import RandomizedSearchCV\n", 23 | "from sklearn.compose import ColumnTransformer\n", 24 | "from sklearn.pipeline import Pipeline\n", 25 | "# This line is currently required to import HistGradientBoostingClassifier\n", 26 | "from sklearn.experimental import enable_hist_gradient_boosting\n", 27 | "from sklearn.ensemble import HistGradientBoostingClassifier\n", 28 | "\n", 29 | "from scipy.stats import expon, uniform\n", 30 | "from scipy.stats import randint\n", 31 | "\n", 32 | "df = pd.read_csv(\n", 33 | " \"https://www.openml.org/data/get_csv/1595261/adult-census.csv\")\n", 34 | "# Or use the local copy:\n", 35 | "# df = pd.read_csv('../datasets/adult-census.csv')\n", 36 | "\n", 37 | "target_name = \"class\"\n", 38 | "target = df[target_name].to_numpy()\n", 39 | "data = df.drop(columns=target_name)\n", 40 | "\n", 41 | "df_train, df_test, target_train, target_test = train_test_split(\n", 42 | " data, target, random_state=42)\n", 43 | "\n", 44 | "from sklearn.compose import ColumnTransformer\n", 45 | "from sklearn.preprocessing import OrdinalEncoder\n", 46 | "\n", 47 | "categorical_columns = [\n", 48 | " 'workclass', 'education', 'marital-status', 'occupation',\n", 49 | " 'relationship', 'race', 'native-country', 'sex']\n", 50 | "\n", 51 | "categories = [data[column].unique()\n", 52 | " for column in data[categorical_columns]]\n", 53 | "\n", 54 | "categorical_preprocessor = OrdinalEncoder(categories=categories)\n", 55 | "\n", 56 | "preprocessor = ColumnTransformer(\n", 57 | " [('cat-preprocessor', categorical_preprocessor, categorical_columns)],\n", 58 | " remainder='passthrough', sparse_threshold=0)\n", 59 | "\n", 60 | "from sklearn.experimental import enable_hist_gradient_boosting\n", 61 | "from sklearn.ensemble import HistGradientBoostingClassifier\n", 62 | "from sklearn.pipeline import make_pipeline\n", 63 | "\n", 64 | "model = make_pipeline(\n", 65 | " preprocessor, HistGradientBoostingClassifier(random_state=42))" 66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "metadata": {}, 71 | "source": [ 72 | "TODO: write your solution here\n", 73 | "\n", 74 | "Use the previously defined model (called `model`) and using 
 two nested `for`\n", 75 | "loops, search for the best combination of the `learning_rate` and\n", 76 | "`max_leaf_nodes` parameters. To do so, you will need to set the parameters,\n", 77 | "then train and evaluate the model. The evaluation of the model should be\n", 78 | "performed using `cross_val_score`. We propose the following\n", 79 | "parameter search:\n", 80 | "- `learning_rate` for the values 0.01, 0.1, and 1;\n", 81 | "- `max_leaf_nodes` for the values 5, 25, 45." 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "from sklearn.model_selection import cross_val_score\n", 91 | "\n", 92 | "learning_rate = [0.01, 0.1, 1, 10]\n", 93 | "max_leaf_nodes = [5, 25, 45]\n", 94 | "\n", 95 | "best_score = 0\n", 96 | "best_params = {}\n", 97 | "for lr in learning_rate:\n", 98 | "    for mln in max_leaf_nodes:\n", 99 | "        model.set_params(\n", 100 | "            histgradientboostingclassifier__learning_rate=lr,\n", 101 | "            histgradientboostingclassifier__max_leaf_nodes=mln\n", 102 | "        )\n", 103 | "        scores = cross_val_score(model, df_train, target_train, cv=3)\n", 104 | "        if scores.mean() > best_score:\n", 105 | "            best_score = scores.mean()\n", 106 | "            best_params = {'learning-rate': lr, 'max leaf nodes': mln}\n", 107 | "print(f\"The best accuracy obtained is {best_score:.3f}\")\n", 108 | "print(f\"The best parameters found are:\\n {best_params}\")" 109 | ] 110 | } 111 | ], 112 | "metadata": { 113 | "jupytext": { 114 | "formats": "python_scripts//py:percent,notebooks//ipynb" 115 | }, 116 | "kernelspec": { 117 | "display_name": "Python 3", 118 | "language": "python", 119 | "name": "python3" 120 | } 121 | }, 122 | "nbformat": 4, 123 | "nbformat_minor": 2 124 | } 125 | -------------------------------------------------------------------------------- /Day_2_Machine_Learning_Python/03_basic_preprocessing_categorical_variables_exercise_02.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Exercise 03\n", 8 | "\n", 9 | "The goal of this exercise is to evaluate the impact of feature preprocessing on a pipeline that uses a decision-tree-based classifier instead of logistic regression.\n", 10 | "\n", 11 | "- The first question is to empirically evaluate whether scaling numerical features is helpful or not;\n", 12 | "\n", 13 | "- The second question is to evaluate whether it is empirically better (both from a computational and a statistical perspective) to use integer-coded or one-hot encoded categories (a small timing helper for such comparisons is sketched after this list)."
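\n", "\n", "A generic pattern for such empirical comparisons (a sketch only; `model` stands for whichever pipeline you build below):\n", "\n", "```python\n", "import time\n", "from sklearn.model_selection import cross_val_score\n", "\n", "def evaluate(model, data, target):\n", "    # measure both the statistical performance and the wall-clock cost\n", "    start = time.time()\n", "    scores = cross_val_score(model, data, target)\n", "    elapsed = time.time() - start\n", "    print(f\"accuracy: {scores.mean():.3f} +- {scores.std():.3f} \"\n", "          f\"({elapsed:.1f} s)\")\n", "```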
14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": null, 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "import pandas as pd\n", 23 | "from sklearn.model_selection import cross_val_score\n", 24 | "from sklearn.pipeline import make_pipeline\n", 25 | "from sklearn.compose import ColumnTransformer\n", 26 | "from sklearn.preprocessing import OrdinalEncoder\n", 27 | "from sklearn.experimental import enable_hist_gradient_boosting\n", 28 | "from sklearn.ensemble import HistGradientBoostingClassifier\n", 29 | "\n", 30 | "df = pd.read_csv(\n", 31 | " \"https://www.openml.org/data/get_csv/1595261/adult-census.csv\")\n", 32 | "\n", 33 | "# Or use the local copy:\n", 34 | "# df = pd.read_csv('../datasets/adult-census.csv')" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "target_name = \"class\"\n", 44 | "target = df[target_name].to_numpy()\n", 45 | "data = df.drop(columns=[target_name, \"fnlwgt\"])" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": null, 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "numerical_columns = [\n", 55 | " c for c in data.columns if data[c].dtype.kind in [\"i\", \"f\"]]\n", 56 | "categorical_columns = [\n", 57 | " c for c in data.columns if data[c].dtype.kind not in [\"i\", \"f\"]]\n", 58 | "\n", 59 | "categories = [\n", 60 | " data[column].unique() for column in data[categorical_columns]]" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": {}, 66 | "source": [ 67 | "## Reference pipeline (no numerical scaling and integer-coded categories)\n", 68 | "\n", 69 | "First let's time the pipeline we used in the main notebook to serve as a reference:" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": null, 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [ 78 | "%%time\n", 79 | "\n", 80 | "preprocessor = ColumnTransformer([\n", 81 | " ('categorical', OrdinalEncoder(categories=categories),\n", 82 | " categorical_columns),], remainder=\"passthrough\")\n", 83 | "\n", 84 | "model = make_pipeline(preprocessor, HistGradientBoostingClassifier())\n", 85 | "scores = cross_val_score(model, data, target)\n", 86 | "print(f\"The different scores obtained are: \\n{scores}\")\n", 87 | "print(f\"The accuracy is: {scores.mean():.3f} +- {scores.std():.3f}\")" 88 | ] 89 | }, 90 | { 91 | "cell_type": "markdown", 92 | "metadata": {}, 93 | "source": [ 94 | "## Scaling numerical features\n", 95 | "\n", 96 | "Let's write a similar pipeline that also scales the numerical features using `StandardScaler` (or similar):" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": null, 102 | "metadata": {}, 103 | "outputs": [], 104 | "source": [ 105 | "# TODO write me!" 106 | ] 107 | }, 108 | { 109 | "cell_type": "markdown", 110 | "metadata": {}, 111 | "source": [ 112 | "## One-hot encoding of categorical variables\n", 113 | "\n", 114 | "For linear models, we have observed that integer coding of categorical\n", 115 | "variables can be very detrimental. However for\n", 116 | "`HistGradientBoostingClassifier` models, it does not seem to be the\n", 117 | "case as the cross-validation of the reference pipeline with\n", 118 | "`OrdinalEncoder` is good.\n", 119 | "\n", 120 | "Let's see if we can get an even better accuracy with `OneHotEncoding`.\n", 121 | "\n", 122 | "Hint: `HistGradientBoostingClassifier` does not yet support sparse input data. 
You might want to use\n", 123 | "`OneHotEncoder(handle_unknown=\"ignore\", sparse=False)` to force the use a dense representation as a workaround." 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": null, 129 | "metadata": {}, 130 | "outputs": [], 131 | "source": [ 132 | "# TODO: write me!" 133 | ] 134 | } 135 | ], 136 | "metadata": { 137 | "jupytext": { 138 | "formats": "python_scripts//py:percent,notebooks//ipynb" 139 | }, 140 | "kernelspec": { 141 | "display_name": "Python 3", 142 | "language": "python", 143 | "name": "python3" 144 | } 145 | }, 146 | "nbformat": 4, 147 | "nbformat_minor": 2 148 | } 149 | -------------------------------------------------------------------------------- /Day_1_Scientific_Python/data/inflammation-01.csv: -------------------------------------------------------------------------------- 1 | 0,0,1,3,1,2,4,7,8,3,3,3,10,5,7,4,7,7,12,18,6,13,11,11,7,7,4,6,8,8,4,4,5,7,3,4,2,3,0,0 2 | 0,1,2,1,2,1,3,2,2,6,10,11,5,9,4,4,7,16,8,6,18,4,12,5,12,7,11,5,11,3,3,5,4,4,5,5,1,1,0,1 3 | 0,1,1,3,3,2,6,2,5,9,5,7,4,5,4,15,5,11,9,10,19,14,12,17,7,12,11,7,4,2,10,5,4,2,2,3,2,2,1,1 4 | 0,0,2,0,4,2,2,1,6,7,10,7,9,13,8,8,15,10,10,7,17,4,4,7,6,15,6,4,9,11,3,5,6,3,3,4,2,3,2,1 5 | 0,1,1,3,3,1,3,5,2,4,4,7,6,5,3,10,8,10,6,17,9,14,9,7,13,9,12,6,7,7,9,6,3,2,2,4,2,0,1,1 6 | 0,0,1,2,2,4,2,1,6,4,7,6,6,9,9,15,4,16,18,12,12,5,18,9,5,3,10,3,12,7,8,4,7,3,5,4,4,3,2,1 7 | 0,0,2,2,4,2,2,5,5,8,6,5,11,9,4,13,5,12,10,6,9,17,15,8,9,3,13,7,8,2,8,8,4,2,3,5,4,1,1,1 8 | 0,0,1,2,3,1,2,3,5,3,7,8,8,5,10,9,15,11,18,19,20,8,5,13,15,10,6,10,6,7,4,9,3,5,2,5,3,2,2,1 9 | 0,0,0,3,1,5,6,5,5,8,2,4,11,12,10,11,9,10,17,11,6,16,12,6,8,14,6,13,10,11,4,6,4,7,6,3,2,1,0,0 10 | 0,1,1,2,1,3,5,3,5,8,6,8,12,5,13,6,13,8,16,8,18,15,16,14,12,7,3,8,9,11,2,5,4,5,1,4,1,2,0,0 11 | 0,1,0,0,4,3,3,5,5,4,5,8,7,10,13,3,7,13,15,18,8,15,15,16,11,14,12,4,10,10,4,3,4,5,5,3,3,2,2,1 12 | 0,1,0,0,3,4,2,7,8,5,2,8,11,5,5,8,14,11,6,11,9,16,18,6,12,5,4,3,5,7,8,3,5,4,5,5,4,0,1,1 13 | 0,0,2,1,4,3,6,4,6,7,9,9,3,11,6,12,4,17,13,15,13,12,8,7,4,7,12,9,5,6,5,4,7,3,5,4,2,3,0,1 14 | 0,0,0,0,1,3,1,6,6,5,5,6,3,6,13,3,10,13,9,16,15,9,11,4,6,4,11,11,12,3,5,8,7,4,6,4,1,3,0,0 15 | 0,1,2,1,1,1,4,1,5,2,3,3,10,7,13,5,7,17,6,9,12,13,10,4,12,4,6,7,6,10,8,2,5,1,3,4,2,0,2,0 16 | 0,1,1,0,1,2,4,3,6,4,7,5,5,7,5,10,7,8,18,17,9,8,12,11,11,11,14,6,11,2,10,9,5,6,5,3,4,2,2,0 17 | 0,0,0,0,2,3,6,5,7,4,3,2,10,7,9,11,12,5,12,9,13,19,14,17,5,13,8,11,5,10,9,8,7,5,3,1,4,0,2,1 18 | 0,0,0,1,2,1,4,3,6,7,4,2,12,6,12,4,14,7,8,14,13,19,6,9,12,6,4,13,6,7,2,3,6,5,4,2,3,0,1,0 19 | 0,0,2,1,2,5,4,2,7,8,4,7,11,9,8,11,15,17,11,12,7,12,7,6,7,4,13,5,7,6,6,9,2,1,1,2,2,0,1,0 20 | 0,1,2,0,1,4,3,2,2,7,3,3,12,13,11,13,6,5,9,16,9,19,16,11,8,9,14,12,11,9,6,6,6,1,1,2,4,3,1,1 21 | 0,1,1,3,1,4,4,1,8,2,2,3,12,12,10,15,13,6,5,5,18,19,9,6,11,12,7,6,3,6,3,2,4,3,1,5,4,2,2,0 22 | 0,0,2,3,2,3,2,6,3,8,7,4,6,6,9,5,12,12,8,5,12,10,16,7,14,12,5,4,6,9,8,5,6,6,1,4,3,0,2,0 23 | 0,0,0,3,4,5,1,7,7,8,2,5,12,4,10,14,5,5,17,13,16,15,13,6,12,9,10,3,3,7,4,4,8,2,6,5,1,0,1,0 24 | 0,1,1,1,1,3,3,2,6,3,9,7,8,8,4,13,7,14,11,15,14,13,5,13,7,14,9,10,5,11,5,3,5,1,1,4,4,1,2,0 25 | 0,1,1,1,2,3,5,3,6,3,7,10,3,8,12,4,12,9,15,5,17,16,5,10,10,15,7,5,3,11,5,5,6,1,1,1,1,0,2,1 26 | 0,0,2,1,3,3,2,7,4,4,3,8,12,9,12,9,5,16,8,17,7,11,14,7,13,11,7,12,12,7,8,5,7,2,2,4,1,1,1,0 27 | 0,0,1,2,4,2,2,3,5,7,10,5,5,12,3,13,4,13,7,15,9,12,18,14,16,12,3,11,3,2,7,4,8,2,2,1,3,0,1,1 28 | 0,0,1,1,1,5,1,5,2,2,4,10,4,8,14,6,15,6,12,15,15,13,7,17,4,5,11,4,8,7,9,4,5,3,2,5,4,3,2,1 29 | 
0,0,2,2,3,4,6,3,7,6,4,5,8,4,7,7,6,11,12,19,20,18,9,5,4,7,14,8,4,3,7,7,8,3,5,4,1,3,1,0 30 | 0,0,0,1,4,4,6,3,8,6,4,10,12,3,3,6,8,7,17,16,14,15,17,4,14,13,4,4,12,11,6,9,5,5,2,5,2,1,0,1 31 | 0,1,1,0,3,2,4,6,8,6,2,3,11,3,14,14,12,8,8,16,13,7,6,9,15,7,6,4,10,8,10,4,2,6,5,5,2,3,2,1 32 | 0,0,2,3,3,4,5,3,6,7,10,5,10,13,14,3,8,10,9,9,19,15,15,6,8,8,11,5,5,7,3,6,6,4,5,2,2,3,0,0 33 | 0,1,2,2,2,3,6,6,6,7,6,3,11,12,13,15,15,10,14,11,11,8,6,12,10,5,12,7,7,11,5,8,5,2,5,5,2,0,2,1 34 | 0,0,2,1,3,5,6,7,5,8,9,3,12,10,12,4,12,9,13,10,10,6,10,11,4,15,13,7,3,4,2,9,7,2,4,2,1,2,1,1 35 | 0,0,1,2,4,1,5,5,2,3,4,8,8,12,5,15,9,17,7,19,14,18,12,17,14,4,13,13,8,11,5,6,6,2,3,5,2,1,1,1 36 | 0,0,0,3,1,3,6,4,3,4,8,3,4,8,3,11,5,7,10,5,15,9,16,17,16,3,8,9,8,3,3,9,5,1,6,5,4,2,2,0 37 | 0,1,2,2,2,5,5,1,4,6,3,6,5,9,6,7,4,7,16,7,16,13,9,16,12,6,7,9,10,3,6,4,5,4,6,3,4,3,2,1 38 | 0,1,1,2,3,1,5,1,2,2,5,7,6,6,5,10,6,7,17,13,15,16,17,14,4,4,10,10,10,11,9,9,5,4,4,2,1,0,1,0 39 | 0,1,0,3,2,4,1,1,5,9,10,7,12,10,9,15,12,13,13,6,19,9,10,6,13,5,13,6,7,2,5,5,2,1,1,1,1,3,0,1 40 | 0,1,1,3,1,1,5,5,3,7,2,2,3,12,4,6,8,15,16,16,15,4,14,5,13,10,7,10,6,3,2,3,6,3,3,5,4,3,2,1 41 | 0,0,0,2,2,1,3,4,5,5,6,5,5,12,13,5,7,5,11,15,18,7,9,10,14,12,11,9,10,3,2,9,6,2,2,5,3,0,0,1 42 | 0,0,1,3,3,1,2,1,8,9,2,8,10,3,8,6,10,13,11,17,19,6,4,11,6,12,7,5,5,4,4,8,2,6,6,4,2,2,0,0 43 | 0,1,1,3,4,5,2,1,3,7,9,6,10,5,8,15,11,12,15,6,12,16,6,4,14,3,12,9,6,11,5,8,5,5,6,1,2,1,2,0 44 | 0,0,1,3,1,4,3,6,7,8,5,7,11,3,6,11,6,10,6,19,18,14,6,10,7,9,8,5,8,3,10,2,5,1,5,4,2,1,0,1 45 | 0,1,1,3,3,4,4,6,3,4,9,9,7,6,8,15,12,15,6,11,6,18,5,14,15,12,9,8,3,6,10,6,8,7,2,5,4,3,1,1 46 | 0,1,2,2,4,3,1,4,8,9,5,10,10,3,4,6,7,11,16,6,14,9,11,10,10,7,10,8,8,4,5,8,4,4,5,2,4,1,1,0 47 | 0,0,2,3,4,5,4,6,2,9,7,4,9,10,8,11,16,12,15,17,19,10,18,13,15,11,8,4,7,11,6,7,6,5,1,3,1,0,0,0 48 | 0,1,1,3,1,4,6,2,8,2,10,3,11,9,13,15,5,15,6,10,10,5,14,15,12,7,4,5,11,4,6,9,5,6,1,1,2,1,2,1 49 | 0,0,1,3,2,5,1,2,7,6,6,3,12,9,4,14,4,6,12,9,12,7,11,7,16,8,13,6,7,6,10,7,6,3,1,5,4,3,0,0 50 | 0,0,1,2,3,4,5,7,5,4,10,5,12,12,5,4,7,9,18,16,16,10,15,15,10,4,3,7,5,9,4,6,2,4,1,4,2,2,2,1 51 | 0,1,2,1,1,3,5,3,6,3,10,10,11,10,13,10,13,6,6,14,5,4,5,5,9,4,12,7,7,4,7,9,3,3,6,3,4,1,2,0 52 | 0,1,2,2,3,5,2,4,5,6,8,3,5,4,3,15,15,12,16,7,20,15,12,8,9,6,12,5,8,3,8,5,4,1,3,2,1,3,1,0 53 | 0,0,0,2,4,4,5,3,3,3,10,4,4,4,14,11,15,13,10,14,11,17,9,11,11,7,10,12,10,10,10,8,7,5,2,2,4,1,2,1 54 | 0,0,2,1,1,4,4,7,2,9,4,10,12,7,6,6,11,12,9,15,15,6,6,13,5,12,9,6,4,7,7,6,5,4,1,4,2,2,2,1 55 | 0,1,2,1,1,4,5,4,4,5,9,7,10,3,13,13,8,9,17,16,16,15,12,13,5,12,10,9,11,9,4,5,5,2,2,5,1,0,0,1 56 | 0,0,1,3,2,3,6,4,5,7,2,4,11,11,3,8,8,16,5,13,16,5,8,8,6,9,10,10,9,3,3,5,3,5,4,5,3,3,0,1 57 | 0,1,1,2,2,5,1,7,4,2,5,5,4,6,6,4,16,11,14,16,14,14,8,17,4,14,13,7,6,3,7,7,5,6,3,4,2,2,1,1 58 | 0,1,1,1,4,1,6,4,6,3,6,5,6,4,14,13,13,9,12,19,9,10,15,10,9,10,10,7,5,6,8,6,6,4,3,5,2,1,1,1 59 | 0,0,0,1,4,5,6,3,8,7,9,10,8,6,5,12,15,5,10,5,8,13,18,17,14,9,13,4,10,11,10,8,8,6,5,5,2,0,2,0 60 | 0,0,1,0,3,2,5,4,8,2,9,3,3,10,12,9,14,11,13,8,6,18,11,9,13,11,8,5,5,2,8,5,3,5,4,1,3,1,1,0 61 | -------------------------------------------------------------------------------- /Day_2_Machine_Learning_Python/03_basic_preprocessing_categorical_variables_exercise_01_solution.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Solution for Exercise 02\n", 8 | "\n", 9 | "The goal of this exercise is to evalutate the impact of using an arbitrary\n", 10 | "integer encoding for 
 categorical variables along with a linear\n", 11 | "classification model such as Logistic Regression.\n", 12 | "\n", 13 | "To do so, let's try to use `OrdinalEncoder` to preprocess the categorical\n", 14 | "variables. This preprocessor is assembled in a pipeline with\n", 15 | "`LogisticRegression`. The performance of the pipeline can be evaluated as\n", 16 | "usual by cross-validation and then compared to the score obtained when using\n", 17 | "`OneHotEncoding` or to some other baseline score.\n", 18 | "\n", 19 | "Because `OrdinalEncoder` can raise errors if it sees an unknown category at\n", 20 | "prediction time, we need to pre-compute the list of all possible categories\n", 21 | "ahead of time:\n", 22 | "\n", 23 | "```python\n", 24 | "categories = [data[column].unique()\n", 25 | "              for column in data[categorical_columns]]\n", 26 | "OrdinalEncoder(categories=categories)\n", 27 | "```" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "import pandas as pd\n", 37 | "\n", 38 | "df = pd.read_csv(\n", 39 | "    \"https://www.openml.org/data/get_csv/1595261/adult-census.csv\")\n", 40 | "\n", 41 | "# Or use the local copy:\n", 42 | "# df = pd.read_csv('../datasets/adult-census.csv')" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "target_name = \"class\"\n", 52 | "target = df[target_name].to_numpy()\n", 53 | "data = df.drop(columns=[target_name, \"fnlwgt\"])\n", 54 | "categorical_columns = [\n", 55 | "    c for c in data.columns if data[c].dtype.kind not in [\"i\", \"f\"]]\n", 56 | "data_categorical = data[categorical_columns]" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "categories = [\n", 66 | "    data[column].unique() for column in data[categorical_columns]]\n", 67 | "\n", 68 | "categories" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": null, 74 | "metadata": {}, 75 | "outputs": [], 76 | "source": [ 77 | "from sklearn.model_selection import cross_val_score\n", 78 | "from sklearn.pipeline import make_pipeline\n", 79 | "from sklearn.preprocessing import OrdinalEncoder\n", 80 | "from sklearn.linear_model import LogisticRegression\n", 81 | "\n", 82 | "model = make_pipeline(\n", 83 | "    OrdinalEncoder(categories=categories),\n", 84 | "    LogisticRegression(solver='lbfgs', max_iter=1000))\n", 85 | "scores = cross_val_score(model, data_categorical, target)\n", 86 | "print(f\"The different scores obtained are: \\n{scores}\")" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": null, 92 | "metadata": {}, 93 | "outputs": [], 94 | "source": [ 95 | "print(f\"The accuracy is: {scores.mean():.3f} +- {scores.std():.3f}\")" 96 | ] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": {}, 101 | "source": [ 102 | "Using an arbitrary mapping from string labels to integers as done here causes the linear model to make bad assumptions on the relative ordering of categories.\n", 103 | "\n", 104 | "This prevents the model from learning anything predictive, and the cross-validated score is even lower than the baseline we obtained by ignoring the input data and just always predicting the most frequent class:" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "metadata": {}, 111 | "outputs": [], 112 | "source": [ 113 | "from sklearn.dummy import 
DummyClassifier\n", 114 | "\n", 115 | "scores = cross_val_score(DummyClassifier(strategy=\"most_frequent\"),\n", 116 | " data_categorical, target)\n", 117 | "print(f\"The different scores obtained are: \\n{scores}\")\n", 118 | "print(f\"The accuracy is: {scores.mean():.3f} +- {scores.std():.3f}\")" 119 | ] 120 | }, 121 | { 122 | "cell_type": "markdown", 123 | "metadata": {}, 124 | "source": [ 125 | "By comparison, a categorical encoding that does not assume any ordering in the\n", 126 | "categories can lead to a significantly higher score:" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": null, 132 | "metadata": {}, 133 | "outputs": [], 134 | "source": [ 135 | "from sklearn.preprocessing import OneHotEncoder\n", 136 | "\n", 137 | "model = make_pipeline(\n", 138 | " OneHotEncoder(handle_unknown=\"ignore\"),\n", 139 | " LogisticRegression(solver='lbfgs', max_iter=1000))\n", 140 | "scores = cross_val_score(model, data_categorical, target)\n", 141 | "print(f\"The different scores obtained are: \\n{scores}\")\n", 142 | "print(f\"The accuracy is: {scores.mean():.3f} +- {scores.std():.3f}\")" 143 | ] 144 | } 145 | ], 146 | "metadata": { 147 | "jupytext": { 148 | "formats": "python_scripts//py:percent,notebooks//ipynb" 149 | }, 150 | "kernelspec": { 151 | "display_name": "Python 3", 152 | "language": "python", 153 | "name": "python3" 154 | } 155 | }, 156 | "nbformat": 4, 157 | "nbformat_minor": 2 158 | } 159 | -------------------------------------------------------------------------------- /Day_1_Scientific_Python/numpy_with_answers/numpys/numpy_intro.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Introduction to numpy" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "__numpy array__
\n", 15 | "NumPy array is a data container. It is similar to Python lists, but it’s specialised for working on numerical data. NumPy is at the center of scientific Python ecosystem and it is a work-horse of many scientific libraries including scikit-learn, scikit-image, matplotlib, SciPy.\n", 16 | "\n", 17 | "In general you should use this library if you want to do fancy things with **numbers**, especially if you have **matrices** or **arrays.**
\n", 18 | "\n", 19 | "To use NumPy we need to start python interpreter and import numpy package:" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": null, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "import numpy as np" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "Let's create a simple numpy array" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "x = np.array([2, 1, 5])\n", 45 | "print(x)" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "### Lists vs Numpy arrays\n", 53 | "\n", 54 | "The Python core library provides Lists. A list is the Python equivalent of an array, but it is resizeable and can contain elements of different types.\n", 55 | "\n", 56 | "Pros of an array:\n", 57 | " - **Size** - Numpy data structures take up less space\n", 58 | " - **Performance** - faster than lists\n", 59 | " - **Functionality** - SciPy and NumPy have optimized functions such as linear algebra operations built in." 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": null, 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "L = range(1000)\n", 69 | "%timeit [i**2 for i in L]" 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": {}, 75 | "source": [ 76 | "__np.arange__ works like Python built-in range, but it returns an array; " 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": null, 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [ 85 | "np.arange(5)" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": null, 91 | "metadata": {}, 92 | "outputs": [], 93 | "source": [ 94 | "a = np.arange(1000)\n", 95 | "%timeit a**2" 96 | ] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": {}, 101 | "source": [ 102 | "### Memory layout\n", 103 | "\n", 104 | "NumPy array is just a memory block with extra information how to interpret its contents. 
\n", 105 | "\n", 106 | "To construct an array with pre-defined elements we can also use one of the built-in helper functions:" 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "metadata": {}, 112 | "source": [ 113 | "__np.ones__ and __np.zeros__ return arrays of 0s or 1s; " 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": null, 119 | "metadata": {}, 120 | "outputs": [], 121 | "source": [ 122 | "np.ones(5)" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": null, 128 | "metadata": {}, 129 | "outputs": [], 130 | "source": [ 131 | "np.zeros(5)" 132 | ] 133 | }, 134 | { 135 | "cell_type": "markdown", 136 | "metadata": {}, 137 | "source": [ 138 | "__np.random.rand__ creates an array of random numbers from an interval [0, 1]:" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": null, 144 | "metadata": {}, 145 | "outputs": [], 146 | "source": [ 147 | "np.random.rand(5)" 148 | ] 149 | }, 150 | { 151 | "cell_type": "markdown", 152 | "metadata": {}, 153 | "source": [ 154 | "We can also construct a two- or more dimensional arrays:" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": null, 160 | "metadata": {}, 161 | "outputs": [], 162 | "source": [ 163 | "np.array([[1, 2], [5, 6]])" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": null, 169 | "metadata": {}, 170 | "outputs": [], 171 | "source": [ 172 | "np.ones((2, 2))" 173 | ] 174 | }, 175 | { 176 | "cell_type": "markdown", 177 | "metadata": {}, 178 | "source": [ 179 | "Alternatively, a n-dimensional array can be obtained by reshaping a 1-D array:" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": null, 185 | "metadata": {}, 186 | "outputs": [], 187 | "source": [ 188 | "a = np.arange(12)\n", 189 | "a.reshape((4,3))" 190 | ] 191 | }, 192 | { 193 | "cell_type": "markdown", 194 | "metadata": {}, 195 | "source": [ 196 | "[Previous: Index](../01-numpy-introduction.ipynb)
[Next: Working with a dataset](dataset_intro.ipynb)" 197 | ] 198 | } 199 | ], 200 | "metadata": { 201 | "kernelspec": { 202 | "display_name": "Python 3", 203 | "language": "python", 204 | "name": "python3" 205 | }, 206 | "language_info": { 207 | "codemirror_mode": { 208 | "name": "ipython", 209 | "version": 3 210 | }, 211 | "file_extension": ".py", 212 | "mimetype": "text/x-python", 213 | "name": "python", 214 | "nbconvert_exporter": "python", 215 | "pygments_lexer": "ipython3", 216 | "version": "3.7.2" 217 | } 218 | }, 219 | "nbformat": 4, 220 | "nbformat_minor": 2 221 | } 222 | -------------------------------------------------------------------------------- /Day_1_Scientific_Python/numpys/numpy_intro.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Introduction to numpy" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "__numpy array__
\n", 15 | "NumPy array is a data container. It is similar to Python lists, but it’s specialised for working on numerical data. NumPy is at the center of scientific Python ecosystem and it is a work-horse of many scientific libraries including scikit-learn, scikit-image, matplotlib, SciPy.\n", 16 | "\n", 17 | "In general you should use this library if you want to do fancy things with **numbers**, especially if you have **matrices** or **arrays.**
\n", 18 | "\n", 19 | "To use NumPy we need to start python interpreter and import numpy package:" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": null, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "import numpy as np" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "Let's create a simple numpy array" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "x = np.array([2, 1, 5])\n", 45 | "print(x)" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "### Lists vs Numpy arrays\n", 53 | "\n", 54 | "The Python core library provides Lists. A list is the Python equivalent of an array, but it is resizeable and can contain elements of different types.\n", 55 | "\n", 56 | "Pros of an array:\n", 57 | " - **Size** - Numpy data structures take up less space\n", 58 | " - **Performance** - faster than lists\n", 59 | " - **Functionality** - SciPy and NumPy have optimized functions such as linear algebra operations built in." 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": null, 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "L = range(1000)\n", 69 | "%timeit [i**2 for i in L]" 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": {}, 75 | "source": [ 76 | "__np.arange__ works like Python built-in range, but it returns an array; " 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": null, 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [ 85 | "np.arange(5)" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": null, 91 | "metadata": { 92 | "scrolled": true 93 | }, 94 | "outputs": [], 95 | "source": [ 96 | "a = np.arange(1000)\n", 97 | "%timeit a**2" 98 | ] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "metadata": {}, 103 | "source": [ 104 | "### Memory layout\n", 105 | "\n", 106 | "NumPy array is just a memory block with extra information how to interpret its contents. 
\n", 107 | "\n", 108 | "To construct an array with pre-defined elements we can also use other built-in helper functions:" 109 | ] 110 | }, 111 | { 112 | "cell_type": "markdown", 113 | "metadata": {}, 114 | "source": [ 115 | "__np.ones__ and __np.zeros__ return arrays of 0s or 1s; " 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": null, 121 | "metadata": {}, 122 | "outputs": [], 123 | "source": [ 124 | "np.ones(5)" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": null, 130 | "metadata": {}, 131 | "outputs": [], 132 | "source": [ 133 | "np.zeros(5)" 134 | ] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "metadata": {}, 139 | "source": [ 140 | "__np.random.rand__ creates an array of random numbers from an interval [0, 1]:" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "metadata": {}, 147 | "outputs": [], 148 | "source": [ 149 | "np.random.rand(5)" 150 | ] 151 | }, 152 | { 153 | "cell_type": "markdown", 154 | "metadata": {}, 155 | "source": [ 156 | "We can also construct a two- or more dimensional arrays:" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": null, 162 | "metadata": {}, 163 | "outputs": [], 164 | "source": [ 165 | "np.array([[1, 2], [5, 6]])" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": null, 171 | "metadata": {}, 172 | "outputs": [], 173 | "source": [ 174 | "np.ones((2, 2))" 175 | ] 176 | }, 177 | { 178 | "cell_type": "markdown", 179 | "metadata": {}, 180 | "source": [ 181 | "Alternatively, a n-dimensional array can be obtained by reshaping a 1-D array:" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": null, 187 | "metadata": {}, 188 | "outputs": [], 189 | "source": [ 190 | "a = np.arange(12)\n", 191 | "a.reshape((4,3))" 192 | ] 193 | }, 194 | { 195 | "cell_type": "markdown", 196 | "metadata": {}, 197 | "source": [ 198 | "[Previous: Index](../01-numpy-introduction.ipynb)
[Next: Working with a dataset](dataset_intro.ipynb)" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": null, 204 | "metadata": {}, 205 | "outputs": [], 206 | "source": [] 207 | } 208 | ], 209 | "metadata": { 210 | "kernelspec": { 211 | "display_name": "Python 3", 212 | "language": "python", 213 | "name": "python3" 214 | }, 215 | "language_info": { 216 | "codemirror_mode": { 217 | "name": "ipython", 218 | "version": 3 219 | }, 220 | "file_extension": ".py", 221 | "mimetype": "text/x-python", 222 | "name": "python", 223 | "nbconvert_exporter": "python", 224 | "pygments_lexer": "ipython3", 225 | "version": "3.7.2" 226 | } 227 | }, 228 | "nbformat": 4, 229 | "nbformat_minor": 2 230 | } 231 | -------------------------------------------------------------------------------- /figures/polynomial_overfit_0.svg: -------------------------------------------------------------------------------- 1 | 2 | 4 | 5 | 6 | 7 | 10 | 11 | 12 | 13 | 19 | 20 | 21 | 22 | 23 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | -------------------------------------------------------------------------------- /Day_1_Scientific_Python/numpys/operations.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Operations\n", 8 | "\n", 9 | "Arrays also know how to perform common mathematical operations on their values. The simplest operations with data are arithmetic: addition, subtraction, multiplication, and division. When you do such operations on arrays, the operation is done element-by-element.
" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import numpy as np" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "data = np.loadtxt(fname='../data/inflammation-01.csv', delimiter=',')" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "doubledata = data + data\n", 37 | "print(doubledata)" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": {}, 43 | "source": [ 44 | "Operations by scalar:" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "tripledata = data * 3\n", 54 | "print(tripledata)" 55 | ] 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "metadata": {}, 60 | "source": [ 61 | "One of the advantages of NumPy is that it allows to apply functions (called ufuncs) to all elements of an array without the need of `for` loops:" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": null, 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "np.sin(data[0,:])" 71 | ] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "metadata": {}, 76 | "source": [ 77 | "This is not only convenient but also more efficient than iterating through the elements using for loops." 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": null, 83 | "metadata": {}, 84 | "outputs": [], 85 | "source": [ 86 | "expdata = np.exp(data)\n", 87 | "print(expdata)" 88 | ] 89 | }, 90 | { 91 | "cell_type": "markdown", 92 | "metadata": {}, 93 | "source": [ 94 | "**Warning** Standard Python installation also includes the `math` library, but it does not play nicely with NumPy array and it may give different results than numpy function, so avoid using it with NumPy arrays.\n", 95 | "\n", 96 | "Some functions (such as mean, max, etc.) aggregate the data return arrays of less dimensions or scalars:" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": null, 102 | "metadata": {}, 103 | "outputs": [], 104 | "source": [ 105 | "sumdata = np.sum(data)\n", 106 | "print('sum data: {}'.format(sumdata))" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": null, 112 | "metadata": {}, 113 | "outputs": [], 114 | "source": [ 115 | "meandata = np.mean(data)\n", 116 | "print(meandata)" 117 | ] 118 | }, 119 | { 120 | "cell_type": "markdown", 121 | "metadata": {}, 122 | "source": [ 123 | "It's also possbile to average over a single axis:" 124 | ] 125 | }, 126 | { 127 | "cell_type": "markdown", 128 | "metadata": {}, 129 | "source": [ 130 | "## Axis\n", 131 | "\"drawing\"" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": null, 137 | "metadata": {}, 138 | "outputs": [], 139 | "source": [ 140 | "np.mean(data, 0)" 141 | ] 142 | }, 143 | { 144 | "cell_type": "markdown", 145 | "metadata": {}, 146 | "source": [ 147 | "## $\\color{green}{\\text{Excercise}}$\n", 148 | "\n", 149 | "Create array `a` with random elements of shape 1000 x 3.\n", 150 | "Select the second and third column (index 1 and 2) and calculate the mean for each of the columns (i.e. 
your answer should be an array with two elements)" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": null, 156 | "metadata": {}, 157 | "outputs": [], 158 | "source": [] 159 | }, 160 | { 161 | "cell_type": "markdown", 162 | "metadata": {}, 163 | "source": [ 164 | "## $\\color{green}{\\text{Excercise}}$\n", 165 | "\n", 166 | "Generate a 10 x 3 array of random numbers (using np.random.rand). From each row, find the column index of the element closest to 0.75. Make use of np.abs and np.argmin. The result should be a one-dimensional array of integers from 0 to 2." 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": null, 172 | "metadata": {}, 173 | "outputs": [], 174 | "source": [] 175 | }, 176 | { 177 | "cell_type": "markdown", 178 | "metadata": {}, 179 | "source": [ 180 | "## $\\color{green}{\\text{Excercise}}$ operations\n", 181 | "\n", 182 | "Average the inflammation data over the first ten patients (rows) and plot them across time (columns). Then repeat it for the next ten patients and so on. Try putting all averages on a single plot" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": null, 188 | "metadata": {}, 189 | "outputs": [], 190 | "source": [ 191 | "\n", 192 | " " 193 | ] 194 | }, 195 | { 196 | "cell_type": "markdown", 197 | "metadata": {}, 198 | "source": [ 199 | "[Previous: Slices](slices.ipynb)
[Next: Stacking](stacking.ipynb) " 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": null, 205 | "metadata": {}, 206 | "outputs": [], 207 | "source": [] 208 | } 209 | ], 210 | "metadata": { 211 | "kernelspec": { 212 | "display_name": "Python 3", 213 | "language": "python", 214 | "name": "python3" 215 | }, 216 | "language_info": { 217 | "codemirror_mode": { 218 | "name": "ipython", 219 | "version": 3 220 | }, 221 | "file_extension": ".py", 222 | "mimetype": "text/x-python", 223 | "name": "python", 224 | "nbconvert_exporter": "python", 225 | "pygments_lexer": "ipython3", 226 | "version": "3.7.2" 227 | } 228 | }, 229 | "nbformat": 4, 230 | "nbformat_minor": 2 231 | } 232 | -------------------------------------------------------------------------------- /img/webfont-ubuntu-mono-400-700-400italic.css: -------------------------------------------------------------------------------- 1 | /* cyrillic-ext */ 2 | @font-face { 3 | font-family: 'Ubuntu Mono'; 4 | font-style: normal; 5 | font-weight: 400; 6 | src: local('Ubuntu Mono'), local('UbuntuMono-Regular'), url(https://fonts.gstatic.com/s/ubuntumono/v6/ViZhet7Ak-LRXZMXzuAfkTTOQ_MqJVwkKsUn0wKzc2I.woff2) format('woff2'); 7 | unicode-range: U+0460-052F, U+20B4, U+2DE0-2DFF, U+A640-A69F; 8 | } 9 | /* cyrillic */ 10 | @font-face { 11 | font-family: 'Ubuntu Mono'; 12 | font-style: normal; 13 | font-weight: 400; 14 | src: local('Ubuntu Mono'), local('UbuntuMono-Regular'), url(https://fonts.gstatic.com/s/ubuntumono/v6/ViZhet7Ak-LRXZMXzuAfkTUj_cnvWIuuBMVgbX098Mw.woff2) format('woff2'); 15 | unicode-range: U+0400-045F, U+0490-0491, U+04B0-04B1, U+2116; 16 | } 17 | /* greek-ext */ 18 | @font-face { 19 | font-family: 'Ubuntu Mono'; 20 | font-style: normal; 21 | font-weight: 400; 22 | src: local('Ubuntu Mono'), local('UbuntuMono-Regular'), url(https://fonts.gstatic.com/s/ubuntumono/v6/ViZhet7Ak-LRXZMXzuAfkUbcKLIaa1LC45dFaAfauRA.woff2) format('woff2'); 23 | unicode-range: U+1F00-1FFF; 24 | } 25 | /* greek */ 26 | @font-face { 27 | font-family: 'Ubuntu Mono'; 28 | font-style: normal; 29 | font-weight: 400; 30 | src: local('Ubuntu Mono'), local('UbuntuMono-Regular'), url(https://fonts.gstatic.com/s/ubuntumono/v6/ViZhet7Ak-LRXZMXzuAfkWo_sUJ8uO4YLWRInS22T3Y.woff2) format('woff2'); 31 | unicode-range: U+0370-03FF; 32 | } 33 | /* latin-ext */ 34 | @font-face { 35 | font-family: 'Ubuntu Mono'; 36 | font-style: normal; 37 | font-weight: 400; 38 | src: local('Ubuntu Mono'), local('UbuntuMono-Regular'), url(https://fonts.gstatic.com/s/ubuntumono/v6/ViZhet7Ak-LRXZMXzuAfkSYE0-AqJ3nfInTTiDXDjU4.woff2) format('woff2'); 39 | unicode-range: U+0100-024F, U+1E00-1EFF, U+20A0-20AB, U+20AD-20CF, U+2C60-2C7F, U+A720-A7FF; 40 | } 41 | /* latin */ 42 | @font-face { 43 | font-family: 'Ubuntu Mono'; 44 | font-style: normal; 45 | font-weight: 400; 46 | src: local('Ubuntu Mono'), local('UbuntuMono-Regular'), url(https://fonts.gstatic.com/s/ubuntumono/v6/ViZhet7Ak-LRXZMXzuAfkY4P5ICox8Kq3LLUNMylGO4.woff2) format('woff2'); 47 | unicode-range: U+0000-00FF, U+0131, U+0152-0153, U+02C6, U+02DA, U+02DC, U+2000-206F, U+2074, U+20AC, U+2212, U+2215, U+E0FF, U+EFFD, U+F000; 48 | } 49 | /* cyrillic-ext */ 50 | @font-face { 51 | font-family: 'Ubuntu Mono'; 52 | font-style: normal; 53 | font-weight: 700; 54 | src: local('Ubuntu Mono Bold'), local('UbuntuMono-Bold'), url(https://fonts.gstatic.com/s/ubuntumono/v6/ceqTZGKHipo8pJj4molytp6iIh_FvlUHQwED9Yt5Kbw.woff2) format('woff2'); 55 | unicode-range: U+0460-052F, U+20B4, U+2DE0-2DFF, U+A640-A69F; 56 | } 57 | /* cyrillic */ 58 
| @font-face { 59 | font-family: 'Ubuntu Mono'; 60 | font-style: normal; 61 | font-weight: 700; 62 | src: local('Ubuntu Mono Bold'), local('UbuntuMono-Bold'), url(https://fonts.gstatic.com/s/ubuntumono/v6/ceqTZGKHipo8pJj4molyti_vZmeiCMnoWNN9rHBYaTc.woff2) format('woff2'); 63 | unicode-range: U+0400-045F, U+0490-0491, U+04B0-04B1, U+2116; 64 | } 65 | /* greek-ext */ 66 | @font-face { 67 | font-family: 'Ubuntu Mono'; 68 | font-style: normal; 69 | font-weight: 700; 70 | src: local('Ubuntu Mono Bold'), local('UbuntuMono-Bold'), url(https://fonts.gstatic.com/s/ubuntumono/v6/ceqTZGKHipo8pJj4molytiFaMxiho_5XQnyRZzQsrZs.woff2) format('woff2'); 71 | unicode-range: U+1F00-1FFF; 72 | } 73 | /* greek */ 74 | @font-face { 75 | font-family: 'Ubuntu Mono'; 76 | font-style: normal; 77 | font-weight: 700; 78 | src: local('Ubuntu Mono Bold'), local('UbuntuMono-Bold'), url(https://fonts.gstatic.com/s/ubuntumono/v6/ceqTZGKHipo8pJj4molytgalQocB-__pDVGhF3uS2Ks.woff2) format('woff2'); 79 | unicode-range: U+0370-03FF; 80 | } 81 | /* latin-ext */ 82 | @font-face { 83 | font-family: 'Ubuntu Mono'; 84 | font-style: normal; 85 | font-weight: 700; 86 | src: local('Ubuntu Mono Bold'), local('UbuntuMono-Bold'), url(https://fonts.gstatic.com/s/ubuntumono/v6/ceqTZGKHipo8pJj4molytujkDdvhIIFj_YMdgqpnSB0.woff2) format('woff2'); 87 | unicode-range: U+0100-024F, U+1E00-1EFF, U+20A0-20AB, U+20AD-20CF, U+2C60-2C7F, U+A720-A7FF; 88 | } 89 | /* latin */ 90 | @font-face { 91 | font-family: 'Ubuntu Mono'; 92 | font-style: normal; 93 | font-weight: 700; 94 | src: local('Ubuntu Mono Bold'), local('UbuntuMono-Bold'), url(https://fonts.gstatic.com/s/ubuntumono/v6/ceqTZGKHipo8pJj4molytolIZu-HDpmDIZMigmsroc4.woff2) format('woff2'); 95 | unicode-range: U+0000-00FF, U+0131, U+0152-0153, U+02C6, U+02DA, U+02DC, U+2000-206F, U+2074, U+20AC, U+2212, U+2215, U+E0FF, U+EFFD, U+F000; 96 | } 97 | /* cyrillic-ext */ 98 | @font-face { 99 | font-family: 'Ubuntu Mono'; 100 | font-style: italic; 101 | font-weight: 400; 102 | src: local('Ubuntu Mono Italic'), local('UbuntuMono-Italic'), url(https://fonts.gstatic.com/s/ubuntumono/v6/KAKuHXAHZOeECOWAHsRKAxNcqx07xvyppV96iFRdwiM.woff2) format('woff2'); 103 | unicode-range: U+0460-052F, U+20B4, U+2DE0-2DFF, U+A640-A69F; 104 | } 105 | /* cyrillic */ 106 | @font-face { 107 | font-family: 'Ubuntu Mono'; 108 | font-style: italic; 109 | font-weight: 400; 110 | src: local('Ubuntu Mono Italic'), local('UbuntuMono-Italic'), url(https://fonts.gstatic.com/s/ubuntumono/v6/KAKuHXAHZOeECOWAHsRKA-fhZE2STYI3KzBGzrJG_ik.woff2) format('woff2'); 111 | unicode-range: U+0400-045F, U+0490-0491, U+04B0-04B1, U+2116; 112 | } 113 | /* greek-ext */ 114 | @font-face { 115 | font-family: 'Ubuntu Mono'; 116 | font-style: italic; 117 | font-weight: 400; 118 | src: local('Ubuntu Mono Italic'), local('UbuntuMono-Italic'), url(https://fonts.gstatic.com/s/ubuntumono/v6/KAKuHXAHZOeECOWAHsRKA26cj8HaeL2jS4NIBPr3RFo.woff2) format('woff2'); 119 | unicode-range: U+1F00-1FFF; 120 | } 121 | /* greek */ 122 | @font-face { 123 | font-family: 'Ubuntu Mono'; 124 | font-style: italic; 125 | font-weight: 400; 126 | src: local('Ubuntu Mono Italic'), local('UbuntuMono-Italic'), url(https://fonts.gstatic.com/s/ubuntumono/v6/KAKuHXAHZOeECOWAHsRKA9cKKn5Xt5n-nnvkqIBMZms.woff2) format('woff2'); 127 | unicode-range: U+0370-03FF; 128 | } 129 | /* latin-ext */ 130 | @font-face { 131 | font-family: 'Ubuntu Mono'; 132 | font-style: italic; 133 | font-weight: 400; 134 | src: local('Ubuntu Mono Italic'), local('UbuntuMono-Italic'), 
url(https://fonts.gstatic.com/s/ubuntumono/v6/KAKuHXAHZOeECOWAHsRKA0_0lycXMw8PhobHtu2Qgco.woff2) format('woff2'); 135 | unicode-range: U+0100-024F, U+1E00-1EFF, U+20A0-20AB, U+20AD-20CF, U+2C60-2C7F, U+A720-A7FF; 136 | } 137 | /* latin */ 138 | @font-face { 139 | font-family: 'Ubuntu Mono'; 140 | font-style: italic; 141 | font-weight: 400; 142 | src: local('Ubuntu Mono Italic'), local('UbuntuMono-Italic'), url(https://fonts.gstatic.com/s/ubuntumono/v6/KAKuHXAHZOeECOWAHsRKA8u2Q0OS-KeTAWjgkS85mDg.woff2) format('woff2'); 143 | unicode-range: U+0000-00FF, U+0131, U+0152-0153, U+02C6, U+02DA, U+02DC, U+2000-206F, U+2074, U+20AC, U+2212, U+2215, U+E0FF, U+EFFD, U+F000; 144 | } 145 | -------------------------------------------------------------------------------- /Day_1_Scientific_Python/numpy_with_answers/numpys/operations.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Operations\n", 8 | "\n", 9 | "Arrays also know how to perform common mathematical operations on their values. The simplest operations with data are arithmetic: addition, subtraction, multiplication, and division. When you do such operations on arrays, the operation is done element-by-element.
" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import numpy as np" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "data = np.loadtxt(fname='../data/inflammation-01.csv', delimiter=',')" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "doubledata = data + data\n", 37 | "print(doubledata)" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": {}, 43 | "source": [ 44 | "Operations by scalar:" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "tripledata = data * 3\n", 54 | "print(tripledata)" 55 | ] 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "metadata": {}, 60 | "source": [ 61 | "One of the advantages of NumPy is that it allows to apply functions (called ufuncs) to all elements of an array without the need of `for` loops:" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": null, 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "np.sin(data[0,:])" 71 | ] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "metadata": {}, 76 | "source": [ 77 | "This is not only convenient but also more efficient than iterating through the elements using for loops." 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": null, 83 | "metadata": {}, 84 | "outputs": [], 85 | "source": [ 86 | "expdata = np.exp(data)\n", 87 | "print(expdata)" 88 | ] 89 | }, 90 | { 91 | "cell_type": "markdown", 92 | "metadata": {}, 93 | "source": [ 94 | "**Warning** Standard Python installation also includes the `math` library, but it does not play nicely with NumPy array and it may give different results than numpy function, so avoid using it with NumPy arrays.\n", 95 | "\n", 96 | "Some functions (such as mean, max, etc.) aggregate the data return arrays of less dimensions or scalars:" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": null, 102 | "metadata": {}, 103 | "outputs": [], 104 | "source": [ 105 | "sumdata = np.sum(data)\n", 106 | "print('sum data: {}'.format(sumdata))" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": null, 112 | "metadata": {}, 113 | "outputs": [], 114 | "source": [ 115 | "meandata = np.mean(data)\n", 116 | "print(meandata)" 117 | ] 118 | }, 119 | { 120 | "cell_type": "markdown", 121 | "metadata": {}, 122 | "source": [ 123 | "It's also possbile to average over a single axis:" 124 | ] 125 | }, 126 | { 127 | "cell_type": "markdown", 128 | "metadata": {}, 129 | "source": [ 130 | "Axis\n", 131 | "\"drawing\"" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": null, 137 | "metadata": {}, 138 | "outputs": [], 139 | "source": [ 140 | "np.mean(data, 0)" 141 | ] 142 | }, 143 | { 144 | "cell_type": "markdown", 145 | "metadata": {}, 146 | "source": [ 147 | "## $\\color{green}{\\text{Excercise}}$\n", 148 | "\n", 149 | "Create array `a` with random elements of shape 1000 x 3.\n", 150 | "Select the second and third column (index 1 and 2) and calculate the mean for each of the columns (i.e. 
your answer should be an array with two elements)" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": null, 156 | "metadata": {}, 157 | "outputs": [], 158 | "source": [ 159 | "a = np.random.randn(1000,3)\n", 160 | "print(a.shape)\n", 161 | "\n", 162 | "a_select = a[:,[1,2]]\n", 163 | "print(a_select.shape)\n", 164 | "\n", 165 | "mean_select = a_select.mean(0)\n", 166 | "print(mean_select)" 167 | ] 168 | }, 169 | { 170 | "cell_type": "markdown", 171 | "metadata": {}, 172 | "source": [ 173 | "## $\\color{green}{\\text{Excercise}}$\n", 174 | "\n", 175 | "Generate a 10 x 3 array of random numbers (using np.random.rand). From each row, find the column index of the element closest to 0.75. Make use of np.abs and np.argmin. The result should be a one-dimensional array of integers from 0 to 2." 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": null, 181 | "metadata": {}, 182 | "outputs": [], 183 | "source": [ 184 | "rand_array = np.random.rand(10,3)\n", 185 | "rand_array2 = rand_array - 0.75\n", 186 | "closest = np.argmin(np.abs(rand_array2),1)\n", 187 | "print(rand_array)\n", 188 | "print(closest)" 189 | ] 190 | }, 191 | { 192 | "cell_type": "markdown", 193 | "metadata": {}, 194 | "source": [ 195 | "## $\\color{green}{\\text{Excercise}}$ operations\n", 196 | "\n", 197 | "Average the inflammation data over the first ten patients (rows) and plot them across time (columns). Then repeat it for the next ten patients and so on. Try putting all averages on a single plot" 198 | ] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "execution_count": null, 203 | "metadata": {}, 204 | "outputs": [], 205 | "source": [ 206 | "import matplotlib.pylab as plt\n", 207 | "%matplotlib inline\n", 208 | "\n", 209 | "pat_0 = 0\n", 210 | "pat_last = 10\n", 211 | "while pat_last <= len(data):\n", 212 | " ave_inflammation = np.mean(data[pat_0:pat_last], axis=0)\n", 213 | " plt.plot(ave_inflammation)\n", 214 | " pat_0 = pat_last\n", 215 | " pat_last += 10\n", 216 | " " 217 | ] 218 | }, 219 | { 220 | "cell_type": "markdown", 221 | "metadata": {}, 222 | "source": [ 223 | "[Previous: Slices](slices.ipynb)
[Next: Stacking](stacking.ipynb) " 224 | ] 225 | } 226 | ], 227 | "metadata": { 228 | "kernelspec": { 229 | "display_name": "Python 3", 230 | "language": "python", 231 | "name": "python3" 232 | }, 233 | "language_info": { 234 | "codemirror_mode": { 235 | "name": "ipython", 236 | "version": 3 237 | }, 238 | "file_extension": ".py", 239 | "mimetype": "text/x-python", 240 | "name": "python", 241 | "nbconvert_exporter": "python", 242 | "pygments_lexer": "ipython3", 243 | "version": "3.7.2" 244 | } 245 | }, 246 | "nbformat": 4, 247 | "nbformat_minor": 2 248 | } 249 | -------------------------------------------------------------------------------- /Day_1_Scientific_Python/numpys/dataset_intro.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Working with a dataset" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "We will use first dataset called `inflammation-01.csv` from the data provided by Software Carpentry: [Analyzing Patient Data](https://swcarpentry.github.io/python-novice-inflammation-2.7/).
\n", 15 | "You should be able to find it in your `data/` directory. " 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "import numpy as np" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "`numpy.loadtxt` has two parameters: the name of the file we want to read and the delimiter that separates values on a line. These both need to be character strings." 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "data = np.loadtxt(fname='../data/inflammation-01.csv', delimiter=',')" 41 | ] 42 | }, 43 | { 44 | "cell_type": "markdown", 45 | "metadata": {}, 46 | "source": [ 47 | "We are studying inflammation in patients who have been given a new treatment for arthritis, and need to analyze the first dozen data sets of their daily inflammation. The data sets are stored in comma-separated values (CSV) format: each row holds information for a single patient, and the columns represent successive days:" 48 | ] 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "metadata": {}, 53 | "source": [ 54 | "### Explore array" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "print(data)" 64 | ] 65 | }, 66 | { 67 | "cell_type": "markdown", 68 | "metadata": {}, 69 | "source": [ 70 | "In contrast to lists NumPy arrays can store elements of pre-determined type only.\n", 71 | "The type function will only tell you that a variable is a NumPy array but won’t tell you the type of thing inside the array. We can find out the type of the data contained in the NumPy array." 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "print(data.dtype)" 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "metadata": {}, 86 | "source": [ 87 | "and the shape of the array" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": null, 93 | "metadata": {}, 94 | "outputs": [], 95 | "source": [ 96 | "print(data.shape)" 97 | ] 98 | }, 99 | { 100 | "cell_type": "markdown", 101 | "metadata": {}, 102 | "source": [ 103 | "meaning that data array variable contains 60 rows and 40 columns" 104 | ] 105 | }, 106 | { 107 | "cell_type": "markdown", 108 | "metadata": {}, 109 | "source": [ 110 | "## Plotting the data" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": null, 116 | "metadata": {}, 117 | "outputs": [], 118 | "source": [ 119 | "import matplotlib.pyplot as plt\n", 120 | "plt.imshow(data)\n", 121 | "plt.show()" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": null, 127 | "metadata": {}, 128 | "outputs": [], 129 | "source": [ 130 | "%matplotlib inline" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": null, 136 | "metadata": {}, 137 | "outputs": [], 138 | "source": [ 139 | "plt.imshow(data)" 140 | ] 141 | }, 142 | { 143 | "cell_type": "markdown", 144 | "metadata": {}, 145 | "source": [ 146 | "## Indexing" 147 | ] 148 | }, 149 | { 150 | "cell_type": "markdown", 151 | "metadata": {}, 152 | "source": [ 153 | "If we want to get a single number from the array, we must provide an index in square brackets after the variable name.
\n", 154 | "Note that the NumPy arrays are zero-indexed:" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": null, 160 | "metadata": {}, 161 | "outputs": [], 162 | "source": [ 163 | "data[0, 0]" 164 | ] 165 | }, 166 | { 167 | "cell_type": "markdown", 168 | "metadata": {}, 169 | "source": [ 170 | "It means that that the third element in the first row has an index of [0, 2]:" 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": null, 176 | "metadata": {}, 177 | "outputs": [], 178 | "source": [ 179 | "data[0, 2]" 180 | ] 181 | }, 182 | { 183 | "cell_type": "markdown", 184 | "metadata": {}, 185 | "source": [ 186 | "We can also assign the element with a new value:" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": null, 192 | "metadata": {}, 193 | "outputs": [], 194 | "source": [ 195 | "data[0, 2] = 100.\n", 196 | "print(data[0, 2])" 197 | ] 198 | }, 199 | { 200 | "cell_type": "markdown", 201 | "metadata": {}, 202 | "source": [ 203 | "NumPy (and Python in general) checks the bounds of the array:" 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": null, 209 | "metadata": {}, 210 | "outputs": [], 211 | "source": [ 212 | "print(data.shape)" 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": null, 218 | "metadata": {}, 219 | "outputs": [], 220 | "source": [ 221 | "#data[60, 0]" 222 | ] 223 | }, 224 | { 225 | "cell_type": "markdown", 226 | "metadata": {}, 227 | "source": [ 228 | "Finally, we can ask for several elements at once:" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": null, 234 | "metadata": {}, 235 | "outputs": [], 236 | "source": [ 237 | "data[0, [0, 10]]" 238 | ] 239 | }, 240 | { 241 | "cell_type": "markdown", 242 | "metadata": {}, 243 | "source": [ 244 | "## $\\color{green}{\\text{Excercise}}$\n", 245 | "\n", 246 | "Create the following array, call it `a`:\n", 247 | "\n", 248 | "a =
\n", 249 | "```\n", 250 | "2 7 12 0\n", 251 | "3 9 3 4\n", 252 | "4 0 1 3\n", 253 | "```\n", 254 | "\n", 255 | "use `a` to assign 4, 1 and 3 from the third row to array `b` \n" 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": null, 261 | "metadata": {}, 262 | "outputs": [], 263 | "source": [] 264 | }, 265 | { 266 | "cell_type": "markdown", 267 | "metadata": {}, 268 | "source": [ 269 | "[Previous: Numpy intro](numpy_intro.ipynb)
[Filtering data](filtering_data.ipynb)" 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": null, 275 | "metadata": {}, 276 | "outputs": [], 277 | "source": [] 278 | } 279 | ], 280 | "metadata": { 281 | "kernelspec": { 282 | "display_name": "Python 3", 283 | "language": "python", 284 | "name": "python3" 285 | }, 286 | "language_info": { 287 | "codemirror_mode": { 288 | "name": "ipython", 289 | "version": 3 290 | }, 291 | "file_extension": ".py", 292 | "mimetype": "text/x-python", 293 | "name": "python", 294 | "nbconvert_exporter": "python", 295 | "pygments_lexer": "ipython3", 296 | "version": "3.7.2" 297 | } 298 | }, 299 | "nbformat": 4, 300 | "nbformat_minor": 2 301 | } 302 | -------------------------------------------------------------------------------- /Day_1_Scientific_Python/numpy_with_answers/numpys/dataset_intro.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Working with a dataset" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "We will use first dataset called `inflammation-01.csv` from the data provided by Software Carpentry: [Analyzing Patient Data](https://swcarpentry.github.io/python-novice-inflammation-2.7/).
\n", 15 | "You should be able to find it in your `data/` directory. " 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "import numpy as np" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "`numpy.loadtxt` has two parameters: the name of the file we want to read and the delimiter that separates values on a line. These both need to be character strings." 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "data = np.loadtxt(fname='../data/inflammation-01.csv', delimiter=',')" 41 | ] 42 | }, 43 | { 44 | "cell_type": "markdown", 45 | "metadata": {}, 46 | "source": [ 47 | "We are studying inflammation in patients who have been given a new treatment for arthritis, and need to analyze the first dozen data sets of their daily inflammation. The data sets are stored in comma-separated values (CSV) format: each row holds information for a single patient, and the columns represent successive days:" 48 | ] 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "metadata": {}, 53 | "source": [ 54 | "### Explore array" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "print(data)" 64 | ] 65 | }, 66 | { 67 | "cell_type": "markdown", 68 | "metadata": {}, 69 | "source": [ 70 | "In contrast to lists NumPy arrays can store elements of pre-determined type only.\n", 71 | "The type function will only tell you that a variable is a NumPy array but won’t tell you the type of thing inside the array. We can find out the type of the data contained in the NumPy array." 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "print(data.dtype)" 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "metadata": {}, 86 | "source": [ 87 | "and the shape of the array" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": null, 93 | "metadata": {}, 94 | "outputs": [], 95 | "source": [ 96 | "print(data.shape)" 97 | ] 98 | }, 99 | { 100 | "cell_type": "markdown", 101 | "metadata": {}, 102 | "source": [ 103 | "meaning that data array variable contains 60 rows and 40 columns" 104 | ] 105 | }, 106 | { 107 | "cell_type": "markdown", 108 | "metadata": {}, 109 | "source": [ 110 | "## Plotting the data" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": null, 116 | "metadata": {}, 117 | "outputs": [], 118 | "source": [ 119 | "import matplotlib.pyplot as plt\n", 120 | "plt.imshow(data)\n", 121 | "plt.show()" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": null, 127 | "metadata": {}, 128 | "outputs": [], 129 | "source": [ 130 | "%matplotlib inline" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": null, 136 | "metadata": {}, 137 | "outputs": [], 138 | "source": [ 139 | "plt.imshow(data)" 140 | ] 141 | }, 142 | { 143 | "cell_type": "markdown", 144 | "metadata": {}, 145 | "source": [ 146 | "## Indexing" 147 | ] 148 | }, 149 | { 150 | "cell_type": "markdown", 151 | "metadata": {}, 152 | "source": [ 153 | "If we want to get a single number from the array, we must provide an index in square brackets after the variable name.
\n", 154 | "Note that the NumPy arrays are zero-indexed:" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": null, 160 | "metadata": {}, 161 | "outputs": [], 162 | "source": [ 163 | "data[0, 0]" 164 | ] 165 | }, 166 | { 167 | "cell_type": "markdown", 168 | "metadata": {}, 169 | "source": [ 170 | "It means that that the third element in the first row has an index of [0, 2]:" 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": null, 176 | "metadata": {}, 177 | "outputs": [], 178 | "source": [ 179 | "data[0, 2]" 180 | ] 181 | }, 182 | { 183 | "cell_type": "markdown", 184 | "metadata": {}, 185 | "source": [ 186 | "We can also assign the element with a new value:" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": null, 192 | "metadata": {}, 193 | "outputs": [], 194 | "source": [ 195 | "data[0, 2] = 100.\n", 196 | "print(data[0, 2])" 197 | ] 198 | }, 199 | { 200 | "cell_type": "markdown", 201 | "metadata": {}, 202 | "source": [ 203 | "NumPy (and Python in general) checks the bounds of the array:" 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": null, 209 | "metadata": {}, 210 | "outputs": [], 211 | "source": [ 212 | "print(data.shape)" 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": null, 218 | "metadata": {}, 219 | "outputs": [], 220 | "source": [ 221 | "#data[60, 0]" 222 | ] 223 | }, 224 | { 225 | "cell_type": "markdown", 226 | "metadata": {}, 227 | "source": [ 228 | "Finally, we can ask for several elements at once:" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": null, 234 | "metadata": {}, 235 | "outputs": [], 236 | "source": [ 237 | "data[0, [0, 10]]" 238 | ] 239 | }, 240 | { 241 | "cell_type": "markdown", 242 | "metadata": {}, 243 | "source": [ 244 | "## $\\color{green}{\\text{Excercise}}$\n", 245 | "\n", 246 | "Create the following array, call it `a`:\n", 247 | "\n", 248 | "a =
\n", 249 | "```\n", 250 | "2 7 12 0\n", 251 | "3 9 3 4\n", 252 | "4 0 1 3\n", 253 | "```\n", 254 | "\n", 255 | "use `a` to assign 4, 1 and 3 from the third row to array `b` \n" 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": null, 261 | "metadata": {}, 262 | "outputs": [], 263 | "source": [ 264 | "a = np.array([[2, 7, 12, 0],[3, 9, 3, 4],[4, 0, 1, 3]])\n", 265 | "print(a)" 266 | ] 267 | }, 268 | { 269 | "cell_type": "code", 270 | "execution_count": null, 271 | "metadata": {}, 272 | "outputs": [], 273 | "source": [ 274 | "b = a[2, [0,2,3]]\n", 275 | "print(b)\n", 276 | "type(b)" 277 | ] 278 | }, 279 | { 280 | "cell_type": "markdown", 281 | "metadata": {}, 282 | "source": [ 283 | "[Previous: Numpy intro](numpy_intro.ipynb)
[Filtering data](filtering_data.ipynb)" 284 | ] 285 | } 286 | ], 287 | "metadata": { 288 | "kernelspec": { 289 | "display_name": "Python 3", 290 | "language": "python", 291 | "name": "python3" 292 | }, 293 | "language_info": { 294 | "codemirror_mode": { 295 | "name": "ipython", 296 | "version": 3 297 | }, 298 | "file_extension": ".py", 299 | "mimetype": "text/x-python", 300 | "name": "python", 301 | "nbconvert_exporter": "python", 302 | "pygments_lexer": "ipython3", 303 | "version": "3.7.2" 304 | } 305 | }, 306 | "nbformat": 4, 307 | "nbformat_minor": 2 308 | } 309 | -------------------------------------------------------------------------------- /Day_2_Machine_Learning_Python/03_basic_preprocessing_categorical_variables_exercise_02_solution.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Solution for Exercise 03\n", 8 | "\n", 9 | "The goal of this exercise is to evaluate the impact of feature preprocessing on a pipeline that uses a decision-tree-based classifier instead of logistic regression.\n", 10 | "\n", 11 | "- The first question is to empirically evaluate whether scaling numerical feature is helpful or not;\n", 12 | "\n", 13 | "- The second question is to evaluate whether it is empirically better (both from a computational and a statistical perspective) to use integer coded or one-hot encoded categories.\n", 14 | "\n", 15 | "\n", 16 | "Hint: `HistGradientBoostingClassifier` does not yet support sparse input data. You might want to use\n", 17 | "`OneHotEncoder(handle_unknown=\"ignore\", sparse=False)` to force the use a dense representation as a workaround." 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": null, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "import pandas as pd\n", 27 | "from sklearn.model_selection import cross_val_score\n", 28 | "from sklearn.pipeline import make_pipeline\n", 29 | "from sklearn.compose import ColumnTransformer\n", 30 | "from sklearn.preprocessing import OrdinalEncoder\n", 31 | "from sklearn.experimental import enable_hist_gradient_boosting\n", 32 | "from sklearn.ensemble import HistGradientBoostingClassifier\n", 33 | "\n", 34 | "df = pd.read_csv(\n", 35 | " \"https://www.openml.org/data/get_csv/1595261/adult-census.csv\")\n", 36 | "\n", 37 | "# Or use the local copy:\n", 38 | "# df = pd.read_csv('../datasets/adult-census.csv')" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "target_name = \"class\"\n", 48 | "target = df[target_name].to_numpy()\n", 49 | "data = df.drop(columns=[target_name, \"fnlwgt\"])" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "numerical_columns = [\n", 59 | " c for c in data.columns if data[c].dtype.kind in [\"i\", \"f\"]]\n", 60 | "categorical_columns = [\n", 61 | " c for c in data.columns if data[c].dtype.kind not in [\"i\", \"f\"]]\n", 62 | "\n", 63 | "categories = [\n", 64 | " data[column].unique() for column in data[categorical_columns]]" 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "metadata": {}, 70 | "source": [ 71 | "## Reference pipeline (no numerical scaling and integer-coded categories)\n", 72 | "\n", 73 | "First let's time the pipeline we used in the main notebook to serve as a reference:" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 
| "execution_count": null, 79 | "metadata": {}, 80 | "outputs": [], 81 | "source": [ 82 | "%%time\n", 83 | "\n", 84 | "preprocessor = ColumnTransformer([\n", 85 | " ('categorical', OrdinalEncoder(categories=categories),\n", 86 | " categorical_columns),], remainder=\"passthrough\")\n", 87 | "\n", 88 | "model = make_pipeline(preprocessor, HistGradientBoostingClassifier())\n", 89 | "scores = cross_val_score(model, data, target)\n", 90 | "print(f\"The different scores obtained are: \\n{scores}\")\n", 91 | "print(f\"The accuracy is: {scores.mean():.3f} +- {scores.std():.3f}\")" 92 | ] 93 | }, 94 | { 95 | "cell_type": "markdown", 96 | "metadata": {}, 97 | "source": [ 98 | "## Scaling numerical features" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": null, 104 | "metadata": {}, 105 | "outputs": [], 106 | "source": [ 107 | "%%time\n", 108 | "from sklearn.preprocessing import StandardScaler\n", 109 | "\n", 110 | "preprocessor = ColumnTransformer([\n", 111 | " ('numerical', StandardScaler(), numerical_columns),\n", 112 | " ('categorical', OrdinalEncoder(categories=categories),\n", 113 | " categorical_columns),])\n", 114 | "\n", 115 | "model = make_pipeline(preprocessor, HistGradientBoostingClassifier())\n", 116 | "scores = cross_val_score(model, data, target)\n", 117 | "print(f\"The different scores obtained are: \\n{scores}\")\n", 118 | "print(f\"The accuracy is: {scores.mean():.3f} +- {scores.std():.3f}\")" 119 | ] 120 | }, 121 | { 122 | "cell_type": "markdown", 123 | "metadata": {}, 124 | "source": [ 125 | "### Analysis\n", 126 | "\n", 127 | "We can observe that both the accuracy and the training time are approximately the same as the reference pipeline (any time difference you might observe is not significant).\n", 128 | "\n", 129 | "Scaling numerical features is indeed useless for most decision tree models in general and for `HistGradientBoostingClassifier` in particular." 130 | ] 131 | }, 132 | { 133 | "cell_type": "markdown", 134 | "metadata": {}, 135 | "source": [ 136 | "## One-hot encoding of categorical variables\n", 137 | "\n", 138 | "For linear models, we have observed that integer coding of categorical\n", 139 | "variables can be very detrimental. 
However for\n", 140 | "`HistGradientBoostingClassifier` models, it does not seem to be the\n", 141 | "case as the cross-validation of the reference pipeline with\n", 142 | "`OrdinalEncoder` is good.\n", 143 | "\n", 144 | "Let's see if we can get an even better accuracy with `OneHotEncoding`:" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": null, 150 | "metadata": {}, 151 | "outputs": [], 152 | "source": [ 153 | "%%time\n", 154 | "from sklearn.preprocessing import OneHotEncoder\n", 155 | "\n", 156 | "preprocessor = ColumnTransformer([\n", 157 | " ('categorical',\n", 158 | " OneHotEncoder(handle_unknown=\"ignore\", sparse=False),\n", 159 | " categorical_columns),], remainder=\"passthrough\")\n", 160 | "\n", 161 | "model = make_pipeline(preprocessor, HistGradientBoostingClassifier())\n", 162 | "scores = cross_val_score(model, data, target)\n", 163 | "print(f\"The different scores obtained are: \\n{scores}\")\n", 164 | "print(f\"The accuracy is: {scores.mean():.3f} +- {scores.std():.3f}\")" 165 | ] 166 | }, 167 | { 168 | "cell_type": "markdown", 169 | "metadata": {}, 170 | "source": [ 171 | "### Analysis\n", 172 | "\n", 173 | "From an accuracy point of view, the result is almost exactly the same.\n", 174 | "The reason is that `HistGradientBoostingClassifier` is expressive\n", 175 | "and robust enough to deal with misleading ordering of integer coded\n", 176 | "categories (which was not the case for linear models).\n", 177 | "\n", 178 | "However from a computation point of view, the training time is\n", 179 | "significantly longer: this is caused by the fact that `OneHotEncoder`\n", 180 | "generates approximately 10 times more features than `OrdinalEncoder`.\n", 181 | "\n", 182 | "Note that the current implementation `HistGradientBoostingClassifier`\n", 183 | "is still incomplete, and once sparse representation are handled\n", 184 | "correctly, training time might improve with such kinds of encodings.\n", 185 | "\n", 186 | "The main take away message is that arbitrary integer coding of\n", 187 | "categories is perfectly fine for `HistGradientBoostingClassifier`\n", 188 | "and yields fast training times." 189 | ] 190 | } 191 | ], 192 | "metadata": { 193 | "jupytext": { 194 | "formats": "python_scripts//py:percent,notebooks//ipynb" 195 | }, 196 | "kernelspec": { 197 | "display_name": "Python 3", 198 | "language": "python", 199 | "name": "python3" 200 | } 201 | }, 202 | "nbformat": 4, 203 | "nbformat_minor": 2 204 | } 205 | --------------------------------------------------------------------------------