├── .gitignore ├── README.md ├── cousera ├── howtowin_kaggle │ ├── week_1 │ │ ├── 1_1_introduction.srt │ │ ├── 1_1_introduction.srt.style │ │ ├── 1_1_introduction.vtt │ │ ├── 1_2_meet_your_lecturers.srt │ │ ├── 1_2_meet_your_lecturers.srt.style │ │ ├── 1_2_meet_your_lecturers.vtt │ │ ├── 1_3_cource_overview.srt │ │ ├── 1_3_cource_overview.srt.style │ │ ├── 1_3_cource_overview.vtt │ │ ├── 2_1_competition_mechanics.srt │ │ ├── 2_1_competition_mechanics.srt.style │ │ ├── 2_1_competition_mechanics.vtt │ │ ├── 2_2_kaggle_overview.srt │ │ ├── 2_2_kaggle_overview.srt.style │ │ ├── 2_2_kaggle_overview.vtt │ │ ├── 2_3_real_world.srt │ │ ├── 2_3_real_world.srt.style │ │ ├── 2_3_real_world.vtt │ │ ├── 3_1_recap.srt │ │ ├── 3_1_recap.srt.style │ │ ├── 3_1_recap.vtt │ │ ├── 4_1_Software_Hardware_Requirements.srt │ │ ├── 4_1_Software_Hardware_Requirements.srt.style │ │ ├── 4_1_Software_Hardware_Requirements.vtt │ │ ├── 5_1_overview.srt │ │ ├── 5_1_overview.srt.style │ │ ├── 5_1_overview.vtt │ │ ├── 5_2_numeric_feature.srt │ │ ├── 5_2_numeric_feature.srt.style │ │ ├── 5_2_numeric_feature.vtt │ │ ├── 5_3_categorical_and_ordinal.srt │ │ ├── 5_3_categorical_and_ordinal.srt.style │ │ ├── 5_3_categorical_and_ordinal.vtt │ │ ├── 5_4_datetime_and_coordinates.srt │ │ ├── 5_4_datetime_and_coordinates.srt.style │ │ ├── 5_4_datetime_and_coordinates.vtt │ │ ├── 5_5_handling_missing_values.srt │ │ ├── 5_5_handling_missing_values.srt.style │ │ ├── 5_5_handling_missing_values.vtt │ │ ├── 6_1_bag_of_words.srt │ │ ├── 6_1_bag_of_words.srt.style │ │ ├── 6_1_bag_of_words.vtt │ │ ├── 6_2_word2vec.srt │ │ ├── 6_2_word2vec.srt.style │ │ ├── 6_2_word2vec.vtt │ │ ├── 7_1_final_project_overview.srt │ │ ├── 7_1_final_project_overview.srt.style │ │ ├── 7_1_final_project_overview.vtt │ │ └── ここにmp4をおいて再生してね │ ├── week_2 │ │ ├── 1_1_Exploratory_data analysis.srt │ │ ├── 1_1_Exploratory_data analysis.srt.style │ │ ├── 1_1_Exploratory_data analysis.vtt │ │ ├── 1_2_Building_intuition_about_the_data.srt │ │ ├── 
1_2_Building_intuition_about_the_data.srt.style │ │ ├── 1_2_Building_intuition_about_the_data.vtt │ │ ├── 1_3_Exploring_anonymized_data.srt │ │ ├── 1_3_Exploring_anonymized_data.srt.style │ │ ├── 1_3_Exploring_anonymized_data.vtt │ │ ├── 1_4_Visualizations.srt │ │ ├── 1_4_Visualizations.srt.style │ │ ├── 1_4_Visualizations.vtt │ │ ├── 1_5_Dataset_cleaning_and_other_things_to_check.srt │ │ ├── 1_5_Dataset_cleaning_and_other_things_to_check.srt.style │ │ ├── 1_5_Dataset_cleaning_and_other_things_to_check.vtt │ │ ├── 1_6_Springleaf_competition_EDA_I.srt │ │ ├── 1_6_Springleaf_competition_EDA_I.srt.style │ │ ├── 1_6_Springleaf_competition_EDA_I.vtt │ │ ├── 1_7_Springleaf_competition_EDA_II.srt │ │ ├── 1_7_Springleaf_competition_EDA_II.srt.style │ │ ├── 1_7_Springleaf_competition_EDA_II.vtt │ │ ├── 1_8_Numerai_competition_EDA.srt │ │ ├── 1_8_Numerai_competition_EDA.srt.style │ │ ├── 1_8_Numerai_competition_EDA.vtt │ │ ├── 2_1_Validation and overfitting.srt │ │ ├── 2_1_Validation and overfitting.srt.style │ │ ├── 2_1_Validation and overfitting.vtt │ │ ├── 2_2_Validation strategies.srt │ │ ├── 2_2_Validation strategies.srt.style │ │ ├── 2_2_Validation strategies.vtt │ │ ├── 2_3_Data splitting strategies.srt │ │ ├── 2_3_Data splitting strategies.srt.style │ │ ├── 2_3_Data splitting strategies.vtt │ │ ├── 2_4_Problems occurring during validation.srt │ │ ├── 2_4_Problems occurring during validation.srt.style │ │ ├── 2_4_Problems occurring during validation.vtt │ │ ├── 3_1_Basic data leaks.srt │ │ ├── 3_1_Basic data leaks.srt.style │ │ ├── 3_1_Basic data leaks.vtt │ │ ├── 3_2_Leaderboard probing and examples of rare data leaks.srt │ │ ├── 3_2_Leaderboard probing and examples of rare data leaks.srt.style │ │ ├── 3_2_Leaderboard probing and examples of rare data leaks.vtt │ │ ├── 3_3_Expedia challenge.srt │ │ ├── 3_3_Expedia challenge.srt.style │ │ └── 3_3_Expedia challenge.vtt │ ├── week_3 │ │ ├── 1_1_Motivation.srt │ │ ├── 1_1_Motivation.srt.style │ │ ├── 1_1_Motivation.vtt │ 
│ ├── 1_2_Regression_metrics_review1.srt │ │ ├── 1_2_Regression_metrics_review1.srt.style │ │ ├── 1_2_Regression_metrics_review1.vtt │ │ ├── 1_3_Regression_metrics_review2.srt │ │ ├── 1_3_Regression_metrics_review2.srt.style │ │ ├── 1_3_Regression_metrics_review2.vtt │ │ ├── 1_4_Classification_metrics_review.srt │ │ ├── 1_4_Classification_metrics_review.srt.style │ │ ├── 1_4_Classification_metrics_review.vtt │ │ ├── 1_5_General_approaches.srt │ │ ├── 1_5_General_approaches.srt.style │ │ ├── 1_5_General_approaches.vtt │ │ ├── 1_6_Regression_metrics_optimization.srt │ │ ├── 1_6_Regression_metrics_optimization.srt.style │ │ ├── 1_6_Regression_metrics_optimization.vtt │ │ ├── 1_7_Classification_metrics_optimization_1.srt │ │ ├── 1_7_Classification_metrics_optimization_1.srt.style │ │ ├── 1_7_Classification_metrics_optimization_1.vtt │ │ ├── 1_8_Classification_metrics_optimization_2.srt │ │ ├── 1_8_Classification_metrics_optimization_2.srt.style │ │ ├── 1_8_Classification_metrics_optimization_2.vtt │ │ ├── 2_1_Concept_of_mean_encoding.srt │ │ ├── 2_1_Concept_of_mean_encoding.srt.style │ │ ├── 2_1_Concept_of_mean_encoding.vtt │ │ ├── 2_2_Regularization.srt │ │ ├── 2_2_Regularization.srt.style │ │ ├── 2_2_Regularization.vtt │ │ ├── 2_3_Extensions_and_generalizations.srt │ │ ├── 2_3_Extensions_and_generalizations.srt.style │ │ └── 2_3_Extensions_and_generalizations.vtt │ ├── week_4 │ │ ├── 1_1_Hyperparameter_tuning_1.srt │ │ ├── 1_1_Hyperparameter_tuning_1.srt.style │ │ ├── 1_1_Hyperparameter_tuning_1.vtt │ │ ├── 1_2_Hyperparameter_tuning_2.srt │ │ ├── 1_2_Hyperparameter_tuning_2.srt.style │ │ ├── 1_2_Hyperparameter_tuning_2.vtt │ │ ├── 1_3_Hyperparameter_tuning_3.srt │ │ ├── 1_3_Hyperparameter_tuning_3.srt.style │ │ ├── 1_3_Hyperparameter_tuning_3.vtt │ │ ├── 1_4_Practical_guide.srt │ │ ├── 1_4_Practical_guide.srt.style │ │ ├── 1_4_Practical_guide.vtt │ │ ├── 1_5_KazAnova's competition pipeline, part 1.srt │ │ ├── 1_5_KazAnova's competition pipeline, part 1.srt.style │ │ 
├── 1_5_KazAnova's competition pipeline, part 1.vtt │ │ ├── 1_6_KazAnova's competition pipeline, part 2.srt │ │ ├── 1_6_KazAnova's competition pipeline, part 2.srt.style │ │ ├── 1_6_KazAnova's competition pipeline, part 2.vtt │ │ ├── 2_1_Statistics and distance based features.srt │ │ ├── 2_1_Statistics and distance based features.srt.style │ │ ├── 2_1_Statistics and distance based features.vtt │ │ ├── 2_2_Matrix factorizations.srt │ │ ├── 2_2_Matrix factorizations.srt.style │ │ ├── 2_2_Matrix factorizations.vtt │ │ ├── 2_3_Feature Interactions.srt │ │ ├── 2_3_Feature Interactions.srt.style │ │ ├── 2_3_Feature Interactions.vtt │ │ ├── 2_4_t-SNE.srt │ │ ├── 2_4_t-SNE.srt.style │ │ ├── 2_4_t-SNE.vtt │ │ ├── 3_1_Introduction into ensemble methods.srt │ │ ├── 3_1_Introduction into ensemble methods.srt.style │ │ ├── 3_1_Introduction into ensemble methods.vtt │ │ ├── 3_2_Bagging.srt │ │ ├── 3_2_Bagging.srt.style │ │ ├── 3_2_Bagging.vtt │ │ ├── 3_3_Boosting.srt │ │ ├── 3_3_Boosting.srt.style │ │ ├── 3_3_Boosting.vtt │ │ ├── 3_4_Stacking.srt │ │ ├── 3_4_Stacking.srt.style │ │ ├── 3_4_Stacking.vtt │ │ ├── 3_5_StackNet.srt │ │ ├── 3_5_StackNet.srt.style │ │ ├── 3_5_StackNet.vtt │ │ ├── 3_6_Ensembling Tips and Tricks.srt │ │ ├── 3_6_Ensembling Tips and Tricks.srt.style │ │ └── 3_6_Ensembling Tips and Tricks.vtt │ └── week_5 │ │ ├── 1_Crowdflower Competition.srt │ │ ├── 1_Crowdflower Competition.srt.style │ │ ├── 1_Crowdflower Competition.vtt │ │ ├── 1_Crowdflower\ Competition.srt │ │ ├── 2_Springleaf Marketing Response.srt │ │ ├── 2_Springleaf Marketing Response.srt.style │ │ ├── 2_Springleaf Marketing Response.vtt │ │ ├── 3_Microsoft Malware Classification Challenge.srt │ │ ├── 3_Microsoft Malware Classification Challenge.srt.style │ │ ├── 3_Microsoft Malware Classification Challenge.vtt │ │ ├── 4_Walmart Trip Type Classification.srt │ │ ├── 4_Walmart Trip Type Classification.srt.style │ │ ├── 4_Walmart Trip Type Classification.vtt │ │ ├── 5_Acquire Valued Shoppers Challenge 
part 1.srt │ │ ├── 5_Acquire Valued Shoppers Challenge part 1.srt.style │ │ ├── 5_Acquire Valued Shoppers Challenge part 1.vtt │ │ ├── 6_Acquire Valued Shoppers Challenge part 2.srt │ │ ├── 6_Acquire Valued Shoppers Challenge part 2.srt.style │ │ └── 6_Acquire Valued Shoppers Challenge part 2.vtt ├── script │ └── translate.py └── share │ ├── Week1_(T.Shimano).pdf │ ├── Week_1(T.Nakao).pdf │ ├── Week_1(T.Nakao).pptx │ ├── Week_1(takagishi).pdf │ ├── Week_1(takagishi).pptx │ ├── week3_門脇_Concept of mean encoding.pptx │ └── week3_門脇_Regularization.pptx └── wiki └── cousera ├── 3week_cls_image01.jpg ├── 3week_cls_image02.jpg ├── 3week_cls_image03.jpg ├── 3week_cls_image04.jpg ├── 3week_cls_image05.jpg ├── 3week_cls_image06.jpg ├── 3week_cls_image07.jpg ├── 3week_cls_image08.jpg ├── 3week_cls_image09.jpg ├── 3week_cls_image10.jpg ├── 3week_cls_image11.jpg ├── 3week_cls_image12.jpg ├── 3week_cls_image13.jpg ├── 3week_image001.png ├── 3week_image004.png ├── 3week_image007.png ├── 3week_image011.png ├── 3week_image012.png ├── 3week_image022.png ├── 3week_image025.png ├── 3week_image026.png ├── 3week_image027.png ├── 3week_image028.png ├── 3week_image029.png ├── 3week_image030.png ├── 3week_mean-encoding001.png ├── 3week_mean-encoding002.png ├── 3week_mean-encoding003.png ├── 3week_mean-encoding004.png ├── 3week_mean-encoding005.png ├── 3week_mean-encoding006.png ├── 3week_mean-encoding007.png ├── 3week_mean-encoding008.png ├── 3week_mean-encoding009.png ├── 3week_mean-encoding010.png ├── 3week_mean-encoding011.png ├── 3week_mean-encoding012.png ├── 3week_mean-encoding013.png ├── 3week_mean-encoding014.png ├── 3week_mean-encoding015.png ├── 3week_mean-encoding016.png ├── 3week_mean-encoding017.png ├── 3week_mean-encoding018.png ├── 4week_AdvancedFeatures001.png ├── 4week_AdvancedFeatures002.png ├── 4week_AdvancedFeatures003.png ├── 4week_AdvancedFeatures004.png ├── 4week_AdvancedFeatures005.png ├── 4week_AdvancedFeatures006.png ├── 4week_AdvancedFeatures007.png ├── 
4week_AdvancedFeatures008.png ├── 4week_AdvancedFeatures009.png ├── 4week_AdvancedFeatures010.png ├── 4week_Ensemble_Tips1.png ├── 4week_Ensemble_Tips2.png ├── 4week_Ensemble_Tips3.png ├── 4week_Ensemble_Tips4.png ├── 4week_KNNfeatures ├── compute_KNN_features.ipynb ├── data │ ├── knn_feats_cosine_test.npy │ ├── knn_feats_cosine_train.npy.zip │ ├── knn_feats_minkowski_test.npy │ └── knn_feats_minkowski_train.npy.zip └── grader.py ├── 4week_NeuralNet001.png ├── 4week_Practicalguide001.png ├── 4week_Practicalguide002.png ├── 4week_Practicalguide003.png ├── 4week_Practicalguide004.png ├── 4week_StackNet1.png ├── 4week_StackNet2.png ├── 4week_Stacking1.png ├── 4week_ensemble_bagging1.jpg ├── 4week_ensemble_bagging2.PNG ├── 4week_ensemble_bagging3.PNG ├── 4week_ensemble_bagging4.PNG ├── 4week_ensemble_bagging5.PNG ├── 4week_ensemble_boosting1.PNG ├── 4week_ensemble_boosting2.PNG ├── 4week_ensemble_boosting3.PNG ├── 4week_ensemble_boosting4.PNG ├── 4week_ensemble_boosting5.PNG ├── 4week_ensemble_boosting6.PNG ├── 4week_ensemble_boosting7.PNG ├── 4week_ensemble_boosting8.PNG ├── 4week_ensemble_intro1.png ├── 4week_ensemble_intro2.png ├── 4week_ensemble_intro3.png ├── 4week_ensemble_intro4.png ├── 5week_image_tn_001.png ├── 5week_image_tn_002.png ├── 5week_image_tn_003.png ├── 5week_image_tn_004.png ├── 5week_image_tn_005.png ├── 5week_image_tn_006.png ├── 5week_image_tn_007.png ├── 5week_image_tn_008.png ├── 5week_image_tn_009.png ├── 5week_image_tn_101.png ├── 5week_image_tn_102.png ├── 5week_image_tn_103.png ├── 5week_image_tn_104.png ├── 5week_image_tn_105.png ├── 5week_image_tn_106.png ├── 5week_image_tn_107.png ├── 5week_image_tn_108.png ├── 5week_image_tn_109.png ├── 5week_image_tn_110.png ├── 5week_image_tn_111.png ├── 5week_image_tn_112.png ├── 5week_image_tn_113.png ├── Programming assignment, week 4_ Ensembles ├── Programming_assignment_week_4-Copy1.ipynb ├── Programming_assignment_week_4.ipynb ├── __pycache__ │ └── grader.cpython-36.pyc └── grader.py ├── 
clone1.png ├── translate_1.png ├── week1_program_plot.png ├── week5_cf1.PNG ├── week5_cf2.PNG ├── week5_cf3.PNG ├── week5_cf4.PNG ├── week5_cf5.PNG ├── week5_cf6.PNG ├── week5_cf7.PNG ├── week5_mm1.PNG ├── week5_mm2.PNG └── week5_sl1.PNG /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 
103 | # mypy 104 | .mypy_cache/ 105 | cousera/*/*.mp4 106 | cousera/howtowin_kaggle/*/*mp4 107 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # spade 2 | 勉強会用にリポジトリを作成しました 3 | 4 | ## 使用用途 5 | 各種情報共有(ドキュメント・wiki) 6 | 7 | ## その他 8 | spadeって名前はあんまり気にしないで下さい 9 | 単なるproject code的な 10 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_1/1_1_introduction.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_1/1_1_introduction.srt -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_1/1_1_introduction.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_1/1_1_introduction.vtt: -------------------------------------------------------------------------------- 1 | WEBVTT 2 | 3 | 1 4 | 00:00:00.390 --> 00:00:05.447 5 | [MUSIC] 6 | 7 | 2 8 | 00:00:05.447 --> 00:00:07.250 9 | Hello, and welcome. 10 | 11 | 3 12 | 00:00:07.250 --> 00:00:08.170 13 | My name is Dimitri, and 14 | 15 | 4 16 | 00:00:08.170 --> 00:00:12.060 17 | I'm happy to see you are interested 18 | in competitive data science. 
19 | 20 | 5 21 | 00:00:12.060 --> 00:00:14.710 22 | Data science is all about 23 | machine learning applications. 24 | 25 | 6 26 | 00:00:14.710 --> 00:00:17.640 27 | And in data science, like everywhere else, 28 | people are looking for 29 | 30 | 7 31 | 00:00:17.640 --> 00:00:20.380 32 | the very best solutions to their problems. 33 | 34 | 8 35 | 00:00:20.380 --> 00:00:24.240 36 | They're looking for the models that 37 | have the best predictive capabilities, 38 | 39 | 9 40 | 00:00:24.240 --> 00:00:27.850 41 | the models that make as 42 | few mistakes as possible. 43 | 44 | 10 45 | 00:00:27.850 --> 00:00:31.830 46 | And the competition for one becomes 47 | an essential way to find such solutions. 48 | 49 | 11 50 | 00:00:31.830 --> 00:00:35.340 51 | Competing for the prize, 52 | participants push through the limits, 53 | 54 | 12 55 | 00:00:35.340 --> 00:00:37.680 56 | come up with novel ideas. 57 | 58 | 13 59 | 00:00:37.680 --> 00:00:41.250 60 | Companies organize data science 61 | competitions to get top quality models for 62 | 63 | 14 64 | 00:00:41.250 --> 00:00:42.940 65 | not so high price. 66 | 67 | 15 68 | 00:00:42.940 --> 00:00:45.980 69 | And for data scientists, 70 | competitions become a truly unique 71 | 72 | 16 73 | 00:00:45.980 --> 00:00:49.215 74 | opportunity to learn, well, 75 | and of course win a prize. 76 | 77 | 17 78 | 00:00:50.360 --> 00:00:54.090 79 | This course is a chance for you to catch 80 | up on the trends in competitive data 81 | 82 | 18 83 | 00:00:54.090 --> 00:00:58.060 84 | science and learn what we, 85 | competition addicts and at the same time, 86 | 87 | 19 88 | 00:00:58.060 --> 00:01:01.239 89 | lecturers of this course, 90 | have already learned while competing. 
91 | 92 | 20 93 | 00:01:02.390 --> 00:01:05.976 94 | In this course, we will go through 95 | competition solving process step by 96 | 97 | 21 98 | 00:01:05.976 --> 00:01:09.982 99 | step and tell you about exploratory data 100 | analysis, basic and advanced feature 101 | 102 | 22 103 | 00:01:09.982 --> 00:01:13.712 104 | generation and preprocessing, 105 | various model validation techniques. 106 | 107 | 23 108 | 00:01:13.712 --> 00:01:18.498 109 | Data leakages, competition's metric 110 | optimization, model ensembling, 111 | 112 | 24 113 | 00:01:18.498 --> 00:01:20.370 114 | and hyperparameter tuning. 115 | 116 | 25 117 | 00:01:20.370 --> 00:01:25.050 118 | We've put together all our experience and 119 | created this course for you. 120 | 121 | 26 122 | 00:01:25.050 --> 00:01:26.520 123 | We've also designed quizzes and 124 | 125 | 27 126 | 00:01:26.520 --> 00:01:31.000 127 | programming assignments to let you 128 | apply your newly acquired skills. 129 | 130 | 28 131 | 00:01:31.000 --> 00:01:34.570 132 | Moreover, as a final project, you will 133 | have an opportunity to compete with 134 | 135 | 29 136 | 00:01:34.570 --> 00:01:37.545 137 | other students and 138 | participate in a special competition, 139 | 140 | 30 141 | 00:01:37.545 --> 00:01:43.460 142 | hosted on the world's largest platform for 143 | data science challenges called Kaggle. 144 | 145 | 31 146 | 00:01:43.460 --> 00:01:46.354 147 | Now, let's meet other lecturers and 148 | get started. 
-------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_1/1_2_meet_your_lecturers.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_1/1_2_meet_your_lecturers.srt -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_1/1_2_meet_your_lecturers.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_1/1_2_meet_your_lecturers.vtt: -------------------------------------------------------------------------------- 1 | WEBVTT 2 | 3 | 1 4 | 00:00:04.210 --> 00:00:09.390 5 | And now, I want to introduce other lecturers of this course. 6 | 7 | 2 8 | 00:00:09.390 --> 00:00:13.190 9 | Alexander, Dmitry, Mikhail, and Marios. 10 | 11 | 3 12 | 00:00:13.190 --> 00:00:15.520 13 | Mikhail is aka Cassanova, 14 | 15 | 4 16 | 00:00:15.520 --> 00:00:20.180 17 | the person who reached the very top of competitive data science. 18 | 19 | 5 20 | 00:00:20.180 --> 00:00:22.925 21 | I will tell you a couple of thoughts about the origins of the course. 22 | 23 | 6 24 | 00:00:22.925 --> 00:00:27.660 25 | In year 2014, we started our win in data science by joining competitions. 
26 | 27 | 7 28 | 00:00:27.660 --> 00:00:30.880 29 | We've been meeting every week and discussing the past competitions, solutions, 30 | 31 | 8 32 | 00:00:30.880 --> 00:00:33.885 33 | ideas and tweaks what worked and what did not, 34 | 35 | 9 36 | 00:00:33.885 --> 00:00:36.760 37 | this exchange of knowledge and experience helped us 38 | 39 | 10 40 | 00:00:36.760 --> 00:00:39.915 41 | to learn quickly from each other and improve our skills. 42 | 43 | 11 44 | 00:00:39.915 --> 00:00:41.680 45 | Initially our community was small, 46 | 47 | 12 48 | 00:00:41.680 --> 00:00:44.545 49 | but over time more and more people were joining. 50 | 51 | 13 52 | 00:00:44.545 --> 00:00:47.230 53 | From the format of groups of discussion. 54 | 55 | 14 56 | 00:00:47.230 --> 00:00:49.550 57 | We moved on to the format of well organized meetings. 58 | 59 | 15 60 | 00:00:49.550 --> 00:00:54.185 61 | Where a speaker makes an overview of his approach and ideas in front of 50 people. 62 | 63 | 16 64 | 00:00:54.185 --> 00:00:56.585 65 | These meetings are called machine learning trainings. 66 | 67 | 17 68 | 00:00:56.585 --> 00:01:00.110 69 | Now with the help and support of Yandex and get a hundred of participants. 70 | 71 | 18 72 | 00:01:00.110 --> 00:01:06.720 73 | Thus we started from zero and learned everything by hard work and collaboration. 74 | 75 | 19 76 | 00:01:06.720 --> 00:01:08.240 77 | We had an excellent teacher, 78 | 79 | 20 80 | 00:01:08.240 --> 00:01:11.010 81 | Alexander D'yakonov who was top one on Kaggle, 82 | 83 | 21 84 | 00:01:11.010 --> 00:01:13.870 85 | he took the course on critical data analysis. 86 | 87 | 22 88 | 00:01:13.870 --> 00:01:18.235 89 | In Moscow state university and there we're grateful to him. 90 | 91 | 23 92 | 00:01:18.235 --> 00:01:21.160 93 | At some point we started to share our knowledge with 94 | 95 | 24 96 | 00:01:21.160 --> 00:01:25.925 97 | other people and some of us even started to read lectures at the university. 
98 | 99 | 25 100 | 00:01:25.925 --> 00:01:31.630 101 | So now we have decided to summarize everything and make it available for everyone. 102 | 103 | 26 104 | 00:01:31.630 --> 00:01:35.835 105 | Together. We've finished and procesed in about 20 different competitions 106 | 107 | 27 108 | 00:01:35.835 --> 00:01:40.585 109 | only on Kaggle and just as many on other not so famous platforms. 110 | 111 | 28 112 | 00:01:40.585 --> 00:01:44.050 113 | All of us have a tremendous amount of skill and experience in 114 | 115 | 29 116 | 00:01:44.050 --> 00:01:48.250 117 | competitive data science and now we want to share this experience with you. 118 | 119 | 30 120 | 00:01:48.250 --> 00:01:49.500 121 | For all of us, 122 | 123 | 31 124 | 00:01:49.500 --> 00:01:52.555 125 | competitive data science opened a number of opportunities 126 | 127 | 32 128 | 00:01:52.555 --> 00:01:56.745 129 | as the competitions we took part were dedicated to a large variety of tasks. 130 | 131 | 33 132 | 00:01:56.745 --> 00:01:59.065 133 | Mikhail works in e-commerce. 134 | 135 | 34 136 | 00:01:59.065 --> 00:02:02.140 137 | Alexander builds predictive model for taxi services, 138 | 139 | 35 140 | 00:02:02.140 --> 00:02:04.180 141 | Dmitri works with financial data, 142 | 143 | 36 144 | 00:02:04.180 --> 00:02:08.725 145 | Mario develops machinery learning frameworks and I am a deep learning researcher. 146 | 147 | 37 148 | 00:02:08.725 --> 00:02:10.660 149 | Competitions, without a doubt, 150 | 151 | 38 152 | 00:02:10.660 --> 00:02:14.140 153 | became a stepping stone for our careers and believe me, 154 | 155 | 39 156 | 00:02:14.140 --> 00:02:18.040 157 | good comparative record will bring success to you as well. 158 | 159 | 40 160 | 00:02:18.040 --> 00:02:23.330 161 | We hope you will find something interesting in this course and wish you good luck. 
-------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_1/1_3_cource_overview.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_1/1_3_cource_overview.srt -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_1/1_3_cource_overview.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_1/2_1_competition_mechanics.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_1/2_1_competition_mechanics.srt -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_1/2_1_competition_mechanics.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: 
Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_1/2_2_kaggle_overview.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_1/2_2_kaggle_overview.srt -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_1/2_2_kaggle_overview.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_1/2_3_real_world.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_1/2_3_real_world.srt -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_1/2_3_real_world.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: 
Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_1/3_1_recap.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_1/3_1_recap.srt -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_1/3_1_recap.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_1/4_1_Software_Hardware_Requirements.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_1/4_1_Software_Hardware_Requirements.srt -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_1/4_1_Software_Hardware_Requirements.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | 
Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_1/4_1_Software_Hardware_Requirements.vtt: -------------------------------------------------------------------------------- 1 | WEBVTT 2 | 3 | 1 4 | 00:00:04.460 --> 00:00:05.670 5 | Hi, everyone. 6 | 7 | 2 8 | 00:00:05.670 --> 00:00:10.490 9 | In this video, I want to do an overview 10 | of hardware and software requirements. 11 | 12 | 3 13 | 00:00:10.490 --> 00:00:14.520 14 | You will know what is typical stuff for 15 | data science competitions. 16 | 17 | 4 18 | 00:00:14.520 --> 00:00:17.900 19 | I want to start from 20 | hardware related things. 21 | 22 | 5 23 | 00:00:17.900 --> 00:00:19.399 24 | Participating in competitions, 25 | 26 | 6 27 | 00:00:19.399 --> 00:00:22.890 28 | you generally don't need a lot 29 | of computation resources. 30 | 31 | 7 32 | 00:00:22.890 --> 00:00:23.890 33 | A lot of competitions, 34 | 35 | 8 36 | 00:00:23.890 --> 00:00:28.650 37 | except imaged based, 38 | have under several gigabytes of data. 39 | 40 | 9 41 | 00:00:28.650 --> 00:00:33.350 42 | It's not very huge and can be processed on 43 | a high level laptop with 16 gigabyte ram 44 | 45 | 10 46 | 00:00:33.350 --> 00:00:34.500 47 | and four physical cores. 48 | 49 | 11 50 | 00:00:35.600 --> 00:00:39.480 51 | Quite a good setup is a tower 52 | PC with 32 gigabyte of ram and 53 | 54 | 12 55 | 00:00:39.480 --> 00:00:42.570 56 | six physical cores, 57 | this is what I personally use. 58 | 59 | 13 60 | 00:00:43.630 --> 00:00:45.660 61 | You have a choice of hardware to use. 62 | 63 | 14 64 | 00:00:45.660 --> 00:00:48.910 65 | I suggest you to pay attention 66 | to the following things. 67 | 68 | 15 69 | 00:00:48.910 --> 00:00:52.190 70 | First is RAM, for this more is better. 
71 | 72 | 16 73 | 00:00:52.190 --> 00:00:56.150 74 | If you can keep your data in memory, 75 | your life will be much, much easier. 76 | 77 | 17 78 | 00:00:56.150 --> 00:00:59.600 79 | Personally, I found 64 80 | gigabytes is quite enough, but 81 | 82 | 18 83 | 00:00:59.600 --> 00:01:03.569 84 | some programmers prefer to have 85 | 128 gigabytes or even more. 86 | 87 | 19 88 | 00:01:04.618 --> 00:01:10.020 89 | Next are cores, the more core you have 90 | the more or faster experiments you can do. 91 | 92 | 20 93 | 00:01:10.020 --> 00:01:12.910 94 | I find it comfortable to 95 | work with fixed cores, but 96 | 97 | 21 98 | 00:01:12.910 --> 00:01:14.990 99 | sometimes even 32 are not enough. 100 | 101 | 22 102 | 00:01:16.270 --> 00:01:19.910 103 | Next thing to pay attention for 104 | is storage. 105 | 106 | 23 107 | 00:01:19.910 --> 00:01:23.570 108 | If you work with large datasets 109 | that don't fit into the memory, 110 | 111 | 24 112 | 00:01:23.570 --> 00:01:27.530 113 | it's crucial to have fast disk to read and 114 | write chunks of data. 115 | 116 | 25 117 | 00:01:27.530 --> 00:01:32.070 118 | SSD is especially important if you train 119 | narrowness or large number of images. 120 | 121 | 26 122 | 00:01:33.270 --> 00:01:35.660 123 | In case you really need 124 | computational resources. 125 | 126 | 27 127 | 00:01:35.660 --> 00:01:38.640 128 | For example, if you are part of team or 129 | 130 | 28 131 | 00:01:38.640 --> 00:01:43.260 132 | have a computational heavy approach, 133 | you can rent it on cloud platforms. 134 | 135 | 29 136 | 00:01:43.260 --> 00:01:47.530 137 | They offer machines with a lot of RAMs, 138 | cores, and GPUs. 139 | 140 | 30 141 | 00:01:47.530 --> 00:01:49.150 142 | There are several cloud providers, 143 | 144 | 31 145 | 00:01:49.150 --> 00:01:54.520 146 | most famous are Amazon AWS, 147 | Microsoft's Azure, and Google Cloud. 
148 | 149 | 32 150 | 00:01:54.520 --> 00:01:56.335 151 | Each one has its own pricing, so 152 | 153 | 33 154 | 00:01:56.335 --> 00:01:59.840 155 | we can choose which one best 156 | fits your needs and budget. 157 | 158 | 34 159 | 00:01:59.840 --> 00:02:04.150 160 | I especially want to draw your 161 | attention to AWS spot option. 162 | 163 | 35 164 | 00:02:04.150 --> 00:02:07.800 165 | Spot instances enable you 166 | to be able to use instance, 167 | 168 | 36 169 | 00:02:07.800 --> 00:02:09.400 170 | which can lower your cost significantly. 171 | 172 | 37 173 | 00:02:09.400 --> 00:02:13.590 174 | The higher your price for 175 | spot instance is set by Amazon and 176 | 177 | 38 178 | 00:02:13.590 --> 00:02:18.090 179 | fluctuates depending on supply and 180 | demand for spot instances. 181 | 182 | 39 183 | 00:02:18.090 --> 00:02:22.630 184 | Your spot instance run whenever you 185 | bid exceeds the current market price. 186 | 187 | 40 188 | 00:02:22.630 --> 00:02:25.450 189 | Generally, it's much 190 | cheaper than other options. 191 | 192 | 41 193 | 00:02:25.450 --> 00:02:29.640 194 | But you always have risk that your bid 195 | will get under current market price, and 196 | 197 | 42 198 | 00:02:29.640 --> 00:02:30.820 199 | your source will be terminated. 200 | 201 | 43 202 | 00:02:31.840 --> 00:02:33.450 203 | Tutorials about how to setup and 204 | 205 | 44 206 | 00:02:33.450 --> 00:02:36.500 207 | configure cloud resources you may 208 | find in additional materials. 209 | 210 | 45 211 | 00:02:37.500 --> 00:02:39.948 212 | Another important thing I 213 | want to discuss is software. 214 | 215 | 46 216 | 00:02:39.948 --> 00:02:44.260 217 | Usually, rules in competitions 218 | prohibit to use commercial software, 219 | 220 | 47 221 | 00:02:44.260 --> 00:02:47.910 222 | since it requires to buy 223 | a license to reproduce results. 224 | 225 | 48 226 | 00:02:47.910 --> 00:02:50.770 227 | Some competitors prefer 228 | R as basic language. 
229 | 230 | 49 231 | 00:02:50.770 --> 00:02:53.960 232 | But we will describe Python's tech 233 | as more common and more general. 234 | 235 | 50 236 | 00:02:55.290 --> 00:02:58.310 237 | Python is quite a good language for 238 | fast prototyping. 239 | 240 | 51 241 | 00:02:58.310 --> 00:03:02.090 242 | It has a huge amount of high quality and 243 | open source libraries. 244 | 245 | 52 246 | 00:03:02.090 --> 00:03:03.850 247 | And I want to reuse several of them. 248 | 249 | 53 250 | 00:03:05.060 --> 00:03:07.430 251 | Let's start with NumPy. 252 | 253 | 54 254 | 00:03:07.430 --> 00:03:11.210 255 | It's a linear algebra library 256 | to work with dimensional arrays, 257 | 258 | 55 259 | 00:03:11.210 --> 00:03:15.380 260 | which contains useful linear algebra 261 | routines and random number capabilities. 262 | 263 | 56 264 | 00:03:16.550 --> 00:03:20.660 265 | Pandas is a library providing fast, 266 | flexible, and expressive way to work with 267 | 268 | 57 269 | 00:03:20.660 --> 00:03:24.520 270 | a relational or table of data, 271 | both easily and intuitive. 272 | 273 | 58 274 | 00:03:24.520 --> 00:03:27.585 275 | It allows you to process your 276 | data in a way similar to SQL. 277 | 278 | 59 279 | 00:03:27.585 --> 00:03:32.190 280 | Scikit-learn is a library of classic 281 | machine learning algorithms. 282 | 283 | 60 284 | 00:03:32.190 --> 00:03:36.320 285 | It features various classification, 286 | regression, and clustering algorithms, 287 | 288 | 61 289 | 00:03:36.320 --> 00:03:40.750 290 | including support vector machines, 291 | random forests, and a lot more. 292 | 293 | 62 294 | 00:03:41.950 --> 00:03:44.030 295 | Matplotlib is a plotting library. 296 | 297 | 63 298 | 00:03:44.030 --> 00:03:47.070 299 | It allows you to do 300 | a variety of visualization, 301 | 302 | 64 303 | 00:03:47.070 --> 00:03:50.980 304 | like line plots, histograms, 305 | scatter plots and a lot more. 
306 | 307 | 65 308 | 00:03:52.050 --> 00:03:56.460 309 | As IDE, I suggest you to use 310 | IPython with Jupyter notebooks, 311 | 312 | 66 313 | 00:03:56.460 --> 00:04:00.190 314 | since they allow you to work 315 | interactively and remotely. 316 | 317 | 67 318 | 00:04:00.190 --> 00:04:03.390 319 | The last property is especially 320 | useful if you use cloud resources. 321 | 322 | 68 323 | 00:04:04.490 --> 00:04:08.380 324 | Additional packages contain 325 | implementation of more specific tools. 326 | 327 | 69 328 | 00:04:08.380 --> 00:04:11.685 329 | Usually, single packages 330 | implement single algorithm. 331 | 332 | 70 333 | 00:04:11.685 --> 00:04:15.900 334 | XGBoost and LightGBM packages implement 335 | gradient-boosted decision trees 336 | 337 | 71 338 | 00:04:15.900 --> 00:04:18.320 339 | in a very efficient and optimized way. 340 | 341 | 72 342 | 00:04:18.320 --> 00:04:20.230 343 | You definitely should 344 | know about such tools. 345 | 346 | 73 347 | 00:04:21.370 --> 00:04:25.100 348 | Keras is a user-friendly framework for 349 | neural nets. 350 | 351 | 74 352 | 00:04:25.100 --> 00:04:28.000 353 | This new package is an efficient 354 | implementation of this new projection 355 | 356 | 75 357 | 00:04:28.000 --> 00:04:29.990 358 | method which we will 359 | discuss in our course. 360 | 361 | 76 362 | 00:04:31.050 --> 00:04:34.890 363 | Also, I want to say a few words about 364 | external tools which usually don't have 365 | 366 | 77 367 | 00:04:34.890 --> 00:04:38.670 368 | any connection despite, but 369 | still very used for computations. 370 | 371 | 78 372 | 00:04:38.670 --> 00:04:41.120 373 | One such tool is Vowpal Wabbit. 374 | 375 | 79 376 | 00:04:41.120 --> 00:04:44.020 377 | It is a tool designed to 378 | provide blazing speed and 379 | 380 | 80 381 | 00:04:44.020 --> 00:04:48.060 382 | handle really large data sets, 383 | which don't fit into memory. 
384 | 385 | 81 386 | 00:04:48.060 --> 00:04:52.860 387 | Libfm and libffm implement different 388 | types of factorization machines, and 389 | 390 | 82 391 | 00:04:52.860 --> 00:04:57.810 392 | often used for sparse data like 393 | click-through rate prediction. 394 | 395 | 83 396 | 00:04:57.810 --> 00:05:02.910 397 | Rgf is an alternative base method, 398 | which I suggest you to use in ensembles. 399 | 400 | 84 401 | 00:05:02.910 --> 00:05:05.220 402 | You can install these packages one by one. 403 | 404 | 85 405 | 00:05:05.220 --> 00:05:07.250 406 | But as alternative, you can use a Python 407 | 408 | 86 409 | 00:05:07.250 --> 00:05:11.230 410 | distribution like Anaconda, which already 411 | contains a lot of mentioned packages. 412 | 413 | 87 414 | 00:05:12.260 --> 00:05:13.927 415 | And then, through this video, 416 | 417 | 88 418 | 00:05:13.927 --> 00:05:17.953 419 | I want to emphasize the proposed setup 420 | is the most common but not the only one. 421 | 422 | 89 423 | 00:05:17.953 --> 00:05:22.799 424 | Don't overestimate the role of hardware 425 | and software, since they are just tools. 426 | 427 | 90 428 | 00:05:22.799 --> 00:05:24.964 429 | Thank you for your attention. 
430 | 431 | 91 432 | 00:05:24.964 --> 00:05:34.964 433 | [MUSIC] -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_1/5_1_overview.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_1/5_1_overview.srt -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_1/5_1_overview.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_1/5_2_numeric_feature.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_1/5_2_numeric_feature.srt -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_1/5_2_numeric_feature.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: 
Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_1/5_3_categorical_and_ordinal.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_1/5_3_categorical_and_ordinal.srt -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_1/5_3_categorical_and_ordinal.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_1/5_4_datetime_and_coordinates.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_1/5_4_datetime_and_coordinates.srt -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_1/5_4_datetime_and_coordinates.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, 
MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_1/5_5_handling_missing_values.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_1/5_5_handling_missing_values.srt -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_1/5_5_handling_missing_values.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_1/6_1_bag_of_words.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_1/6_1_bag_of_words.srt -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_1/6_1_bag_of_words.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, 
Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_1/6_2_word2vec.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_1/6_2_word2vec.srt -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_1/6_2_word2vec.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_1/7_1_final_project_overview.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_1/7_1_final_project_overview.srt -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_1/7_1_final_project_overview.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, 
Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_1/7_1_final_project_overview.vtt: -------------------------------------------------------------------------------- 1 | WEBVTT 2 | 3 | 1 4 | 00:00:02.420 --> 00:00:05.126 5 | Hello, everyone. In this video, 6 | 7 | 2 8 | 00:00:05.126 --> 00:00:08.550 9 | we will talk a little bit about the main assignment of this course, 10 | 11 | 3 12 | 00:00:08.550 --> 00:00:11.880 13 | the competition, which plays the role of the final project. 14 | 15 | 4 16 | 00:00:11.880 --> 00:00:14.800 17 | Now, let's briefly discuss the data. 18 | 19 | 5 20 | 00:00:14.800 --> 00:00:18.860 21 | For more details, see the competition web page on Kaggle. 22 | 23 | 6 24 | 00:00:18.860 --> 00:00:21.880 25 | The data in this competition is quite challenging. 26 | 27 | 7 28 | 00:00:21.880 --> 00:00:26.400 29 | You can work with a time series data set consisting of daily sales data, 30 | 31 | 8 32 | 00:00:26.400 --> 00:00:30.153 33 | kindly provided by one of the largest Russian software company. 34 | 35 | 9 36 | 00:00:30.153 --> 00:00:31.975 37 | It's called 1C. 38 | 39 | 10 40 | 00:00:31.975 --> 00:00:35.860 41 | The training data consists of records with information that 42 | 43 | 11 44 | 00:00:35.860 --> 00:00:39.550 45 | a particular item had been sold in a particular shop, 46 | 47 | 12 48 | 00:00:39.550 --> 00:00:42.560 49 | in a particular day, in the training period. 50 | 51 | 13 52 | 00:00:42.560 --> 00:00:48.630 53 | The task is to forecast the sales for every item in every shop in the testing period. 54 | 55 | 14 56 | 00:00:48.630 --> 00:00:51.952 57 | There are about 6 million such records in the training set, 58 | 59 | 15 60 | 00:00:51.952 --> 00:00:57.430 61 | collected over 30 shops selling 20,000 unique items. 
62 | 63 | 16 64 | 00:00:57.430 --> 00:00:59.770 65 | But don't be afraid of these numbers. 66 | 67 | 17 68 | 00:00:59.770 --> 00:01:03.580 69 | This is the moderate-sized competition data set nowadays. 70 | 71 | 18 72 | 00:01:03.580 --> 00:01:07.150 73 | The training period is about one and a half year, 74 | 75 | 19 76 | 00:01:07.150 --> 00:01:11.515 77 | and the testing period is the month that falls on training period. 78 | 79 | 20 80 | 00:01:11.515 --> 00:01:14.500 81 | Note that you provide these daily sales in training period. 82 | 83 | 21 84 | 00:01:14.500 --> 00:01:19.370 85 | Well, you need to predict aggregated sales for testing period. 86 | 87 | 22 88 | 00:01:19.370 --> 00:01:24.055 89 | That is, you need to predict monthly sales for every possible shop item pair. 90 | 91 | 23 92 | 00:01:24.055 --> 00:01:27.382 93 | In fact, correct aggregation of 94 | 95 | 24 96 | 00:01:27.382 --> 00:01:32.880 97 | overall daily sales and generation of appropriate features is a part of this challenge. 98 | 99 | 25 100 | 00:01:32.880 --> 00:01:35.632 101 | As in the majority of competitions, 102 | 103 | 26 104 | 00:01:35.632 --> 00:01:38.945 105 | that this data is split into public and private parts. 106 | 107 | 27 108 | 00:01:38.945 --> 00:01:42.975 109 | You can submit your test predictions up to five times every day on 110 | 111 | 28 112 | 00:01:42.975 --> 00:01:45.790 113 | Kaggle platform and up to five times every 114 | 115 | 29 116 | 00:01:45.790 --> 00:01:49.105 117 | week to Coursera's programming assignment grader. 118 | 119 | 30 120 | 00:01:49.105 --> 00:01:54.885 121 | Kaggle will evaluate the quality of your predictions on the public part of test set, 122 | 123 | 31 124 | 00:01:54.885 --> 00:01:57.825 125 | while Coursera's grader will report quality, 126 | 127 | 32 128 | 00:01:57.825 --> 00:02:00.730 129 | both in public and private parts. 130 | 131 | 33 132 | 00:02:00.730 --> 00:02:04.390 133 | That is, you can rarely peek at your private score. 
134 | 135 | 34 136 | 00:02:04.390 --> 00:02:08.295 137 | Remember, the earlier you start working on the competition, 138 | 139 | 35 140 | 00:02:08.295 --> 00:02:11.500 141 | the more private score feedback you can get. 142 | 143 | 36 144 | 00:02:11.500 --> 00:02:13.915 145 | We encourage you to get familiar with the data 146 | 147 | 37 148 | 00:02:13.915 --> 00:02:17.105 149 | right away and not to wait until the very end. 150 | 151 | 38 152 | 00:02:17.105 --> 00:02:22.160 153 | Start simple and then improve your solution every week. 154 | 155 | 39 156 | 00:02:22.160 --> 00:02:26.830 157 | Remember, your final grades will depend on how would you have performed on 158 | 159 | 40 160 | 00:02:26.830 --> 00:02:32.135 161 | the private part of the leaderboard and on the quality of your solution report, 162 | 163 | 41 164 | 00:02:32.135 --> 00:02:34.550 165 | which will be graded by your peers. 166 | 167 | 42 168 | 00:02:34.550 --> 00:02:40.050 169 | You can read more about this in the reading material in the end of this week. 170 | 171 | 43 172 | 00:02:40.050 --> 00:02:45.290 173 | And, finally, the goal of the competition is to learn as much as possible, 174 | 175 | 44 176 | 00:02:45.290 --> 00:02:48.370 177 | so we strongly encourage you to participate in teams. 178 | 179 | 45 180 | 00:02:48.370 --> 00:02:50.740 181 | It is always fun and engaging. 182 | 183 | 46 184 | 00:02:50.740 --> 00:02:54.005 185 | In teams, you can discuss ideas and get feedback. 186 | 187 | 47 188 | 00:02:54.005 --> 00:02:56.842 189 | You can share a code and learn new tricks, 190 | 191 | 48 192 | 00:02:56.842 --> 00:02:59.380 193 | and you can get help if you're stuck. 194 | 195 | 49 196 | 00:02:59.380 --> 00:03:01.523 197 | If you don't have any teammates yet, 198 | 199 | 50 200 | 00:03:01.523 --> 00:03:04.845 201 | you can find them and meet them on forums. 
202 | 203 | 51 204 | 00:03:04.845 --> 00:03:09.030 205 | Please never, never share your code on forums, 206 | 207 | 52 208 | 00:03:09.030 --> 00:03:11.240 209 | neither on Coursera forums, 210 | 211 | 53 212 | 00:03:11.240 --> 00:03:13.195 213 | nor on Kaggle's forums. 214 | 215 | 54 216 | 00:03:13.195 --> 00:03:16.810 217 | Sharing codes outside of the teams is strictly forbidden. 218 | 219 | 55 220 | 00:03:16.810 --> 00:03:19.925 221 | You are encouraged to share and discuss interesting ideas, 222 | 223 | 56 224 | 00:03:19.925 --> 00:03:23.750 225 | thoughts, even small quote snippets held by the learners, 226 | 227 | 57 228 | 00:03:23.750 --> 00:03:27.950 229 | but do not even share the complete code for your solution 230 | 231 | 58 232 | 00:03:27.950 --> 00:03:30.560 233 | because many people will blindly copy 234 | 235 | 59 236 | 00:03:30.560 --> 00:03:33.930 237 | paste your code without even trying to understand it. 238 | 239 | 60 240 | 00:03:33.930 --> 00:03:38.960 241 | It will reduce quality of skills acquired by fellow students, 242 | 243 | 61 244 | 00:03:38.960 --> 00:03:41.255 245 | it will ruin the fun of the fair competition. 246 | 247 | 62 248 | 00:03:41.255 --> 00:03:44.175 249 | On the other hand, every time you're stuck, 250 | 251 | 63 252 | 00:03:44.175 --> 00:03:48.335 253 | go in forums, and you will definitely find some inspiration there. 254 | 255 | 64 256 | 00:03:48.335 --> 00:03:53.560 257 | Good luck with the project and have fun. 
-------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_1/ここにmp4をおいて再生してね: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_1/ここにmp4をおいて再生してね -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_2/1_1_Exploratory_data analysis.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_2/1_1_Exploratory_data analysis.srt -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_2/1_1_Exploratory_data analysis.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_2/1_2_Building_intuition_about_the_data.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_2/1_2_Building_intuition_about_the_data.srt -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_2/1_2_Building_intuition_about_the_data.srt.style: 
-------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_2/1_3_Exploring_anonymized_data.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_2/1_3_Exploring_anonymized_data.srt -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_2/1_3_Exploring_anonymized_data.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_2/1_4_Visualizations.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_2/1_4_Visualizations.srt -------------------------------------------------------------------------------- 
/cousera/howtowin_kaggle/week_2/1_4_Visualizations.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_2/1_5_Dataset_cleaning_and_other_things_to_check.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_2/1_5_Dataset_cleaning_and_other_things_to_check.srt -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_2/1_5_Dataset_cleaning_and_other_things_to_check.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_2/1_6_Springleaf_competition_EDA_I.srt: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_2/1_6_Springleaf_competition_EDA_I.srt -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_2/1_6_Springleaf_competition_EDA_I.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_2/1_7_Springleaf_competition_EDA_II.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_2/1_7_Springleaf_competition_EDA_II.srt -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_2/1_7_Springleaf_competition_EDA_II.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_2/1_8_Numerai_competition_EDA.srt: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_2/1_8_Numerai_competition_EDA.srt -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_2/1_8_Numerai_competition_EDA.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_2/1_8_Numerai_competition_EDA.vtt: -------------------------------------------------------------------------------- 1 | WEBVTT 2 | 3 | 1 4 | 00:00:02.790 --> 00:00:06.285 5 | Hi, everyone. In this video, 6 | 7 | 2 8 | 00:00:06.285 --> 00:00:08.290 9 | I will tell you about the specifics of 10 | 11 | 3 12 | 00:00:08.290 --> 00:00:12.731 13 | Numerai Competition that was held throughout year 2016. 14 | 15 | 4 16 | 00:00:12.731 --> 00:00:17.980 17 | Note that Numerai organizers changed the format in 2017. 18 | 19 | 5 20 | 00:00:17.980 --> 00:00:22.335 21 | So, the findings I'm going to read will not work on new data. 22 | 23 | 6 24 | 00:00:22.335 --> 00:00:24.860 25 | Let's state the problem. 26 | 27 | 7 28 | 00:00:24.860 --> 00:00:28.250 29 | Participants were solving a binary classification task on 30 | 31 | 8 32 | 00:00:28.250 --> 00:00:32.320 33 | a data set with 21 anonymized numeric features. 
34 | 35 | 9 36 | 00:00:32.320 --> 00:00:38.305 37 | Unusual part is that both train and test data sets have been updating every week. 38 | 39 | 10 40 | 00:00:38.305 --> 00:00:41.660 41 | Data sets were also shuffled column-wise. 42 | 43 | 11 44 | 00:00:41.660 --> 00:00:44.215 45 | So it was like a new task every week. 46 | 47 | 12 48 | 00:00:44.215 --> 00:00:47.405 49 | Pretty challenging. As it turned out, 50 | 51 | 13 52 | 00:00:47.405 --> 00:00:50.210 53 | this competition had a data leak. 54 | 55 | 14 56 | 00:00:50.210 --> 00:00:55.320 57 | Organizers did not disclose any information about the nature of data set. 58 | 59 | 15 60 | 00:00:55.320 --> 00:00:59.120 61 | But allegedly, it was some time series data with target 62 | 63 | 16 64 | 00:00:59.120 --> 00:01:03.770 65 | variable highly dependent on transitions between time points. 66 | 67 | 17 68 | 00:01:03.770 --> 00:01:07.910 69 | Think of something like predicting price change in stock market here. 70 | 71 | 18 72 | 00:01:07.910 --> 00:01:13.165 73 | Means that, if we knew true order or had timestamp variable, 74 | 75 | 19 76 | 00:01:13.165 --> 00:01:15.890 77 | we could easily get nearly perfect score. 78 | 79 | 20 80 | 00:01:15.890 --> 00:01:20.140 81 | And therefore, we had to somehow reconstruct this order. 82 | 83 | 21 84 | 00:01:20.140 --> 00:01:21.805 85 | Of course, approximately. 86 | 87 | 22 88 | 00:01:21.805 --> 00:01:27.440 89 | But even a rough approximation was giving a huge advantage over other participants. 90 | 91 | 23 92 | 00:01:27.440 --> 00:01:30.725 93 | The first and most important step is to find 94 | 95 | 24 96 | 00:01:30.725 --> 00:01:33.995 97 | a nearest neighbor for every point in a data set, 98 | 99 | 25 100 | 00:01:33.995 --> 00:01:39.230 101 | and add all 21 features from that neighbor to original point. 
102 | 103 | 26 104 | 00:01:39.230 --> 00:01:43.160 105 | Simple logistic regression of those 42 features, 106 | 107 | 27 108 | 00:01:43.160 --> 00:01:46.610 109 | 21 from original, and 21 from neighboring points, 110 | 111 | 28 112 | 00:01:46.610 --> 00:01:50.285 113 | allowed to get into top 10 on the leader board. 114 | 115 | 29 116 | 00:01:50.285 --> 00:01:54.945 117 | Of course, we can get better scores with some Hardcore EDA. 118 | 119 | 30 120 | 00:01:54.945 --> 00:01:59.500 121 | Let's start exploring correlation metrics of new 21 features. 122 | 123 | 31 124 | 00:01:59.500 --> 00:02:03.943 125 | If group features with highest correlation coefficient next to each other, 126 | 127 | 32 128 | 00:02:03.943 --> 00:02:06.735 129 | we'll get a right picture. 130 | 131 | 33 132 | 00:02:06.735 --> 00:02:10.340 133 | This picture can help us in two different ways. 134 | 135 | 34 136 | 00:02:10.340 --> 00:02:13.810 137 | First, we can actually fix some column order. 138 | 139 | 35 140 | 00:02:13.810 --> 00:02:17.735 141 | So, weekly column shuffling won't affect our models. 142 | 143 | 36 144 | 00:02:17.735 --> 00:02:20.480 145 | And second, we can clearly notice 146 | 147 | 37 148 | 00:02:20.480 --> 00:02:25.115 149 | seven groups with three highly correlated features in each of them. 150 | 151 | 38 152 | 00:02:25.115 --> 00:02:29.600 153 | So, the data actually has some non-trivial structure. 154 | 155 | 39 156 | 00:02:29.600 --> 00:02:35.615 157 | Now, let's remember that we get new data sets every week. What is more? 158 | 159 | 40 160 | 00:02:35.615 --> 00:02:40.110 161 | Each week, train data sets have the same number of points. 162 | 163 | 41 164 | 00:02:40.110 --> 00:02:45.170 165 | We can assume that there is some connection between consecutive data sets. 166 | 167 | 42 168 | 00:02:45.170 --> 00:02:49.360 169 | This is a little strange because we already have a time series. 
170 | 171 | 43 172 | 00:02:49.360 --> 00:02:53.200 173 | So, what's the connection between the data from different weeks? 174 | 175 | 44 176 | 00:02:53.200 --> 00:02:56.480 177 | Well, if we find nearest neighbors from 178 | 179 | 45 180 | 00:02:56.480 --> 00:03:00.065 181 | every point in current data set from previous data set, 182 | 183 | 46 184 | 00:03:00.065 --> 00:03:02.195 185 | and plot distance distributions, 186 | 187 | 47 188 | 00:03:02.195 --> 00:03:04.910 189 | we can notice that first neighbor is much, 190 | 191 | 48 192 | 00:03:04.910 --> 00:03:07.370 193 | much closer than the second. 194 | 195 | 49 196 | 00:03:07.370 --> 00:03:11.585 197 | So, we indeed have some connection between consecutive data sets. 198 | 199 | 50 200 | 00:03:11.585 --> 00:03:16.000 201 | And it looks like we can build a bijective mapping between them. 202 | 203 | 51 204 | 00:03:16.000 --> 00:03:21.470 205 | But let's not quickly jump into conclusions and do more exploration. 206 | 207 | 52 208 | 00:03:21.470 --> 00:03:25.650 209 | Okay. We found a nearest neighbor in previous data set. 210 | 211 | 53 212 | 00:03:25.650 --> 00:03:28.070 213 | What if we examine the distances between 214 | 215 | 54 216 | 00:03:28.070 --> 00:03:32.793 217 | the neighboring objects at the level of individual features? 218 | 219 | 55 220 | 00:03:32.793 --> 00:03:36.735 221 | We clearly have three different groups of seven features. 222 | 223 | 56 224 | 00:03:36.735 --> 00:03:40.090 225 | Now remember, the sorted correlation matrix? 226 | 227 | 57 228 | 00:03:40.090 --> 00:03:46.470 229 | It turns out that each of three highly correlated features belong to a different group. 230 | 231 | 58 232 | 00:03:46.470 --> 00:03:48.140 233 | A perfect match. 
234 | 235 | 59 236 | 00:03:48.140 --> 00:03:52.245 237 | And if we multiply seven features from the first group by three, 238 | 239 | 60 240 | 00:03:52.245 --> 00:03:56.565 241 | and seven features from the second group by two in the original data set, 242 | 243 | 61 244 | 00:03:56.565 --> 00:04:01.500 245 | recalculate nearest neighbor-based features within the data sets, 246 | 247 | 62 248 | 00:04:01.500 --> 00:04:03.165 249 | and re-train our models, 250 | 251 | 63 252 | 00:04:03.165 --> 00:04:06.020 253 | we'll get a nice improvement. 254 | 255 | 64 256 | 00:04:06.020 --> 00:04:09.650 257 | So, after this magic multiplications, of course, 258 | 259 | 65 260 | 00:04:09.650 --> 00:04:11.445 261 | I'd tried other constants, 262 | 263 | 66 264 | 00:04:11.445 --> 00:04:15.450 265 | our true order approximation became a little better. 266 | 267 | 67 268 | 00:04:15.450 --> 00:04:20.840 269 | Great. Now, let's move to the true relation. 270 | 271 | 68 272 | 00:04:20.840 --> 00:04:23.835 273 | New data, weekly updates, 274 | 275 | 69 276 | 00:04:23.835 --> 00:04:25.955 277 | all of it was a lie. 278 | 279 | 70 280 | 00:04:25.955 --> 00:04:31.290 281 | Remember, how we were calculating neighbors between consecutive data sets? 282 | 283 | 71 284 | 00:04:31.290 --> 00:04:33.685 285 | Well, we can forget about consecutiveness. 286 | 287 | 72 288 | 00:04:33.685 --> 00:04:36.750 289 | Calculate neighbors between current data set, 290 | 291 | 73 292 | 00:04:36.750 --> 00:04:40.550 293 | and the data set from two weeks ago or two months ago. 294 | 295 | 74 296 | 00:04:40.550 --> 00:04:45.350 297 | No matter what, we will be getting pretty much the same distances. 298 | 299 | 75 300 | 00:04:45.350 --> 00:04:51.535 301 | Why? The simplest answer is that the data actually didn't change. 302 | 303 | 76 304 | 00:04:51.535 --> 00:04:54.505 305 | And every week, we were getting the same data, 306 | 307 | 77 308 | 00:04:54.505 --> 00:04:56.275 309 | plus a little bit of noise. 
310 | 311 | 78 312 | 00:04:56.275 --> 00:05:00.750 313 | And thus, we could find nearest neighbor in each of previous data sets, 314 | 315 | 79 316 | 00:05:00.750 --> 00:05:02.305 317 | and average them all, 318 | 319 | 80 320 | 00:05:02.305 --> 00:05:05.770 321 | successfully reducing the variance of added noise. 322 | 323 | 81 324 | 00:05:05.770 --> 00:05:10.720 325 | After averaging, true order approximation became even better. 326 | 327 | 82 328 | 00:05:10.720 --> 00:05:16.115 329 | I have to say that a little bit of test data actually did change from time to time. 330 | 331 | 83 332 | 00:05:16.115 --> 00:05:20.765 333 | But nonetheless, most of the rows migrated from week to week. 334 | 335 | 84 336 | 00:05:20.765 --> 00:05:23.320 337 | Because of that, it was possible to probe 338 | 339 | 85 340 | 00:05:23.320 --> 00:05:26.395 341 | the whole public leader board which helped even further, 342 | 343 | 86 344 | 00:05:26.395 --> 00:05:28.150 345 | and so on, and so on. 346 | 347 | 87 348 | 00:05:28.150 --> 00:05:31.495 349 | Of course, there are more details regarding that competition, 350 | 351 | 88 352 | 00:05:31.495 --> 00:05:33.715 353 | but they aren't very interesting. 354 | 355 | 89 356 | 00:05:33.715 --> 00:05:37.745 357 | I wanted to focus on the process of reverse engineering. 358 | 359 | 90 360 | 00:05:37.745 --> 00:05:41.875 361 | Anyway, I hope you like this kind of detective story 362 | 363 | 91 364 | 00:05:41.875 --> 00:05:46.880 365 | and realize how important exploratory data analysis could be. 366 | 367 | 92 368 | 00:05:46.880 --> 00:05:51.710 369 | Thank you for your attention and always pay respect to EDA. 
-------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_2/2_1_Validation and overfitting.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_2/2_1_Validation and overfitting.srt -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_2/2_1_Validation and overfitting.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_2/2_2_Validation strategies.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_2/2_2_Validation strategies.srt -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_2/2_2_Validation strategies.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: 
Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_2/2_3_Data splitting strategies.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_2/2_3_Data splitting strategies.srt -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_2/2_3_Data splitting strategies.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_2/2_4_Problems occurring during validation.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_2/2_4_Problems occurring during validation.srt -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_2/2_4_Problems occurring during validation.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, 
BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_2/3_1_Basic data leaks.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_2/3_1_Basic data leaks.srt -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_2/3_1_Basic data leaks.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_2/3_1_Basic data leaks.vtt: -------------------------------------------------------------------------------- 1 | WEBVTT 2 | 3 | 1 4 | 00:00:03.640 --> 00:00:06.470 5 | Hi everyone. In this section, 6 | 7 | 2 8 | 00:00:06.470 --> 00:00:12.570 9 | we will talk about a very sensitive topic data leakage or more simply, leaks. 10 | 11 | 3 12 | 00:00:12.570 --> 00:00:16.870 13 | We'll define leakage in a very general sense as 14 | 15 | 4 16 | 00:00:16.870 --> 00:00:19.450 17 | an unexpected information in the data that 18 | 19 | 5 20 | 00:00:19.450 --> 00:00:22.615 21 | allows us to make unrealistically good predictions. 
22 | 23 | 6 24 | 00:00:22.615 --> 00:00:24.080 25 | For the time being, 26 | 27 | 7 28 | 00:00:24.080 --> 00:00:26.485 29 | you may think of it as directly or 30 | 31 | 8 32 | 00:00:26.485 --> 00:00:31.490 33 | indirectly adding ground truths into the test data. 34 | 35 | 9 36 | 00:00:31.490 --> 00:00:33.765 37 | Data leaks are very, very bad. 38 | 39 | 10 40 | 00:00:33.765 --> 00:00:36.835 41 | They are completely unusable in real world. 42 | 43 | 11 44 | 00:00:36.835 --> 00:00:43.245 45 | They usually provide way too much signal and thus make competitions lose their main point, 46 | 47 | 12 48 | 00:00:43.245 --> 00:00:47.560 49 | and quickly turn them into a leak hunt instead. 50 | 51 | 13 52 | 00:00:47.560 --> 00:00:50.915 53 | People often are very sensitive about this matter. 54 | 55 | 14 56 | 00:00:50.915 --> 00:00:52.960 57 | They tend to overreact. 58 | 59 | 15 60 | 00:00:52.960 --> 00:00:54.755 61 | That's completely understandable. 62 | 63 | 16 64 | 00:00:54.755 --> 00:00:57.825 65 | After spending a lot of time on solving the problem, 66 | 67 | 17 68 | 00:00:57.825 --> 00:01:01.845 69 | a sudden data leak may render all of that useless. 70 | 71 | 18 72 | 00:01:01.845 --> 00:01:04.640 73 | It is not a pleasant position to be in. 74 | 75 | 19 76 | 00:01:04.640 --> 00:01:09.100 77 | I cannot force you to turn a blind eye but keep in mind, 78 | 79 | 20 80 | 00:01:09.100 --> 00:01:13.030 81 | there is no ill intent whatsoever. 82 | 83 | 21 84 | 00:01:13.030 --> 00:01:17.515 85 | Data leaks are the result of unintentional errors, accidents. 86 | 87 | 22 88 | 00:01:17.515 --> 00:01:19.270 89 | Even if you find yourself in 90 | 91 | 23 92 | 00:01:19.270 --> 00:01:23.770 93 | a competition with an unexpected data leak close to the deadline, 94 | 95 | 24 96 | 00:01:23.770 --> 00:01:26.520 97 | please be more tolerant. 
98 | 99 | 25 100 | 00:01:26.520 --> 00:01:29.295 101 | The question of whether to exploit the data leak 102 | 103 | 26 104 | 00:01:29.295 --> 00:01:33.055 105 | or not is exclusive to machine learning competitions. 106 | 107 | 27 108 | 00:01:33.055 --> 00:01:37.875 109 | In real world, the answer is obviously a no, nothing to discuss. 110 | 111 | 28 112 | 00:01:37.875 --> 00:01:39.400 113 | But in a competition, 114 | 115 | 29 116 | 00:01:39.400 --> 00:01:43.325 117 | the ultimate goal is to get a higher leaderboard position. 118 | 119 | 30 120 | 00:01:43.325 --> 00:01:45.355 121 | And if you truly pursue that goal, 122 | 123 | 31 124 | 00:01:45.355 --> 00:01:49.045 125 | then exploit the leak in every way possible. 126 | 127 | 32 128 | 00:01:49.045 --> 00:01:50.440 129 | Further in this section, 130 | 131 | 33 132 | 00:01:50.440 --> 00:01:53.285 133 | I will show you the main types of data leaks 134 | 135 | 34 136 | 00:01:53.285 --> 00:01:56.790 137 | that could appear during solving a machine learning problem. 138 | 139 | 35 140 | 00:01:56.790 --> 00:02:03.550 141 | Also focus on a competition specific leak exploitation technique leaderboard probing. 142 | 143 | 36 144 | 00:02:03.550 --> 00:02:06.190 145 | Finally, you will find special videos 146 | 147 | 37 148 | 00:02:06.190 --> 00:02:11.040 149 | dedicated to the most interesting and non-trivial data leaks. 150 | 151 | 38 152 | 00:02:11.040 --> 00:02:17.910 153 | I will start with the most typical data leaks that may occur in almost every problem. 154 | 155 | 39 156 | 00:02:17.910 --> 00:02:20.125 157 | Time series is our first target. 158 | 159 | 40 160 | 00:02:20.125 --> 00:02:23.015 161 | Typically, future peeking. 162 | 163 | 41 164 | 00:02:23.015 --> 00:02:26.780 165 | It is common sense not to peek into the future like, 166 | 167 | 42 168 | 00:02:26.780 --> 00:02:32.570 169 | can we use stock market's price from day after tomorrow to predict price for tomorrow? 
170 | 171 | 43 172 | 00:02:32.570 --> 00:02:36.215 173 | Of course not. However, direct usage of 174 | 175 | 44 176 | 00:02:36.215 --> 00:02:41.240 177 | future information in incorrect time splits still exist. 178 | 179 | 45 180 | 00:02:41.240 --> 00:02:44.830 181 | When you enter a time series competition at first, 182 | 183 | 46 184 | 00:02:44.830 --> 00:02:48.005 185 | check train, public, and private splits. 186 | 187 | 47 188 | 00:02:48.005 --> 00:02:50.630 189 | If even one of them is not on time, 190 | 191 | 48 192 | 00:02:50.630 --> 00:02:53.105 193 | then you found a data leak. 194 | 195 | 49 196 | 00:02:53.105 --> 00:03:01.165 197 | In such case, unrealistic features like prices next week will be the most important. 198 | 199 | 50 200 | 00:03:01.165 --> 00:03:03.210 201 | But even when split by time, 202 | 203 | 51 204 | 00:03:03.210 --> 00:03:06.245 205 | data still contains information about future. 206 | 207 | 52 208 | 00:03:06.245 --> 00:03:09.800 209 | We still can access the rows from the test set. 210 | 211 | 53 212 | 00:03:09.800 --> 00:03:13.790 213 | We can have future user history in CTR task, 214 | 215 | 54 216 | 00:03:13.790 --> 00:03:20.145 217 | some fundamental indicators in stock market predictions tasks, and so on. 218 | 219 | 55 220 | 00:03:20.145 --> 00:03:24.510 221 | There are only two ways to eliminate the possibility of data leakage. 222 | 223 | 56 224 | 00:03:24.510 --> 00:03:29.090 225 | It's called competitions, where one can not access 226 | 227 | 57 228 | 00:03:29.090 --> 00:03:34.150 229 | rows from future or a test set with no features at all, only IDs. 230 | 231 | 58 232 | 00:03:34.150 --> 00:03:39.740 233 | For example, just the number and instrument ID in stock market prediction, 234 | 235 | 59 236 | 00:03:39.740 --> 00:03:45.420 237 | so participants create features based on past and join them themselves. 238 | 239 | 60 240 | 00:03:45.420 --> 00:03:48.610 241 | Now, let's discuss something more unusual. 
242 | 243 | 61 244 | 00:03:48.610 --> 00:03:52.820 245 | Those types of data leaks are much harder to find. 246 | 247 | 62 248 | 00:03:52.820 --> 00:03:56.810 249 | We often have more than just train and test files. 250 | 251 | 63 252 | 00:03:56.810 --> 00:04:01.140 253 | For example, a lot of images or text in archive. 254 | 255 | 64 256 | 00:04:01.140 --> 00:04:04.970 257 | In such case, we can access some meta information, 258 | 259 | 65 260 | 00:04:04.970 --> 00:04:08.950 261 | file creation date, image resolution etcetera. 262 | 263 | 66 264 | 00:04:08.950 --> 00:04:13.890 265 | It turns out that this meta information may be connected to target variable. 266 | 267 | 67 268 | 00:04:13.890 --> 00:04:18.535 269 | Imagine classic cats versus dogs classification. 270 | 271 | 68 272 | 00:04:18.535 --> 00:04:20.640 273 | What if cat pictures were taken before dogs? 274 | 275 | 69 276 | 00:04:20.640 --> 00:04:24.010 277 | Or taken with a different camera? 278 | 279 | 70 280 | 00:04:24.010 --> 00:04:29.510 281 | Because of that, a good practice from organizers is to erase the meta data, 282 | 283 | 71 284 | 00:04:29.510 --> 00:04:32.750 285 | resize the pictures, and change creation date. 286 | 287 | 72 288 | 00:04:32.750 --> 00:04:36.195 289 | Unfortunately, sometimes they forget about it. 290 | 291 | 73 292 | 00:04:36.195 --> 00:04:39.210 293 | A good example is Truly Native competition, 294 | 295 | 74 296 | 00:04:39.210 --> 00:04:44.505 297 | where one could get nearly perfect scores using just the dates from zip archives. 298 | 299 | 75 300 | 00:04:44.505 --> 00:04:48.380 301 | Another type of leakage could be found in IDs. 302 | 303 | 76 304 | 00:04:48.380 --> 00:04:54.285 305 | IDs are unique identifiers of every row usually used for convenience. 306 | 307 | 77 308 | 00:04:54.285 --> 00:04:57.410 309 | It makes no sense to include them into the model. 310 | 311 | 78 312 | 00:04:57.410 --> 00:05:00.905 313 | It is assumed that they are automatically generated. 
314 | 315 | 79 316 | 00:05:00.905 --> 00:05:04.060 317 | In reality, that's not always true. 318 | 319 | 80 320 | 00:05:04.060 --> 00:05:06.510 321 | ID may be a hash of something, 322 | 323 | 81 324 | 00:05:06.510 --> 00:05:09.295 325 | probably not intended for disclosure. 326 | 327 | 82 328 | 00:05:09.295 --> 00:05:14.075 329 | It may contain traces of information connected to target variable. 330 | 331 | 83 332 | 00:05:14.075 --> 00:05:16.605 333 | It was a case in Caterpillar competition. 334 | 335 | 84 336 | 00:05:16.605 --> 00:05:20.270 337 | Adding ID as a feature slightly improved the result. 338 | 339 | 85 340 | 00:05:20.270 --> 00:05:23.930 341 | So I advise you to pay close attention to IDs and 342 | 343 | 86 344 | 00:05:23.930 --> 00:05:27.875 345 | always check whether they are useful or not. 346 | 347 | 87 348 | 00:05:27.875 --> 00:05:29.965 349 | Next is row order. 350 | 351 | 88 352 | 00:05:29.965 --> 00:05:35.230 353 | In trivial case, data may be shuffled by target variable. 354 | 355 | 89 356 | 00:05:35.230 --> 00:05:39.040 357 | Sometimes simply adding row number or relative number, 358 | 359 | 90 360 | 00:05:39.040 --> 00:05:41.200 361 | suddenly improves the score. 362 | 363 | 91 364 | 00:05:41.200 --> 00:05:44.680 365 | Like, in Telstra Network Disruptions competition. 366 | 367 | 92 368 | 00:05:44.680 --> 00:05:47.995 369 | It's also possible to find something way more interesting 370 | 371 | 93 372 | 00:05:47.995 --> 00:05:52.420 373 | like in TalkingData Mobile User Demographics competition. 374 | 375 | 94 376 | 00:05:52.420 --> 00:05:55.220 377 | There was some kind of row duplication, 378 | 379 | 95 380 | 00:05:55.220 --> 00:05:59.610 381 | rows next to each other usually have the same label. 382 | 383 | 96 384 | 00:05:59.610 --> 00:06:02.500 385 | This is it with a regular type of leaks. 
386 | 387 | 97 388 | 00:06:02.500 --> 00:06:05.050 389 | To sum things up, in this video, 390 | 391 | 98 392 | 00:06:05.050 --> 00:06:12.780 393 | we embrace the concept of data leak and cover data leaks from future picking, 394 | 395 | 99 396 | 00:06:12.780 --> 00:06:16.380 397 | meta data, IDs, and row order. -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_2/3_2_Leaderboard probing and examples of rare data leaks.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_2/3_2_Leaderboard probing and examples of rare data leaks.srt -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_2/3_2_Leaderboard probing and examples of rare data leaks.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_2/3_3_Expedia challenge.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_2/3_3_Expedia challenge.srt -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_2/3_3_Expedia challenge.srt.style: 
-------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_3/1_1_Motivation.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_3/1_1_Motivation.srt -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_3/1_1_Motivation.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_3/1_2_Regression_metrics_review1.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_3/1_2_Regression_metrics_review1.srt -------------------------------------------------------------------------------- 
/cousera/howtowin_kaggle/week_3/1_2_Regression_metrics_review1.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_3/1_3_Regression_metrics_review2.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_3/1_3_Regression_metrics_review2.srt -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_3/1_3_Regression_metrics_review2.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_3/1_4_Classification_metrics_review.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_3/1_4_Classification_metrics_review.srt 
-------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_3/1_4_Classification_metrics_review.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_3/1_5_General_approaches.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_3/1_5_General_approaches.srt -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_3/1_5_General_approaches.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_3/1_5_General_approaches.vtt: -------------------------------------------------------------------------------- 1 | WEBVTT 2 | 3 | 1 4 | 00:00:03.210 --> 00:00:09.520 5 | In this video, we will discuss what is the loss and what is a metric, 
6 | 7 | 2 8 | 00:00:09.520 --> 00:00:11.905 9 | and what is the difference between them. 10 | 11 | 3 12 | 00:00:11.905 --> 00:00:18.155 13 | And then we'll overview what are the general approaches to metric optimization. 14 | 15 | 4 16 | 00:00:18.155 --> 00:00:23.545 17 | Let's start with a comparison between two notions, loss and metric. 18 | 19 | 5 20 | 00:00:23.545 --> 00:00:27.460 21 | The metric or target metric is a function which we 22 | 23 | 6 24 | 00:00:27.460 --> 00:00:31.690 25 | want to use to evaluate the quality of our model. 26 | 27 | 7 28 | 00:00:31.690 --> 00:00:34.390 29 | For example, for a classification task, 30 | 31 | 8 32 | 00:00:34.390 --> 00:00:38.097 33 | we may want to maximize accuracy of our predictions, 34 | 35 | 9 36 | 00:00:38.097 --> 00:00:41.765 37 | how frequently the model outputs the correct label. 38 | 39 | 10 40 | 00:00:41.765 --> 00:00:47.415 41 | But the problem is that no one really knows how to optimize accuracy efficiently. 42 | 43 | 11 44 | 00:00:47.415 --> 00:00:51.770 45 | Instead, people come up with the proxy loss functions. 46 | 47 | 12 48 | 00:00:51.770 --> 00:00:57.105 49 | They are such evaluation functions that are easy to optimize for a given model. 50 | 51 | 13 52 | 00:00:57.105 --> 00:01:02.325 53 | For example, logarithmic loss is widely used as an optimization loss, 54 | 55 | 14 56 | 00:01:02.325 --> 00:01:07.530 57 | while the accuracy score is how the solution is eventually evaluated. 58 | 59 | 15 60 | 00:01:07.530 --> 00:01:11.220 61 | So, once again, the loss function is a function 62 | 63 | 16 64 | 00:01:11.220 --> 00:01:15.205 65 | that our model optimizes and uses to evaluate the solution, 66 | 67 | 17 68 | 00:01:15.205 --> 00:01:20.455 69 | and the target metric is how we want the solution to be evaluated. 70 | 71 | 18 72 | 00:01:20.455 --> 00:01:24.365 73 | This is kind of expectation versus reality thing. 
74 | 75 | 19 76 | 00:01:24.365 --> 00:01:30.670 77 | Sometimes we are lucky and the model can optimize our target metric directly. 78 | 79 | 20 80 | 00:01:30.670 --> 00:01:34.360 81 | For example, for mean square error metric, 82 | 83 | 21 84 | 00:01:34.360 --> 00:01:39.960 85 | most libraries can optimize it from the outset, from the box. 86 | 87 | 22 88 | 00:01:39.960 --> 00:01:43.745 89 | So the loss function is the same as the target metric. 90 | 91 | 23 92 | 00:01:43.745 --> 00:01:46.690 93 | And sometimes we want to optimize metrics that 94 | 95 | 24 96 | 00:01:46.690 --> 00:01:50.845 97 | are really hard or even impossible to optimize directly. 98 | 99 | 25 100 | 00:01:50.845 --> 00:01:53.420 101 | In this case, we usually set the model to optimize 102 | 103 | 26 104 | 00:01:53.420 --> 00:01:56.545 105 | a loss that is different to a target metric, 106 | 107 | 27 108 | 00:01:56.545 --> 00:01:58.420 109 | but after a model is trained, 110 | 111 | 28 112 | 00:01:58.420 --> 00:02:02.290 113 | we use hacks and heuristics to negate the discrepancy 114 | 115 | 29 116 | 00:02:02.290 --> 00:02:07.520 117 | and adjust the model to better fit the target metric. 118 | 119 | 30 120 | 00:02:07.520 --> 00:02:11.810 121 | We will see the examples for both cases in the following videos. 122 | 123 | 31 124 | 00:02:11.810 --> 00:02:14.935 125 | And the last thing to mention is that loss metric, 126 | 127 | 32 128 | 00:02:14.935 --> 00:02:22.055 129 | cost objective and other notions are more or less used as synonyms. 130 | 131 | 33 132 | 00:02:22.055 --> 00:02:26.680 133 | It is completely okay to say target loss and optimization metric, 134 | 135 | 34 136 | 00:02:26.680 --> 00:02:29.895 137 | but we will fix the wording for the clarity now. 138 | 139 | 35 140 | 00:02:29.895 --> 00:02:33.495 141 | Okay, so far, we've understood 142 | 143 | 36 144 | 00:02:33.495 --> 00:02:38.745 145 | why it's important to optimize a metric given in a competition. 
146 | 147 | 37 148 | 00:02:38.745 --> 00:02:44.395 149 | And we have discussed the difference between optimization loss and target metric. 150 | 151 | 38 152 | 00:02:44.395 --> 00:02:50.305 153 | Now, let's overview the approaches to target metrics optimization in general. 154 | 155 | 39 156 | 00:02:50.305 --> 00:02:54.600 157 | The approaches can be broadly divided into several categories, 158 | 159 | 40 160 | 00:02:54.600 --> 00:02:57.300 161 | depending on the metric we need to optimize. 162 | 163 | 41 164 | 00:02:57.300 --> 00:03:01.050 165 | Some metrics can be optimized directly. 166 | 167 | 42 168 | 00:03:01.050 --> 00:03:06.825 169 | That is, we should just find a model that optimizes this metric and run it. 170 | 171 | 43 172 | 00:03:06.825 --> 00:03:13.200 173 | In fact, all we need to do is to set the model's loss function to this metric. 174 | 175 | 44 176 | 00:03:13.200 --> 00:03:16.055 177 | The most common metrics like MSE, 178 | 179 | 45 180 | 00:03:16.055 --> 00:03:22.470 181 | Logloss are implemented as loss functions in almost every library. 182 | 183 | 46 184 | 00:03:22.470 --> 00:03:26.090 185 | For some of the metrics that cannot be optimized directly, 186 | 187 | 47 188 | 00:03:26.090 --> 00:03:29.610 189 | we can somehow pre-process the train set and use 190 | 191 | 48 192 | 00:03:29.610 --> 00:03:34.245 193 | a model with a metric or loss function which is easy to optimize. 194 | 195 | 49 196 | 00:03:34.245 --> 00:03:40.265 197 | For example, while MSPE metric cannot be optimized directly with XGBoost, 198 | 199 | 50 200 | 00:03:40.265 --> 00:03:46.539 201 | we will see later that we can resample the train set and optimize MSE loss instead, 202 | 203 | 51 204 | 00:03:46.539 --> 00:03:48.930 205 | which XGBoost can optimize. 
206 | 207 | 52 208 | 00:03:48.930 --> 00:03:52.470 209 | Sometimes, we'll optimize an incorrect metric, 210 | 211 | 53 212 | 00:03:52.470 --> 00:03:58.890 213 | but we'll post-process the predictions to fit classification, 214 | 215 | 54 216 | 00:03:58.890 --> 00:04:01.850 217 | to fit the competition metric better. 218 | 219 | 55 220 | 00:04:01.850 --> 00:04:03.810 221 | For some models and frameworks, 222 | 223 | 56 224 | 00:04:03.810 --> 00:04:06.765 225 | it's possible to define a custom loss function, 226 | 227 | 57 228 | 00:04:06.765 --> 00:04:10.320 229 | and sometimes it's possible to implement a loss function which will 230 | 231 | 58 232 | 00:04:10.320 --> 00:04:14.345 233 | serve as a nice proxy for the desired metric. 234 | 235 | 59 236 | 00:04:14.345 --> 00:04:19.715 237 | For example, it can be done for quadratic-weighted Kappa, as we will see later. 238 | 239 | 60 240 | 00:04:19.715 --> 00:04:24.750 241 | It's actually quite easy to define a custom loss function for XGBoost. 242 | 243 | 61 244 | 00:04:24.750 --> 00:04:27.735 245 | We only need to implement a single function that 246 | 247 | 62 248 | 00:04:27.735 --> 00:04:30.910 249 | takes predictions and the target values and 250 | 251 | 63 252 | 00:04:30.910 --> 00:04:34.090 253 | computes first and second-order derivatives 254 | 255 | 64 256 | 00:04:34.090 --> 00:04:37.890 257 | of the loss function with respect to the model's predictions. 258 | 259 | 65 260 | 00:04:37.890 --> 00:04:41.275 261 | For example, here you see one for the Logloss. 262 | 263 | 66 264 | 00:04:41.275 --> 00:04:47.485 265 | Of course, the loss function should be smooth enough and have well-behaved derivatives, 266 | 267 | 67 268 | 00:04:47.485 --> 00:04:50.455 269 | otherwise XGBoost will go crazy. 270 | 271 | 68 272 | 00:04:50.455 --> 00:04:53.965 273 | In this course, we consider only a small set of metrics, 274 | 275 | 69 276 | 00:04:53.965 --> 00:04:56.300 277 | but there are plenty of them in fact. 
278 | 279 | 70 280 | 00:04:56.300 --> 00:04:57.960 281 | And for some of them, 282 | 283 | 71 284 | 00:04:57.960 --> 00:05:00.110 285 | it is really hard to come up with 286 | 287 | 72 288 | 00:05:00.110 --> 00:05:05.155 289 | a neat optimization procedure or write a custom loss function. 290 | 291 | 73 292 | 00:05:05.155 --> 00:05:09.020 293 | Thankfully, there is a method that always works. 294 | 295 | 74 296 | 00:05:09.020 --> 00:05:10.955 297 | It is called early stopping, 298 | 299 | 75 300 | 00:05:10.955 --> 00:05:13.310 301 | and it is very simple. 302 | 303 | 76 304 | 00:05:13.310 --> 00:05:16.290 305 | You set a model to optimize any loss function it can 306 | 307 | 77 308 | 00:05:16.290 --> 00:05:21.225 309 | optimize and you monitor the desired metric on a validation set. 310 | 311 | 78 312 | 00:05:21.225 --> 00:05:25.820 313 | And you stop the training when the model starts to fit according to 314 | 315 | 79 316 | 00:05:25.820 --> 00:05:30.815 317 | the desired metric and not according to the metric the model is truly optimizing. 318 | 319 | 80 320 | 00:05:30.815 --> 00:05:33.155 321 | That is important. Of course, 322 | 323 | 81 324 | 00:05:33.155 --> 00:05:36.615 325 | some metrics cannot be even easily evaluated. 326 | 327 | 82 328 | 00:05:36.615 --> 00:05:40.730 329 | For example, if the metric is based on a human assessor's opinions, 330 | 331 | 83 332 | 00:05:40.730 --> 00:05:44.500 333 | you cannot evaluate it on every iteration. 334 | 335 | 84 336 | 00:05:44.500 --> 00:05:47.730 337 | For such metrics, we cannot use early stopping, 338 | 339 | 85 340 | 00:05:47.730 --> 00:05:51.370 341 | but we will never find such metrics in a competition. 342 | 343 | 86 344 | 00:05:51.370 --> 00:05:53.050 345 | So, in this video, 346 | 347 | 87 348 | 00:05:53.050 --> 00:05:56.080 349 | we have discussed the discrepancy between our target 350 | 351 | 88 352 | 00:05:56.080 --> 00:06:00.055 353 | metric and the loss function that our model optimizes. 
354 | 355 | 89 356 | 00:06:00.055 --> 00:06:04.150 357 | We've reviewed several approaches to target metric optimization and, 358 | 359 | 90 360 | 00:06:04.150 --> 00:06:06.880 361 | in particular, discussed early stopping. 362 | 363 | 91 364 | 00:06:06.880 --> 00:06:11.480 365 | In the following videos, we will go through the regression and 366 | 367 | 92 368 | 00:06:11.480 --> 00:06:17.390 369 | classification metrics and see the hacks we can use to optimize them. -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_3/1_6_Regression_metrics_optimization.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_3/1_6_Regression_metrics_optimization.srt -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_3/1_6_Regression_metrics_optimization.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_3/1_7_Classification_metrics_optimization_1.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_3/1_7_Classification_metrics_optimization_1.srt 
-------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_3/1_7_Classification_metrics_optimization_1.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_3/1_8_Classification_metrics_optimization_2.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_3/1_8_Classification_metrics_optimization_2.srt -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_3/1_8_Classification_metrics_optimization_2.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_3/2_1_Concept_of_mean_encoding.srt: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_3/2_1_Concept_of_mean_encoding.srt -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_3/2_1_Concept_of_mean_encoding.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_3/2_2_Regularization.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_3/2_2_Regularization.srt -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_3/2_2_Regularization.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_3/2_3_Extensions_and_generalizations.srt: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_3/2_3_Extensions_and_generalizations.srt -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_3/2_3_Extensions_and_generalizations.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_4/1_1_Hyperparameter_tuning_1.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_4/1_1_Hyperparameter_tuning_1.srt -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_4/1_1_Hyperparameter_tuning_1.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- 
/cousera/howtowin_kaggle/week_4/1_2_Hyperparameter_tuning_2.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_4/1_2_Hyperparameter_tuning_2.srt -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_4/1_2_Hyperparameter_tuning_2.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_4/1_3_Hyperparameter_tuning_3.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_4/1_3_Hyperparameter_tuning_3.srt -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_4/1_3_Hyperparameter_tuning_3.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | 
-------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_4/1_4_Practical_guide.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_4/1_4_Practical_guide.srt -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_4/1_4_Practical_guide.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_4/1_5_KazAnova's competition pipeline, part 1.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_4/1_5_KazAnova's competition pipeline, part 1.srt -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_4/1_5_KazAnova's competition pipeline, part 1.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: 
Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_4/1_6_KazAnova's competition pipeline, part 2.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_4/1_6_KazAnova's competition pipeline, part 2.srt -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_4/1_6_KazAnova's competition pipeline, part 2.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_4/2_1_Statistics and distance based features.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_4/2_1_Statistics and distance based features.srt -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_4/2_1_Statistics and distance based features.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, 
StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_4/2_1_Statistics and distance based features.vtt: -------------------------------------------------------------------------------- 1 | WEBVTT 2 | 3 | 1 4 | 00:00:02.670 --> 00:00:05.630 5 | Hi everyone. 6 | 7 | 2 8 | 00:00:05.630 --> 00:00:10.770 9 | This video is dedicated to the following advanced feature engineering techniques. 10 | 11 | 3 12 | 00:00:10.770 --> 00:00:14.400 13 | Calculating various statistics of one feature grouped by 14 | 15 | 4 16 | 00:00:14.400 --> 00:00:19.425 17 | another and features derived from neighborhood analysis of a given point. 18 | 19 | 5 20 | 00:00:19.425 --> 00:00:21.660 21 | To make it a little bit clearer, 22 | 23 | 6 24 | 00:00:21.660 --> 00:00:24.000 25 | let's consider a simple example. 26 | 27 | 7 28 | 00:00:24.000 --> 00:00:27.590 29 | Here we have a chunk of data for some CTR task. 30 | 31 | 8 32 | 00:00:27.590 --> 00:00:31.735 33 | Let's forget about target variable and focus on human features. 34 | 35 | 9 36 | 00:00:31.735 --> 00:00:35.865 37 | Namely, User_ID, unique identifier of a user, 38 | 39 | 10 40 | 00:00:35.865 --> 00:00:40.120 41 | Page_ID, an identifier of a page user visited, 42 | 43 | 11 44 | 00:00:40.120 --> 00:00:43.890 45 | Ad_price, item prices in the ad, 46 | 47 | 12 48 | 00:00:43.890 --> 00:00:49.645 49 | and Ad_position, relative position of an ad on the web page. 50 | 51 | 13 52 | 00:00:49.645 --> 00:00:53.310 53 | The most straightforward way to solve this problem is to label 54 | 55 | 14 56 | 00:00:53.310 --> 00:00:57.450 57 | and call the Ad_position and feed some classifier. 
58 | 59 | 15 60 | 00:00:57.450 --> 00:01:00.480 61 | It would be a very good classifier that could take into 62 | 63 | 16 64 | 00:01:00.480 --> 00:01:04.395 65 | account all the hidden relations between variables. 66 | 67 | 17 68 | 00:01:04.395 --> 00:01:06.720 69 | But no matter how good it is, 70 | 71 | 18 72 | 00:01:06.720 --> 00:01:10.670 73 | it still treats all the data points independently. 74 | 75 | 19 76 | 00:01:10.670 --> 00:01:13.855 77 | And this is where we can apply feature engineering. 78 | 79 | 20 80 | 00:01:13.855 --> 00:01:16.460 81 | We can imply that an ad with 82 | 83 | 21 84 | 00:01:16.460 --> 00:01:20.580 85 | the lowest price on the page will catch most of the attention. 86 | 87 | 22 88 | 00:01:20.580 --> 00:01:24.450 89 | The rest of the ads on the page won't be very attractive. 90 | 91 | 23 92 | 00:01:24.450 --> 00:01:29.165 93 | It's pretty easy to calculate the features relevant to such an implication. 94 | 95 | 24 96 | 00:01:29.165 --> 00:01:34.930 97 | We can add lowest and highest prices for every user and page per ad. 98 | 99 | 25 100 | 00:01:34.930 --> 00:01:40.115 101 | Position of an ad with the lowest price could also be of use in such case. 102 | 103 | 26 104 | 00:01:40.115 --> 00:01:44.753 105 | Here's one of the ways to implement statistical features with pandas. 106 | 107 | 27 108 | 00:01:44.753 --> 00:01:48.615 109 | If our data is stored in the data frame df, 110 | 111 | 28 112 | 00:01:48.615 --> 00:01:55.550 113 | we call groupby method like this to get maximum and minimum price values. 114 | 115 | 29 116 | 00:01:55.550 --> 00:01:59.160 117 | Then store this object in gb variable, 118 | 119 | 30 120 | 00:01:59.160 --> 00:02:04.627 121 | and then join it back to the data frame df. This is it. 122 | 123 | 31 124 | 00:02:04.627 --> 00:02:09.325 125 | I want to emphasize that you should not stop at this point. 
126 | 127 | 32 128 | 00:02:09.325 --> 00:02:12.210 129 | It's possible to add other useful features not 130 | 131 | 33 132 | 00:02:12.210 --> 00:02:16.200 133 | necessarily calculated within user and page per. 134 | 135 | 34 136 | 00:02:16.200 --> 00:02:19.410 137 | It could be how many pages user has visited, 138 | 139 | 35 140 | 00:02:19.410 --> 00:02:23.455 141 | how many pages user has visited during the given session, 142 | 143 | 36 144 | 00:02:23.455 --> 00:02:26.280 145 | and ID of the most visited page, 146 | 147 | 37 148 | 00:02:26.280 --> 00:02:28.965 149 | how many users have visited that page, 150 | 151 | 38 152 | 00:02:28.965 --> 00:02:31.670 153 | and many, many more features. 154 | 155 | 39 156 | 00:02:31.670 --> 00:02:35.215 157 | The main idea is to introduce new information. 158 | 159 | 40 160 | 00:02:35.215 --> 00:02:40.210 161 | By that means, we can drastically increase the quality of the models. 162 | 163 | 41 164 | 00:02:40.210 --> 00:02:44.090 165 | But what if there is no features to use groupby on? 166 | 167 | 42 168 | 00:02:44.090 --> 00:02:45.960 169 | Well, in such case, 170 | 171 | 43 172 | 00:02:45.960 --> 00:02:50.535 173 | we can replace grouping operations with finding the nearest neighbors. 174 | 175 | 44 176 | 00:02:50.535 --> 00:02:56.370 177 | On the one hand, it's much harder to implement and collect useful information. 178 | 179 | 45 180 | 00:02:56.370 --> 00:02:59.455 181 | On the other hand, the method is more flexible. 182 | 183 | 46 184 | 00:02:59.455 --> 00:03:05.370 185 | We can fine tune things like the size of relevant neighborhood or metric. 186 | 187 | 47 188 | 00:03:05.370 --> 00:03:07.740 189 | The most common and natural example of 190 | 191 | 48 192 | 00:03:07.740 --> 00:03:12.050 193 | neighborhood analysis arises from purposive pricing. 194 | 195 | 49 196 | 00:03:12.050 --> 00:03:14.970 197 | Imagine that you need to predict rental prices. 
198 | 199 | 50 200 | 00:03:14.970 --> 00:03:19.150 201 | You would probably have some characteristics like floor space, 202 | 203 | 51 204 | 00:03:19.150 --> 00:03:22.050 205 | number of rooms, presence of a bus stop. 206 | 207 | 52 208 | 00:03:22.050 --> 00:03:26.665 209 | But you need something more than that to create a really good model. 210 | 211 | 53 212 | 00:03:26.665 --> 00:03:30.090 213 | It could be the number of other houses in 214 | 215 | 54 216 | 00:03:30.090 --> 00:03:35.370 217 | different neighborhoods like in 500 meters, 1,000 meters, 218 | 219 | 55 220 | 00:03:35.370 --> 00:03:41.080 221 | or 1,500 meters, or average price per square meter in such neighborhoods, 222 | 223 | 56 224 | 00:03:41.080 --> 00:03:43.140 225 | or the number of schools, 226 | 227 | 57 228 | 00:03:43.140 --> 00:03:47.190 229 | supermarkets, and parking lots in such neighborhoods. 230 | 231 | 58 232 | 00:03:47.190 --> 00:03:50.835 233 | The distances to the closest objects of interest 234 | 235 | 59 236 | 00:03:50.835 --> 00:03:54.950 237 | like subway stations or gyms could also be of use. 238 | 239 | 60 240 | 00:03:54.950 --> 00:03:56.835 241 | I think you've got the idea. 242 | 243 | 61 244 | 00:03:56.835 --> 00:04:00.705 245 | In the example, we've used a very simple case, 246 | 247 | 62 248 | 00:04:00.705 --> 00:04:04.980 249 | where neighborhoods were calculated in geographical space. 250 | 251 | 63 252 | 00:04:04.980 --> 00:04:08.040 253 | But don't be afraid to apply this method to 254 | 255 | 64 256 | 00:04:08.040 --> 00:04:11.710 257 | some abstract or even anonymized feature space. 258 | 259 | 65 260 | 00:04:11.710 --> 00:04:14.055 261 | It still could be very useful. 262 | 263 | 66 264 | 00:04:14.055 --> 00:04:18.350 265 | My team and I used this method in Spring Leaf competition. 266 | 267 | 67 268 | 00:04:18.350 --> 00:04:22.910 269 | Furthermore, we did it in supervised fashion. 
270 | 271 | 68 272 | 00:04:22.910 --> 00:04:24.405 273 | Here is how we have done it. 274 | 275 | 69 276 | 00:04:24.405 --> 00:04:28.260 277 | First of all, we applied mean encoding to all variables. 278 | 279 | 70 280 | 00:04:28.260 --> 00:04:32.940 281 | By doing so, we created homogeneous feature space so we 282 | 283 | 71 284 | 00:04:32.940 --> 00:04:38.325 285 | did not worry about scaling and importance of each particular feature. 286 | 287 | 72 288 | 00:04:38.325 --> 00:04:44.595 289 | After that, we calculated 2,000 nearest neighbors with Bray-Curtis metric. 290 | 291 | 73 292 | 00:04:44.595 --> 00:04:48.810 293 | Then we evaluated various features from 294 | 295 | 74 296 | 00:04:48.810 --> 00:04:53.740 297 | those neighbors like mean target of nearest 5, 10, 15, 500, 298 | 299 | 75 300 | 00:04:53.740 --> 00:04:59.540 301 | 2,000 neighbors, mean distance to 10 closest neighbors, 302 | 303 | 76 304 | 00:04:59.540 --> 00:05:03.713 305 | mean distance to 10 closest neighbors with target 1, 306 | 307 | 77 308 | 00:05:03.713 --> 00:05:08.240 309 | and mean distance to 10 closest neighbors with target 0, 310 | 311 | 78 312 | 00:05:08.240 --> 00:05:10.845 313 | and, it worked great. 314 | 315 | 79 316 | 00:05:10.845 --> 00:05:16.125 317 | In conclusion, I hope you embrace the main ideas of 318 | 319 | 80 320 | 00:05:16.125 --> 00:05:20.085 321 | both groupby and nearest neighbor methods 322 | 323 | 81 324 | 00:05:20.085 --> 00:05:24.935 325 | and you would be able to apply them in practice. 326 | 327 | 82 328 | 00:05:24.935 --> 00:05:28.510 329 | Thank you for your attention. 
-------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_4/2_2_Matrix factorizations.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_4/2_2_Matrix factorizations.srt -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_4/2_2_Matrix factorizations.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_4/2_3_Feature Interactions.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_4/2_3_Feature Interactions.srt -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_4/2_3_Feature Interactions.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: 
Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_4/2_3_Feature Interactions.vtt: -------------------------------------------------------------------------------- 1 | WEBVTT 2 | 3 | 1 4 | 00:00:02.990 --> 00:00:05.183 5 | Hi, everyone. 6 | 7 | 2 8 | 00:00:05.183 --> 00:00:08.535 9 | The main topic of this video is Feature Interactions. 10 | 11 | 3 12 | 00:00:08.535 --> 00:00:12.040 13 | You will learn how to construct them and use in problem solving. 14 | 15 | 4 16 | 00:00:12.040 --> 00:00:16.405 17 | Additionally, we will discuss them for feature extraction from decision trees. 18 | 19 | 5 20 | 00:00:16.405 --> 00:00:18.100 21 | Let's start with an example. 22 | 23 | 6 24 | 00:00:18.100 --> 00:00:20.160 25 | Suppose that we are building a model to predict 26 | 27 | 7 28 | 00:00:20.160 --> 00:00:23.245 29 | the best advertisement banner to display on a website. 30 | 31 | 8 32 | 00:00:23.245 --> 00:00:27.760 33 | Among available features, there are two categorical ones that we will concentrate on. 34 | 35 | 9 36 | 00:00:27.760 --> 00:00:30.810 37 | The category of the advertising banner itself and 38 | 39 | 10 40 | 00:00:30.810 --> 00:00:34.150 41 | the category of the site the banner will be showing on. 42 | 43 | 11 44 | 00:00:34.150 --> 00:00:37.603 45 | Certainly, we can use the features as two independent ones, 46 | 47 | 12 48 | 00:00:37.603 --> 00:00:41.525 49 | but a really important feature is indeed the combination of them. 50 | 51 | 13 52 | 00:00:41.525 --> 00:00:43.770 53 | We can explicitly construct the combination in 54 | 55 | 14 56 | 00:00:43.770 --> 00:00:47.015 57 | order to incorporate our knowledge into a model. 58 | 59 | 15 60 | 00:00:47.015 --> 00:00:52.195 61 | Let's construct new feature named ad_site that represents the combination. 
62 | 63 | 16 64 | 00:00:52.195 --> 00:00:54.551 65 | It will be categorical as the old ones, 66 | 67 | 17 68 | 00:00:54.551 --> 00:01:00.270 69 | but set of its values will be all possible combinations of two original values. 70 | 71 | 18 72 | 00:01:00.270 --> 00:01:01.905 73 | From a technical point of view, 74 | 75 | 19 76 | 00:01:01.905 --> 00:01:04.785 77 | there are two ways to construct such interaction. 78 | 79 | 20 80 | 00:01:04.785 --> 00:01:07.170 81 | Let's look at a simple example. 82 | 83 | 21 84 | 00:01:07.170 --> 00:01:08.700 85 | Consider our first feature, 86 | 87 | 22 88 | 00:01:08.700 --> 00:01:10.610 89 | f1, has values A or B. 90 | 91 | 23 92 | 00:01:10.610 --> 00:01:13.714 93 | Another feature, f2, has values X or Y or Z, 94 | 95 | 24 96 | 00:01:13.714 --> 00:01:17.870 97 | and our data set consists of four data points. 98 | 99 | 25 100 | 00:01:17.870 --> 00:01:21.810 101 | The first approach is to concatenate the text values of f1 and f2, 102 | 103 | 26 104 | 00:01:21.810 --> 00:01:25.710 105 | and use the result as a new categorical feature f_join. 106 | 107 | 27 108 | 00:01:25.710 --> 00:01:28.520 109 | We can then apply the OneHot encoding to it. 110 | 111 | 28 112 | 00:01:28.520 --> 00:01:30.840 113 | The second approach consists of two steps. 114 | 115 | 29 116 | 00:01:30.840 --> 00:01:35.025 117 | Firstly, apply OneHot encoding to features f1 and f2. 118 | 119 | 30 120 | 00:01:35.025 --> 00:01:38.940 121 | Secondly, construct a new matrix by multiplying each column from 122 | 123 | 31 124 | 00:01:38.940 --> 00:01:43.390 125 | f1 encoded matrix to each column from f2 encoded matrix. 126 | 127 | 32 128 | 00:01:43.390 --> 00:01:46.068 129 | It is worth noting that both methods result in 130 | 131 | 33 132 | 00:01:46.068 --> 00:01:49.410 133 | practically the same new feature representations. 
134 | 135 | 34 136 | 00:01:49.410 --> 00:01:51.075 137 | In the above example, 138 | 139 | 35 140 | 00:01:51.075 --> 00:01:54.570 141 | we considered interactions between categorical features, 142 | 143 | 36 144 | 00:01:54.570 --> 00:01:58.060 145 | but similar ideas can be applied to real valued features. 146 | 147 | 37 148 | 00:01:58.060 --> 00:02:01.230 149 | For example, having two real valued features f1 and f2, 150 | 151 | 38 152 | 00:02:01.230 --> 00:02:07.375 153 | interactions between them can be obtained by multiplications of f1 and f2. 154 | 155 | 39 156 | 00:02:07.375 --> 00:02:11.035 157 | In fact, we are not limited to use only multiply operation. 158 | 159 | 40 160 | 00:02:11.035 --> 00:02:14.070 161 | Any function taking two arguments like sum, 162 | 163 | 41 164 | 00:02:14.070 --> 00:02:16.735 165 | difference, or division is okay. 166 | 167 | 42 168 | 00:02:16.735 --> 00:02:19.320 169 | The following transformations significantly enlarge 170 | 171 | 43 172 | 00:02:19.320 --> 00:02:22.695 173 | feature space and makes learning easier, 174 | 175 | 44 176 | 00:02:22.695 --> 00:02:26.205 177 | but keep in mind that it makes overfitting easier too. 178 | 179 | 45 180 | 00:02:26.205 --> 00:02:29.610 181 | It should be emphasized that for tree-based algorithms such as 182 | 183 | 46 184 | 00:02:29.610 --> 00:02:32.280 185 | the random forest or gradient boosted decision trees 186 | 187 | 47 188 | 00:02:32.280 --> 00:02:35.530 189 | it's difficult to extract such kind of dependencies. 190 | 191 | 48 192 | 00:02:35.530 --> 00:02:40.265 193 | That's why the above transformations are very efficient for tree-based methods. 194 | 195 | 49 196 | 00:02:40.265 --> 00:02:42.755 197 | Let's discuss practical details now. 198 | 199 | 50 200 | 00:02:42.755 --> 00:02:47.520 201 | Pairwise feature generation approaches greatly increase the number of the features. 
202 | 203 | 51 204 | 00:02:47.520 --> 00:02:49.190 205 | If there were n original features, 206 | 207 | 52 208 | 00:02:49.190 --> 00:02:51.150 209 | there will be n squared. 210 | 211 | 53 212 | 00:02:51.150 --> 00:02:55.240 213 | And will be even more features if several types of interaction are used. 214 | 215 | 54 216 | 00:02:55.240 --> 00:02:57.550 217 | There are two ways to moderate this, 218 | 219 | 55 220 | 00:02:57.550 --> 00:03:01.100 221 | either do feature selection or dimensionality reduction. 222 | 223 | 56 224 | 00:03:01.100 --> 00:03:03.060 225 | I prefer doing the selection since 226 | 227 | 57 228 | 00:03:03.060 --> 00:03:05.615 229 | not all but only a few interactions often 230 | 231 | 58 232 | 00:03:05.615 --> 00:03:09.000 233 | achieve the same quality as all combinations of features. 234 | 235 | 59 236 | 00:03:09.000 --> 00:03:10.830 237 | For each type of interaction, 238 | 239 | 60 240 | 00:03:10.830 --> 00:03:13.555 241 | I construct all pairwise feature interactions. 242 | 243 | 61 244 | 00:03:13.555 --> 00:03:18.150 245 | Fit random forests over them and select several most important features. 246 | 247 | 62 248 | 00:03:18.150 --> 00:03:22.265 249 | Because number of resulting features for each type is relatively small. 250 | 251 | 63 252 | 00:03:22.265 --> 00:03:25.800 253 | It's possible to join them together along with original features and 254 | 255 | 64 256 | 00:03:25.800 --> 00:03:29.975 257 | use as input for any machine learning algorithm, usually a tree-based method. 258 | 259 | 65 260 | 00:03:29.975 --> 00:03:34.660 261 | During the video, we have examined the method to construct second order interactions. 262 | 263 | 66 264 | 00:03:34.660 --> 00:03:38.750 265 | But you can similarly produce third order or higher. 
266 | 267 | 67 268 | 00:03:38.750 --> 00:03:42.680 269 | Due to the fact that number of features grow rapidly with order, 270 | 271 | 68 272 | 00:03:42.680 --> 00:03:45.225 273 | it has become difficult to work with them. 274 | 275 | 69 276 | 00:03:45.225 --> 00:03:49.440 277 | Therefore high order interactions are often constructed semi-manually. 278 | 279 | 70 280 | 00:03:49.440 --> 00:03:52.165 281 | And this is an art in some ways. 282 | 283 | 71 284 | 00:03:52.165 --> 00:03:54.690 285 | Additionally, I would like to talk about methods to 286 | 287 | 72 288 | 00:03:54.690 --> 00:03:57.880 289 | construct categorical features from decision trees. 290 | 291 | 73 292 | 00:03:57.880 --> 00:03:59.840 293 | Take a look at the decision tree. 294 | 295 | 74 296 | 00:03:59.840 --> 00:04:03.475 297 | Let's map each leaf into a binary feature. 298 | 299 | 75 300 | 00:04:03.475 --> 00:04:09.215 301 | The index of the object's leaf can be used as a value for a new categorical feature. 302 | 303 | 76 304 | 00:04:09.215 --> 00:04:12.565 305 | If we use not a single tree but an ensemble of them. 306 | 307 | 77 308 | 00:04:12.565 --> 00:04:14.260 309 | For example, a random forest, 310 | 311 | 78 312 | 00:04:14.260 --> 00:04:18.070 313 | then such operation can be applied to each of the trees. 314 | 315 | 79 316 | 00:04:18.070 --> 00:04:22.270 317 | This is a powerful way to extract high order interactions. 318 | 319 | 80 320 | 00:04:22.270 --> 00:04:24.895 321 | This technique is quite simple to implement. 322 | 323 | 81 324 | 00:04:24.895 --> 00:04:27.970 325 | Tree-based models from sklearn library have 326 | 327 | 82 328 | 00:04:27.970 --> 00:04:30.190 329 | an apply method which takes as 330 | 331 | 83 332 | 00:04:30.190 --> 00:04:33.830 333 | input feature matrix and returns corresponding indices of leaves. 334 | 335 | 84 336 | 00:04:33.830 --> 00:04:39.840 337 | Xgboost also supports it via a parameter pred_leaf in predict method. 
338 | 339 | 85 340 | 00:04:39.840 --> 00:04:42.730 341 | I suggest we need to collaborate documentations in order to 342 | 343 | 86 344 | 00:04:42.730 --> 00:04:46.420 345 | get more information about these methods and IPIs. 346 | 347 | 87 348 | 00:04:46.420 --> 00:04:48.210 349 | In the end of this video, 350 | 351 | 88 352 | 00:04:48.210 --> 00:04:50.250 353 | I will tackle the main points. 354 | 355 | 89 356 | 00:04:50.250 --> 00:04:54.960 357 | We examined method to construct an interactions of categorical features. 358 | 359 | 90 360 | 00:04:54.960 --> 00:04:58.135 361 | Also, we extend the approach to real-valued features. 362 | 363 | 91 364 | 00:04:58.135 --> 00:05:00.610 365 | And we have learned how to use trees to extract 366 | 367 | 92 368 | 00:05:00.610 --> 00:05:04.510 369 | high order interactions. Thank you for your attention. -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_4/2_4_t-SNE.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_4/2_4_t-SNE.srt -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_4/2_4_t-SNE.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_4/2_4_t-SNE.vtt: 
-------------------------------------------------------------------------------- 1 | WEBVTT 2 | 3 | 1 4 | 00:00:03.080 --> 00:00:05.268 5 | Hi, everyone. 6 | 7 | 2 8 | 00:00:05.268 --> 00:00:10.095 9 | Today, we will discuss this new method for visualizing data integrating features. 10 | 11 | 3 12 | 00:00:10.095 --> 00:00:11.540 13 | At the end of this video, 14 | 15 | 4 16 | 00:00:11.540 --> 00:00:14.190 17 | you will be able to use tSNE in your products. 18 | 19 | 5 20 | 00:00:14.190 --> 00:00:15.745 21 | In the previous video, 22 | 23 | 6 24 | 00:00:15.745 --> 00:00:20.930 25 | we learned about metaphysician technique that is predatory very close to linear models. 26 | 27 | 7 28 | 00:00:20.930 --> 00:00:22.980 29 | In this video, we will touch 30 | 31 | 8 32 | 00:00:22.980 --> 00:00:26.355 33 | the subject of non-linear methods of dimensionality reduction. 34 | 35 | 9 36 | 00:00:26.355 --> 00:00:29.180 37 | That says in general are called manifold learning. 38 | 39 | 10 40 | 00:00:29.180 --> 00:00:34.225 41 | For example, look at the data in form of letter S on the left side. 42 | 43 | 11 44 | 00:00:34.225 --> 00:00:36.380 45 | On the right, we can see results of running 46 | 47 | 12 48 | 00:00:36.380 --> 00:00:39.255 49 | different manifold learning algorithm on the data. 50 | 51 | 13 52 | 00:00:39.255 --> 00:00:43.560 53 | This new result is placed at the right bottom corner on the slide. 54 | 55 | 14 56 | 00:00:43.560 --> 00:00:46.803 57 | This new algorithm is the main topic of the lecture, 58 | 59 | 15 60 | 00:00:46.803 --> 00:00:50.170 61 | as it tells of how this really works won't be explained here. 62 | 63 | 16 64 | 00:00:50.170 --> 00:00:54.090 65 | But you will come to look at additional materials for the details. 
66 | 67 | 17 68 | 00:00:54.090 --> 00:00:58.295 69 | Let's just say that this is a method that tries to project points from 70 | 71 | 18 72 | 00:00:58.295 --> 00:01:01.340 73 | high dimensional space into small dimensional space 74 | 75 | 19 76 | 00:01:01.340 --> 00:01:05.075 77 | so that the distances between points are approximately preserved. 78 | 79 | 20 80 | 00:01:05.075 --> 00:01:09.500 81 | Let's look at the example of the tSNE on the MNIST dataset. 82 | 83 | 21 84 | 00:01:09.500 --> 00:01:15.225 85 | Here are points from 700 dimensional space that are projected into two dimensional space. 86 | 87 | 22 88 | 00:01:15.225 --> 00:01:19.235 89 | You can see that such projection forms explicit clusters. 90 | 91 | 23 92 | 00:01:19.235 --> 00:01:22.240 93 | Coolest shows that these clusters are meaningful and 94 | 95 | 24 96 | 00:01:22.240 --> 00:01:25.785 97 | corresponds to the target numbers well. 98 | 99 | 25 100 | 00:01:25.785 --> 00:01:29.400 101 | Moreover, neighbor clusters corresponds to a visually similar numbers. 102 | 103 | 26 104 | 00:01:29.400 --> 00:01:32.730 105 | For example, cluster of three is located next to the cluster of 106 | 107 | 27 108 | 00:01:32.730 --> 00:01:37.490 109 | five which in chance is adjustment to the cluster of six and eight. 110 | 111 | 28 112 | 00:01:37.490 --> 00:01:41.535 113 | If data has explicit structure as in case of MNIST dataset, 114 | 115 | 29 116 | 00:01:41.535 --> 00:01:44.460 117 | it's likely to be reflected on tSNE plot. 118 | 119 | 30 120 | 00:01:44.460 --> 00:01:49.410 121 | For the reason tSNE is widely used in exploratory data analysis. 122 | 123 | 31 124 | 00:01:49.410 --> 00:01:53.875 125 | However, do not assume that tSNE is a magic want that always helps. 126 | 127 | 32 128 | 00:01:53.875 --> 00:01:58.640 129 | For example, a misfortune choice of hyperparameters may lead to poor results. 
130 | 131 | 33 132 | 00:01:58.640 --> 00:02:02.095 133 | Consider an example, in the center is the least presented 134 | 135 | 34 136 | 00:02:02.095 --> 00:02:06.590 137 | a tSNE projection of exactly the same MNIST data as in previous example, 138 | 139 | 35 140 | 00:02:06.590 --> 00:02:09.340 141 | only perplexity parameter has been changed. 142 | 143 | 36 144 | 00:02:09.340 --> 00:02:11.110 145 | On the left, for comparison, 146 | 147 | 37 148 | 00:02:11.110 --> 00:02:13.225 149 | we have plots from previous right. 150 | 151 | 38 152 | 00:02:13.225 --> 00:02:17.190 153 | On the right, so it present a tSNE projection of random data. 154 | 155 | 39 156 | 00:02:17.190 --> 00:02:20.790 157 | We can see as a choice of hybrid parameters change projection of 158 | 159 | 40 160 | 00:02:20.790 --> 00:02:24.500 161 | MNIST data significantly so that we cannot see clusters. 162 | 163 | 41 164 | 00:02:24.500 --> 00:02:30.775 165 | Moreover, new projection become more similar to random data rather than to the original. 166 | 167 | 42 168 | 00:02:30.775 --> 00:02:34.615 169 | Let's find out what depends on the perplexity hyperparameter value. 170 | 171 | 43 172 | 00:02:34.615 --> 00:02:36.426 173 | On the left, we have perplexity=3, 174 | 175 | 44 176 | 00:02:36.426 --> 00:02:42.805 177 | in the center=10, and on the right= 150. 178 | 179 | 45 180 | 00:02:42.805 --> 00:02:47.910 181 | I want to emphasize that these projections are all made for the same data. 182 | 183 | 46 184 | 00:02:47.910 --> 00:02:52.875 185 | The illustration shows that these new results strongly depends on its parameters, 186 | 187 | 47 188 | 00:02:52.875 --> 00:02:57.270 189 | and the interpretation of the results is not a simple task. 190 | 191 | 48 192 | 00:02:57.270 --> 00:02:59.500 193 | In particular, one cannot infer the size of 194 | 195 | 49 196 | 00:02:59.500 --> 00:03:02.855 197 | original clusters using the size of projected clusters. 
198 | 199 | 50 200 | 00:03:02.855 --> 00:03:06.050 201 | Similar proposition is valid for a distance between clusters. 202 | 203 | 51 204 | 00:03:06.050 --> 00:03:09.417 205 | Blog distill.pub contain a post 206 | 207 | 52 208 | 00:03:09.417 --> 00:03:13.595 209 | about how to understand and interpret the results of tSNE. 210 | 211 | 53 212 | 00:03:13.595 --> 00:03:16.220 213 | Also, it contains a great interactive demo 214 | 215 | 54 216 | 00:03:16.220 --> 00:03:19.575 217 | that will help you to get into issues of how tSNE works. 218 | 219 | 55 220 | 00:03:19.575 --> 00:03:21.980 221 | I strongly advise you to take a look at it. 222 | 223 | 56 224 | 00:03:21.980 --> 00:03:24.690 225 | In addition to exploratory data analysis, 226 | 227 | 57 228 | 00:03:24.690 --> 00:03:28.770 229 | tSNE can be considered as a method to obtain new features from data. 230 | 231 | 58 232 | 00:03:28.770 --> 00:03:33.235 233 | You should just concatenate the transformers coordinates to the original feature matrix. 234 | 235 | 59 236 | 00:03:33.235 --> 00:03:35.680 237 | Now if you've heard this about practical details, 238 | 239 | 60 240 | 00:03:35.680 --> 00:03:37.270 241 | as it has been shown earlier, 242 | 243 | 61 244 | 00:03:37.270 --> 00:03:38.490 245 | the results of tSNE algorithm, 246 | 247 | 62 248 | 00:03:38.490 --> 00:03:41.480 249 | it strongly depends on hyperparameters. 250 | 251 | 63 252 | 00:03:41.480 --> 00:03:45.690 253 | It is good practice to use several projections with different perplexities. 254 | 255 | 64 256 | 00:03:45.690 --> 00:03:49.110 257 | In addition, because of stochastic of this methods results in 258 | 259 | 65 260 | 00:03:49.110 --> 00:03:52.660 261 | different projections even with the same data and hyperparameters. 262 | 263 | 66 264 | 00:03:52.660 --> 00:03:58.490 265 | This means the train and test sets should be projected together rather than separately. 
266 | 267 | 67 268 | 00:03:58.490 --> 00:04:02.575 269 | Also, tSNE will run for a long time if you have a lot of features. 270 | 271 | 68 272 | 00:04:02.575 --> 00:04:05.290 273 | If the number of features is greater than 500, 274 | 275 | 69 276 | 00:04:05.290 --> 00:04:09.165 277 | you should use one of dimensionality reduction approach and reduce number of features, 278 | 279 | 70 280 | 00:04:09.165 --> 00:04:11.585 281 | for example, to 100. 282 | 283 | 71 284 | 00:04:11.585 --> 00:04:15.700 285 | Implementation of tSNE can be found in the sklearn library. 286 | 287 | 72 288 | 00:04:15.700 --> 00:04:17.255 289 | But personally, I prefer to use 290 | 291 | 73 292 | 00:04:17.255 --> 00:04:20.975 293 | another implementation from a separate Python package called tSNE, 294 | 295 | 74 296 | 00:04:20.975 --> 00:04:24.830 297 | since it provide a way more efficient implementation. 298 | 299 | 75 300 | 00:04:24.830 --> 00:04:28.570 301 | In conclusion, I want to remind you the basic points of the lecture. 302 | 303 | 76 304 | 00:04:28.570 --> 00:04:31.630 305 | TSNE is an excellent tool for visualizing data. 306 | 307 | 77 308 | 00:04:31.630 --> 00:04:33.785 309 | If data has an explicit structure, 310 | 311 | 78 312 | 00:04:33.785 --> 00:04:37.318 313 | then it likely be [inaudible] on tSNE projection. 314 | 315 | 79 316 | 00:04:37.318 --> 00:04:41.615 317 | However, it requires to be cautious with interpretation of tSNE results. 318 | 319 | 80 320 | 00:04:41.615 --> 00:04:46.145 321 | Sometimes you can see structure where it does not exist or vice versa, 322 | 323 | 81 324 | 00:04:46.145 --> 00:04:48.785 325 | see none where structure is actually present. 326 | 327 | 82 328 | 00:04:48.785 --> 00:04:53.530 329 | It's a good practice to do several tSNE projections with different perplexities. 
330 | 331 | 83 332 | 00:04:53.530 --> 00:04:55.035 333 | And in addition to EJ, 334 | 335 | 84 336 | 00:04:55.035 --> 00:04:59.125 337 | tSNE is working very well as a feature for feeding models. 338 | 339 | 85 340 | 00:04:59.125 --> 00:05:01.800 341 | Thank you for your attention. -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_4/3_1_Introduction into ensemble methods.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_4/3_1_Introduction into ensemble methods.srt -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_4/3_1_Introduction into ensemble methods.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_4/3_1_Introduction into ensemble methods.vtt: -------------------------------------------------------------------------------- 1 | WEBVTT 2 | 3 | 1 4 | 00:00:01.030 --> 00:00:05.942 5 | Hello everyone, this is Marios 6 | Michailidis, and this will be the first 7 | 8 | 2 9 | 00:00:05.942 --> 00:00:10.452 10 | video in a series that we will be 11 | discussing on ensemble methods for 12 | 13 | 3 14 | 00:00:10.452 --> 00:00:11.835 15 | machine learning. 
16 | 17 | 4 18 | 00:00:11.835 --> 00:00:18.165 19 | To tell you a bit about me, I work as 20 | Research Data Scientist for H2Oai. 21 | 22 | 5 23 | 00:00:18.165 --> 00:00:21.976 24 | In fact, 25 | my PhD is about assemble methods, and 26 | 27 | 6 28 | 00:00:21.976 --> 00:00:25.501 29 | they used to be ranked 30 | number one in cargo and 31 | 32 | 7 33 | 00:00:25.501 --> 00:00:30.600 34 | ensemble methods have greatly 35 | helped me to achieve this spot. 36 | 37 | 8 38 | 00:00:30.600 --> 00:00:32.800 39 | So you might find the course interesting. 40 | 41 | 9 42 | 00:00:34.480 --> 00:00:37.077 43 | So what is ensemble modelling? 44 | 45 | 10 46 | 00:00:37.077 --> 00:00:41.947 47 | I think with this term, we refer to 48 | combining many different machine learning 49 | 50 | 11 51 | 00:00:41.947 --> 00:00:45.620 52 | models in order to get 53 | a more powerful prediction. 54 | 55 | 12 56 | 00:00:45.620 --> 00:00:48.997 57 | And later on we will see 58 | examples that this happens, 59 | 60 | 13 61 | 00:00:48.997 --> 00:00:53.386 62 | that we combine different models and 63 | we do get better predictions. 64 | 65 | 14 66 | 00:00:53.386 --> 00:00:56.175 67 | There are various ensemble methods. 68 | 69 | 15 70 | 00:00:56.175 --> 00:01:01.240 71 | Here we'll discuss a few, those that 72 | we encounter quite often, in predictive 73 | 74 | 16 75 | 00:01:01.240 --> 00:01:06.471 76 | modelling competitions, and they tend 77 | to be, in general, quite competitive. 78 | 79 | 17 80 | 00:01:06.471 --> 00:01:10.924 81 | We will start with simple averaging 82 | methods, then we'll go to weighted 83 | 84 | 18 85 | 00:01:10.924 --> 00:01:15.311 86 | averaging methods, and we will also 87 | examine conditional averaging. 
88 | 89 | 19 90 | 00:01:15.311 --> 00:01:19.950 91 | And then we will move to some more 92 | typical ones like bagging, or 93 | 94 | 20 95 | 00:01:19.950 --> 00:01:24.942 96 | the very, very popular, boosting, 97 | then stacking and StackNet, 98 | 99 | 21 100 | 00:01:24.942 --> 00:01:27.590 101 | which is the result of my research. 102 | 103 | 22 104 | 00:01:30.350 --> 00:01:34.160 105 | But as I said, 106 | these will be a series of videos, and 107 | 108 | 23 109 | 00:01:34.160 --> 00:01:38.163 110 | we will initially start 111 | with the averaging methods. 112 | 113 | 24 114 | 00:01:41.060 --> 00:01:45.366 115 | So, in order to help you understand 116 | a bit more about the averaging methods, 117 | 118 | 25 119 | 00:01:45.366 --> 00:01:46.791 120 | let's take an example. 121 | 122 | 26 123 | 00:01:46.791 --> 00:01:51.622 124 | Let's say we have a variable called age, 125 | as in age years, 126 | 127 | 27 128 | 00:01:51.622 --> 00:01:54.150 129 | and we try to predict this. 130 | 131 | 28 132 | 00:01:54.150 --> 00:01:57.241 133 | We have a model that yields prediction for 134 | age. 135 | 136 | 29 137 | 00:01:57.241 --> 00:02:01.386 138 | Let's assume that 139 | the relationship between the two, 140 | 141 | 30 142 | 00:02:01.386 --> 00:02:08.010 143 | the actual age in our prediction, 144 | looks like in the graph, as in the graph. 145 | 146 | 31 147 | 00:02:08.010 --> 00:02:15.660 148 | So you can see that the model boasts 149 | quite a higher square of a value of 0.91, 150 | 151 | 32 152 | 00:02:15.660 --> 00:02:19.980 153 | but it doesn't do so 154 | well in the whole range of values. 155 | 156 | 33 157 | 00:02:19.980 --> 00:02:25.680 158 | So when age is less than 50, 159 | the model actually does quite well. 160 | 161 | 34 162 | 00:02:25.680 --> 00:02:28.856 163 | But when age is more than 50, 164 | 165 | 35 166 | 00:02:28.856 --> 00:02:33.505 167 | you can see that the average 168 | error is higher. 
169 | 170 | 36 171 | 00:02:33.505 --> 00:02:35.960 172 | Now let's take another example. 173 | 174 | 37 175 | 00:02:35.960 --> 00:02:40.962 176 | Let's assume we have a second model 177 | that also tries to predict age, 178 | 179 | 38 180 | 00:02:40.962 --> 00:02:43.167 181 | but this one looks like that. 182 | 183 | 39 184 | 00:02:43.167 --> 00:02:48.988 185 | As you can see, this model does quite 186 | well when age is higher than 50, 187 | 188 | 40 189 | 00:02:48.988 --> 00:02:56.020 190 | but not so well when age is less than 50, 191 | nevertheless, it scores again 0.91. 192 | 193 | 41 194 | 00:02:56.020 --> 00:03:01.200 195 | So we have two models that have 196 | a similar predictive power, 197 | 198 | 42 199 | 00:03:01.200 --> 00:03:04.007 200 | but they look quite different. 201 | 202 | 43 203 | 00:03:04.007 --> 00:03:08.682 204 | It's quite obvious that they do 205 | better in different parts of 206 | 207 | 44 208 | 00:03:08.682 --> 00:03:10.707 209 | the distribution of age. 210 | 211 | 45 212 | 00:03:10.707 --> 00:03:14.394 213 | So what will happen if we 214 | were to try to combine 215 | 216 | 46 217 | 00:03:14.394 --> 00:03:19.148 218 | this two with a simple averaging method, 219 | in other words, 220 | 221 | 47 222 | 00:03:19.148 --> 00:03:25.540 223 | just say (model 1 + model two) / 2, 224 | so a simple averaging method. 225 | 226 | 48 227 | 00:03:25.540 --> 00:03:28.920 228 | The end result will look 229 | as in the new graph. 230 | 231 | 49 232 | 00:03:28.920 --> 00:03:34.592 233 | So, our square has moved to 0.95, 234 | which is a considerable 235 | 236 | 50 237 | 00:03:34.592 --> 00:03:40.692 238 | improvement versus the 0.91 we had before, 239 | and as you can see, 240 | 241 | 51 242 | 00:03:40.692 --> 00:03:46.059 243 | on average, the points tend to 244 | be closer with the reality. 245 | 246 | 52 247 | 00:03:46.059 --> 00:03:49.723 248 | So the average error is smaller. 
249 | 250 | 53 251 | 00:03:49.723 --> 00:03:56.052 252 | However, as you can see, the model doesn't 253 | do better as an individual models for 254 | 255 | 54 256 | 00:03:56.052 --> 00:03:59.998 257 | the areas where the models 258 | were doing really well, 259 | 260 | 55 261 | 00:03:59.998 --> 00:04:03.410 262 | nevertheless, it does better on average. 263 | 264 | 56 265 | 00:04:03.410 --> 00:04:06.584 266 | This is something we need to understand, 267 | 268 | 57 269 | 00:04:06.584 --> 00:04:12.195 270 | that there is potentially a better 271 | way to combine these models. 272 | 273 | 58 274 | 00:04:12.195 --> 00:04:15.354 275 | We could try to take a weighting average. 276 | 277 | 59 278 | 00:04:15.354 --> 00:04:19.976 279 | So say, I'm going to take 70% of 280 | the first model prediction and 281 | 282 | 60 283 | 00:04:19.976 --> 00:04:22.893 284 | 30% of the second model prediction. 285 | 286 | 61 287 | 00:04:22.893 --> 00:04:28.853 288 | In other words, 289 | (model 1x0.7 + model 2x0.3), 290 | 291 | 62 292 | 00:04:28.853 --> 00:04:33.393 293 | and the end result would 294 | look as in the graph. 295 | 296 | 63 297 | 00:04:33.393 --> 00:04:38.849 298 | So you can see their square is no better 299 | and that makes sense, because the models 300 | 301 | 64 302 | 00:04:38.849 --> 00:04:44.560 303 | have quite similar predictive power and 304 | it doesn't make sense to rely more in one. 305 | 306 | 65 307 | 00:04:46.280 --> 00:04:51.215 308 | And also it is quite clear that 309 | it looks more with model 1, 310 | 311 | 66 312 | 00:04:51.215 --> 00:04:56.452 313 | because it has better predictions 314 | when age is less than 50, 315 | 316 | 67 317 | 00:04:56.452 --> 00:05:00.699 318 | and worse predictions 319 | when age is more than 50. 320 | 321 | 68 322 | 00:05:00.699 --> 00:05:08.250 323 | As a theoretical exercise, what is the 324 | theoretical best we could get out of this? 
325 | 326 | 69 327 | 00:05:08.250 --> 00:05:13.250 328 | We know we have a model that scores 329 | really well when age is less than 50, 330 | 331 | 70 332 | 00:05:13.250 --> 00:05:17.820 333 | and another model that scores really 334 | well when age is more than 50. 335 | 336 | 71 337 | 00:05:17.820 --> 00:05:21.776 338 | So ideally, we would like to 339 | get to something like that. 340 | 341 | 72 342 | 00:05:21.776 --> 00:05:26.420 343 | This is how we leverage the two 344 | models in the best possible way 345 | 346 | 73 347 | 00:05:26.420 --> 00:05:29.891 348 | here by using a simple 349 | conditioning method. 350 | 351 | 74 352 | 00:05:29.891 --> 00:05:35.187 353 | So if age is less than 50 I'll use one, 354 | otherwise the other, and we will see later 355 | 356 | 75 357 | 00:05:35.187 --> 00:05:40.310 358 | on that there are ensemble methods 359 | that are very good at finding these 360 | 361 | 76 362 | 00:05:40.310 --> 00:05:46.510 363 | relationships of two or more predictions 364 | with respect to the target variable. 365 | 366 | 77 367 | 00:05:46.510 --> 00:05:49.210 368 | But, this will be a topic for 369 | another discussion. 370 | 371 | 78 372 | 00:05:49.210 --> 00:05:53.340 373 | Here we discussed simple averaging methods, 374 | 375 | 79 376 | 00:05:53.340 --> 00:05:58.250 377 | hopefully you found it useful, and 378 | stay here for the next session to come. 379 | 380 | 80 381 | 00:05:58.250 --> 00:05:59.170 382 | Thank you very much. 
-------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_4/3_2_Bagging.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_4/3_2_Bagging.srt -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_4/3_2_Bagging.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_4/3_3_Boosting.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_4/3_3_Boosting.srt -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_4/3_3_Boosting.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | 
-------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_4/3_4_Stacking.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_4/3_4_Stacking.srt -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_4/3_4_Stacking.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_4/3_5_StackNet.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_4/3_5_StackNet.srt -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_4/3_5_StackNet.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | 
-------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_4/3_6_Ensembling Tips and Tricks.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_4/3_6_Ensembling Tips and Tricks.srt -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_4/3_6_Ensembling Tips and Tricks.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_5/1_Crowdflower Competition.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_5/1_Crowdflower\ Competition.srt: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_5/1_Crowdflower\ Competition.srt -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_5/2_Springleaf Marketing Response.srt: -------------------------------------------------------------------------------- 1 | 1 2 | 00:00:00.000 --> 00:00:03.802 3 | 音楽 4 | 5 | 2 6 | 00:00:03.802 --> 00:00:08.527 7 | こんにちは、コースを通して、我々は役に立 8 | つとしてスプリングリーフの競争を使用する 9 | 10 | 3 11 | 00:00:08.527 --> 00:00:13.840 12 | EDA の例では、最も近い近傍に基づいた 13 | 符号化と機能を意味します。 14 | 15 | 4 16 | 00:00:13.840 --> 00:00:20.090 17 | 当時、我々はと一緒にこの大会で3位を取っ 18 | た。 19 | 20 | 5 21 | 00:00:20.090 --> 00:00:24.710 22 | そして今、このビデオでは、私たちのソリュ 23 | ーションの最後の部分について説明します 24 | 25 | 6 26 | 00:00:24.710 --> 00:00:28.470 27 | これは、スタッキングとアンサンブルの使用 28 | 方法です。 29 | 30 | 7 31 | 00:00:28.470 --> 00:00:33.349 32 | この写真では、我々はレベルで生産最終的な 33 | スタッキングスキームを見ることができます 34 | 35 | 8 36 | 00:00:33.349 --> 00:00:38.070 37 | 最初のレベルで0の機能は、基本的なモデル 38 | による予測。 39 | 40 | 9 41 | 00:00:38.070 --> 00:00:40.110 42 | レベル1プラスの組み合わせで。 43 | 44 | 10 45 | 00:00:40.110 --> 00:00:44.270 46 | したがって、これらの予測といくつかの正確 47 | に選択した機能 48 | 49 | 11 50 | 00:00:44.270 --> 00:00:47.880 51 | 機能のこの新しいセットの2番目のレベルの 52 | モデルに。 53 | 54 | 12 55 | 00:00:47.880 --> 00:00:52.290 56 | そして最後に、3番目のレベルでは、その線 57 | 形の組み合わせ。 58 | 59 | 13 60 | 00:00:52.290 --> 00:00:52.910 61 | このビデオでは、 62 | 63 | 14 64 | 00:00:52.910 --> 00:00:58.301 65 | それは、この非自明な ensembled 66 | スキームにビルドとして我々は、各レベルを 67 | 通過します。 68 | 69 | 15 70 | 00:00:59.430 --> 00:01:03.590 71 | しかし、まず、すぐに問題について自分自身 72 | を思い出させる。 73 | 74 | 16 75 | 00:01:03.590 --> 00:01:07.428 76 | これは、曲線メトリックの下の領域を持つバ 77 | イナリ分類タスクでした。 78 | 79 | 17 80 | 00:01:07.428 --> 00:01:15.270 81 | 我々は、トレーニングデータと約2000匿 82 | 名の機能で145000サンプルを持ってい 83 | た。 84 | 85 | 18 86 | 00:01:15.270 --> 00:01:19.740 87 | これらは、EDA をしながら私たちによっ 88 | て導き出された有用な洞察でした。 89 | 90 | 19 91 | 00:01:19.740 --> 00:01:26.080 92 
| そして、あなたのメモリをリフレッシュする 93 | ために我々のコースで以前に行われた 94 | EDA 95 | をチェックアウトすることができます。 96 | 97 | 20 98 | 00:01:26.080 --> 00:01:29.150 99 | だから今の機能から始めましょう。 100 | 101 | 21 102 | 00:01:29.150 --> 00:01:33.040 103 | ここでは、機能の4つのパックがあります。 104 | 105 | 22 106 | 00:01:33.040 --> 00:01:37.760 107 | 最初の2つは、基本データセットと処理され 108 | たデータセットです。 109 | 110 | 23 111 | 00:01:37.760 --> 00:01:41.560 112 | それをシンプルに保つために、我々は単にか 113 | ら派生した洞察を使用 114 | 115 | 24 116 | 00:01:41.560 --> 00:01:45.690 117 | EDA は、データをきれいに 118 | [聞こえない] と新機能を生成します。 119 | 120 | 25 121 | 00:01:45.690 --> 00:01:49.280 122 | たとえば、重複した機能を削除し、 123 | 124 | 26 125 | 00:01:49.280 --> 00:01:53.729 126 | 散布図と相関関係に基づいていくつかのフィ 127 | ーチャインタラクションを編集します。 128 | 129 | 27 130 | 00:01:54.790 --> 00:01:59.815 131 | その後、我々は、成長関係のループを使用し 132 | てすべてのカテゴリ機能を意味エンコード 133 | 134 | 28 135 | 00:01:59.815 --> 00:02:02.050 136 | データとスムージングに署名します。 137 | 138 | 29 139 | 00:02:02.050 --> 00:02:06.704 140 | さらに、平均エンコードされたデータセット 141 | を使用して、最も近い 142 | 143 | 30 144 | 00:02:06.704 --> 00:02:07.620 145 | 隣人。 146 | 147 | 31 148 | 00:02:07.620 --> 00:02:12.390 149 | と同様に、クラスゼロの最も近いオブジェク 150 | トでは何ですか? 151 | 152 | 32 153 | 00:02:12.390 --> 00:02:18.280 154 | そして、どのように多くのオブジェクトのう 155 | ち、10の最寄りの隣人クラス1に属してい 156 | る? 
157 | 158 | 33 159 | 00:02:18.280 --> 00:02:21.119 160 | これがどのように行われるかを確認すること 161 | ができます 162 | 163 | 34 164 | 00:02:21.119 --> 00:02:24.677 165 | 関連トピックでは、デミトリ 166 | Altihof によって導入。 167 | 168 | 35 169 | 00:02:24.677 --> 00:02:31.469 170 | そこで最後に、これらの4つの機能のパック 171 | は、私たちのソリューションのレベル0でし 172 | た。 173 | 174 | 36 175 | 00:02:31.469 --> 00:02:35.952 176 | 2番目のレベルは、内のいくつかの異なるグ 177 | ラデーションで表された 178 | 179 | 37 180 | 00:02:35.952 --> 00:02:39.570 181 | デシジョンツリーモデルと1つのニューラル 182 | ネットワーク 183 | 184 | 38 185 | 00:02:39.570 --> 00:02:44.120 186 | ここでの主な考え方は、メタ機能は多様でな 187 | ければならないということです。 188 | 189 | 39 190 | 00:02:44.120 --> 00:02:48.340 191 | 各メタ機能は、ターゲットに関する新しい情 192 | 報をもたらす必要があります。 193 | 194 | 40 195 | 00:02:48.340 --> 00:02:55.714 196 | だから我々のモデルの両方の異なるパラメー 197 | タと機能のさまざまなセットを使用します。 198 | 199 | 41 200 | 00:02:55.714 --> 00:03:00.449 201 | ニューラルネットワークについては、我々は 202 | さらに事前に処理された機能 203 | 204 | 42 205 | 00:03:00.449 --> 00:03:04.600 206 | 共通のスカラー、ランクおよび力の変形。 207 | 208 | 43 209 | 00:03:04.600 --> 00:03:10.960 210 | 問題は、ネットワークのトレーニング結果を 211 | スキュー巨大な飛び地にあった。 212 | 213 | 44 214 | 00:03:10.960 --> 00:03:15.120 215 | 従ってランクおよび力の変形はこの問題の処 216 | 理を助けた。 217 | 218 | 45 219 | 00:03:16.566 --> 00:03:19.990 220 | それに決定を後押しすることで漸進的である 221 | メタ特徴を作り出した後 222 | 223 | 46 224 | 00:03:19.990 --> 00:03:21.230 225 | ニューラルネットワーク、 226 | 227 | 47 228 | 00:03:21.230 --> 00:03:26.280 229 | 我々は、次のレベルのモデルを支援するため 230 | にそれらの賃金上昇の違いを計算した。 231 | 232 | 48 233 | 00:03:26.280 --> 00:03:30.570 234 | これはまた、モデルを強制的に興味深いトリ 235 | ックであることに注意してください 236 | 237 | 49 238 | 00:03:30.570 --> 00:03:35.290 239 | 最初のレベルのモデルの予測の違いを活用す 240 | る。 241 | 242 | 50 243 | 00:03:35.290 --> 00:03:40.370 244 | ここでは、最も近い近傍に基づいてフィーチ 245 | ャの2つのデータセットを編集します。 246 | 247 | 51 248 | 00:03:40.370 --> 00:03:45.380 249 | 1つはレベル0から直接取得され、同じ機能 250 | が含まれています。 251 | 252 | 52 253 | 00:03:45.380 --> 00:03:51.040 254 | しかし、それは、平均符号化されたデータセ 255 | ットで半分の力に計算しました。 256 | 257 | 53 258 | 00:03:51.040 --> 00:03:55.704 259 | ここでのポイントは、これらの機能が完全に 260 | 
利用されていないということでした 261 | 262 | 54 263 | 00:03:55.704 --> 00:03:57.370 264 | 最初のレベルのモデル。 265 | 266 | 55 267 | 00:03:57.370 --> 00:04:02.690 268 | そして確かに、彼らはこのレベルに情報の新 269 | しい部分をもたらした。 270 | 271 | 56 272 | 00:04:03.740 --> 00:04:08.850 273 | 今、我々はすでに最初のレベルから自動フォ 274 | ールディング折り曲げる予測を持っていると 275 | 276 | 57 277 | 00:04:08.850 --> 00:04:11.110 278 | 我々はそれらのモデルを訓練します。 279 | 280 | 58 281 | 00:04:11.110 --> 00:04:16.230 282 | 他の民族のせいで標的が漏れる 283 | 284 | 59 285 | 00:04:16.230 --> 00:04:18.610 286 | また、機能が非常によくないため、 287 | 288 | 60 289 | 00:04:18.610 --> 00:04:23.600 290 | モデルが検出するデータには、ほとんどパタ 291 | ーンが残っていません。 292 | 293 | 61 294 | 00:04:23.600 --> 00:04:29.160 295 | 我々は、予測は多様であるべきであることを 296 | 念頭に置き、単純な分類器を選んだ。 297 | 298 | 62 299 | 00:04:29.160 --> 00:04:31.670 300 | 4種類のモデルを使用しました。 301 | 302 | 63 303 | 00:04:31.670 --> 00:04:34.780 304 | 勾配ブーストデシジョンツリー, 305 | ニューラルネットワーク, 306 | 307 | 64 308 | 00:04:34.780 --> 00:04:38.710 309 | ランダムフォレストとロジスティック回帰。 310 | 311 | 65 312 | 00:04:38.710 --> 00:04:42.289 313 | だから、このすべての2番目のレベルのモデ 314 | ルです。 315 | 316 | 66 317 | 00:04:43.340 --> 00:04:48.410 318 | そして最後に、我々は2番目のレベルのモデ 319 | ルのあなたの組み合わせでリニアを取った。 320 | 321 | 67 322 | 00:04:48.410 --> 00:04:54.770 323 | 線形モデルは、我々は推定係数に傾いていな 324 | いので、 325 | 326 | 68 327 | 00:04:54.770 --> 00:04:59.890 328 | 直接これらの4つの予測とデータを投げるた 329 | めの我々のターゲットを使用します。 330 | 331 | 69 332 | 00:04:59.890 --> 00:05:01.450 333 | だから、これは。 334 | 335 | 70 336 | 00:05:01.450 --> 00:05:06.244 337 | 我々は、この積み重ねスキームの各レベルを 338 | 経て、学生を行った。 339 | 340 | 71 341 | 00:05:06.244 --> 00:05:08.390 342 | なぜ我々はこのような複雑さが必要ですか? 
343 | 344 | 72 345 | 00:05:08.390 --> 00:05:13.795 346 | 別のモデルが異なるパターンを利用するので 347 | 、まあ、通常それは 348 | 349 | 73 350 | 00:05:13.795 --> 00:05:19.610 351 | データでは、我々は1つの強大なモデルでは 352 | 、このパターンのすべてを団結したい。 353 | 354 | 74 355 | 00:05:19.610 --> 00:05:22.930 356 | そして積み重ねは私達のためのそれを丁度す 357 | ることができる。 358 | 359 | 75 360 | 00:05:22.930 --> 00:05:24.970 361 | これはあまりにも複雑に見えるかもしれませ 362 | ん。 363 | 364 | 76 365 | 00:05:24.970 --> 00:05:29.480 366 | もちろん、それは競争の中でスキームのこの 367 | 種の上に移動するのに時間がかかります。 368 | 369 | 77 370 | 00:05:29.480 --> 00:05:32.630 371 | しかし、私たちのコースを完了した後、必ず 372 | 373 | 78 374 | 00:05:32.630 --> 00:05:37.310 375 | あなたはすでにこれを行う方法について十分 376 | な知識を持っている。 377 | 378 | 79 379 | 00:05:37.310 --> 00:05:41.580 380 | これらのスキームは、コンペティションの開 381 | 始時に最終的な形状には表示されません。 382 | 383 | 80 384 | 00:05:41.580 --> 00:05:44.800 385 | ほとんどの仕事は、通常、最初のレベルで行 386 | われます。 387 | 388 | 81 389 | 00:05:44.800 --> 00:05:51.770 390 | だから、多様なメタ機能を作成しようとする 391 | と、1つのシンプルモデルでそれらを団結。 392 | 393 | 82 394 | 00:05:51.770 --> 00:05:56.880 395 | 通常、あなたは、スタッキングの高品位第二 396 | レベルを作成し始める 397 | 398 | 83 399 | 00:05:56.880 --> 00:05:59.362 400 | 数日しか残っていないとき。 401 | 402 | 84 403 | 00:05:59.362 --> 00:06:04.480 404 | そして、その後、主にこのスキームの改善に 405 | 取り組んでいます。 406 | 407 | 85 408 | 00:06:04.480 --> 00:06:08.262 409 | そうは言っても、あなたはすでに必要な知識 410 | と 411 | 412 | 86 413 | 00:06:08.262 --> 00:06:11.570 414 | 今、あなたはちょうどそこにいくつかの練習 415 | を取得する必要があります。 416 | 417 | 87 418 | 00:06:11.570 --> 00:06:16.594 419 | 勤勉であり、疑いもなく、あなたは成功しま 420 | す。 421 | 422 | 88 423 | 00:06:16.594 --> 00:06:18.964 424 | 音 425 | 426 | 89 427 | 00:06:18.964 --> 00:06:28.964 428 | 音楽 429 | 430 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_5/2_Springleaf Marketing Response.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, 
Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_5/2_Springleaf Marketing Response.vtt: -------------------------------------------------------------------------------- 1 | WEBVTT 2 | 3 | 1 4 | 00:00:00.000 --> 00:00:03.802 5 | [MUSIC] 6 | 7 | 2 8 | 00:00:03.802 --> 00:00:08.527 9 | Hi, throughout the course, we use 10 | the Springleaf competition as a useful 11 | 12 | 3 13 | 00:00:08.527 --> 00:00:13.840 14 | example of EDA, mean encodings and 15 | features based on nearest neighbors. 16 | 17 | 4 18 | 00:00:13.840 --> 00:00:20.090 19 | Back then, we took the third place in 20 | this competition together with and. 21 | 22 | 5 23 | 00:00:20.090 --> 00:00:24.710 24 | And now in this video, I will describe 25 | the last part of our solution, 26 | 27 | 6 28 | 00:00:24.710 --> 00:00:28.470 29 | which is the usage of stacking and 30 | ensembles. 31 | 32 | 7 33 | 00:00:28.470 --> 00:00:33.349 34 | On this picture, you can see the final 35 | stacking scheme we produced on the level 36 | 37 | 8 38 | 00:00:33.349 --> 00:00:38.070 39 | 0 features, on the first level, 40 | predictions by basic models. 41 | 42 | 9 43 | 00:00:38.070 --> 00:00:40.110 44 | On the level one plus combination. 45 | 46 | 10 47 | 00:00:40.110 --> 00:00:44.270 48 | So these predictions and 49 | some accurately chosen features 50 | 51 | 11 52 | 00:00:44.270 --> 00:00:47.880 53 | on the second level models 54 | on this new set of features. 55 | 56 | 12 57 | 00:00:47.880 --> 00:00:52.290 58 | And finally, on the third level, 59 | their linear combination. 
60 | 61 | 13 62 | 00:00:52.290 --> 00:00:52.910 63 | In this video, 64 | 65 | 14 66 | 00:00:52.910 --> 00:00:58.301 67 | we will go through each level as it builds 68 | up to this non-trivial ensembled scheme. 69 | 70 | 15 71 | 00:00:59.430 --> 00:01:03.590 72 | But first, let's quickly remind 73 | ourselves about the problem. 74 | 75 | 16 76 | 00:01:03.590 --> 00:01:07.428 77 | This was a binary classification 78 | task with area under curve metric. 79 | 80 | 17 81 | 00:01:07.428 --> 00:01:15.270 82 | We had 145,000 samples in training data 83 | and about 2,000 anonymized features. 84 | 85 | 18 86 | 00:01:15.270 --> 00:01:19.740 87 | These were useful insights 88 | derived by us while doing EDA. 89 | 90 | 19 91 | 00:01:19.740 --> 00:01:26.080 92 | And you can check out EDA done by earlier 93 | in our course to refresh your memory. 94 | 95 | 20 96 | 00:01:26.080 --> 00:01:29.150 97 | So now let's start with features. 98 | 99 | 21 100 | 00:01:29.150 --> 00:01:33.040 101 | Here we have four packs of features. 102 | 103 | 22 104 | 00:01:33.040 --> 00:01:37.760 105 | First two are the basic dataset and 106 | the processed dataset. 107 | 108 | 23 109 | 00:01:37.760 --> 00:01:41.560 110 | To keep it simple, 111 | we just used insights derived from 112 | 113 | 24 114 | 00:01:41.560 --> 00:01:45.690 115 | EDA to clean data [INAUDIBLE] and 116 | to generate new features. 117 | 118 | 25 119 | 00:01:45.690 --> 00:01:49.280 120 | For example, 121 | we remove duplicated features and 122 | 123 | 26 124 | 00:01:49.280 --> 00:01:53.729 125 | edit some feature interaction based 126 | on scatter plots and correlations. 127 | 128 | 27 129 | 00:01:54.790 --> 00:01:59.815 130 | Then, we mean-encoded all categorical 131 | features using growth relation loop and 132 | 133 | 28 134 | 00:01:59.815 --> 00:02:02.050 135 | sign data and smoothing. 
136 | 137 | 29 138 | 00:02:02.050 --> 00:02:06.704 139 | We further used the mean-encoded dataset 140 | to calculate features based on nearest 141 | 142 | 30 143 | 00:02:06.704 --> 00:02:07.620 144 | neighbors. 145 | 146 | 31 147 | 00:02:07.620 --> 00:02:12.390 148 | Like, what is the least in 149 | closest object of the class zero? 150 | 151 | 32 152 | 00:02:12.390 --> 00:02:18.280 153 | And how many objects out of ten 154 | nearest neighbors belong to class one? 155 | 156 | 33 157 | 00:02:18.280 --> 00:02:21.119 158 | You can review how this could be done in 159 | 160 | 34 161 | 00:02:21.119 --> 00:02:24.677 162 | related topics introduced 163 | by Dmitri Altihof. 164 | 165 | 35 166 | 00:02:24.677 --> 00:02:31.469 167 | So finally, these four packs of 168 | feature were level 0 of our solution. 169 | 170 | 36 171 | 00:02:31.469 --> 00:02:35.952 172 | And the second level was represented 173 | by several different gradient within 174 | 175 | 37 176 | 00:02:35.952 --> 00:02:39.570 177 | decision tree models, 178 | and one neural network. 179 | 180 | 38 181 | 00:02:39.570 --> 00:02:44.120 182 | The main idea here is that meta 183 | features should be diverse. 184 | 185 | 39 186 | 00:02:44.120 --> 00:02:48.340 187 | Each meta feature should bring 188 | new information about the target. 189 | 190 | 40 191 | 00:02:48.340 --> 00:02:55.714 192 | So we use both distinct parameters and 193 | different sets of features for our models. 194 | 195 | 41 196 | 00:02:55.714 --> 00:03:00.449 197 | For the neural network, we additionally 198 | pre-processed features with 199 | 200 | 42 201 | 00:03:00.449 --> 00:03:04.600 202 | common scalars, ranks and 203 | power transformation. 204 | 205 | 43 206 | 00:03:04.600 --> 00:03:10.960 207 | The problem there was in huge outliers 208 | which skew network training results. 209 | 210 | 44 211 | 00:03:10.960 --> 00:03:15.120 212 | So ranks and power transformation 213 | helped to handle this problem. 
214 | 215 | 45 216 | 00:03:16.566 --> 00:03:19.990 217 | After producing meta features who is 218 | gradual in boosting decision to it and 219 | 220 | 46 221 | 00:03:19.990 --> 00:03:21.230 222 | neural networks, 223 | 224 | 47 225 | 00:03:21.230 --> 00:03:26.280 226 | we calculated pay rise differences 227 | on them to help next level models. 228 | 229 | 48 230 | 00:03:26.280 --> 00:03:30.570 231 | Note that this is also an interesting 232 | trick to force the model 233 | 234 | 49 235 | 00:03:30.570 --> 00:03:35.290 236 | to utilize the differences in 237 | the first level models predictions. 238 | 239 | 50 240 | 00:03:35.290 --> 00:03:40.370 241 | Here we edit two datasets of 242 | features based on nearest neighbors. 243 | 244 | 51 245 | 00:03:40.370 --> 00:03:45.380 246 | One was taken directly from level 0 and 247 | they contain the same features. 248 | 249 | 52 250 | 00:03:45.380 --> 00:03:51.040 251 | But it was calculated on the mean-encoded 252 | dataset to the power of one-half. 253 | 254 | 53 255 | 00:03:51.040 --> 00:03:55.704 256 | The point here was that these features 257 | were not completely utilized by 258 | 259 | 54 260 | 00:03:55.704 --> 00:03:57.370 261 | the first level models. 262 | 263 | 55 264 | 00:03:57.370 --> 00:04:02.690 265 | And indeed, they brought new pieces 266 | of information to this level. 267 | 268 | 56 269 | 00:04:03.740 --> 00:04:08.850 270 | Now we already have autofold 271 | predictions from the first level and 272 | 273 | 57 274 | 00:04:08.850 --> 00:04:11.110 275 | we will train with the models on them. 276 | 277 | 58 278 | 00:04:11.110 --> 00:04:16.230 279 | Because we could have target leakage 280 | here because of other folk, and 281 | 282 | 59 283 | 00:04:16.230 --> 00:04:18.610 284 | also because features not very good and 285 | 286 | 60 287 | 00:04:18.610 --> 00:04:23.600 288 | there are almost no patterns left 289 | in the data for models to discover. 
290 | 291 | 61 292 | 00:04:23.600 --> 00:04:29.160 293 | We chose simple classifiers, keeping in 294 | mind that predictions should be diverse. 295 | 296 | 62 297 | 00:04:29.160 --> 00:04:31.670 298 | We used four different models. 299 | 300 | 63 301 | 00:04:31.670 --> 00:04:34.780 302 | Gradient boosted decision tree, 303 | neural networks, 304 | 305 | 64 306 | 00:04:34.780 --> 00:04:38.710 307 | random forest and logistic regression. 308 | 309 | 65 310 | 00:04:38.710 --> 00:04:42.289 311 | So this is all with 312 | the second level models. 313 | 314 | 66 315 | 00:04:43.340 --> 00:04:48.410 316 | And finally, we took a linear in your 317 | combination of the second level models. 318 | 319 | 67 320 | 00:04:48.410 --> 00:04:54.770 321 | Because a linear model is not inclined 322 | to that we estimated coefficients 323 | 324 | 68 325 | 00:04:54.770 --> 00:04:59.890 326 | directly using these four predictions and 327 | our target for throwing in data. 328 | 329 | 69 330 | 00:04:59.890 --> 00:05:01.450 331 | So, this is it. 332 | 333 | 70 334 | 00:05:01.450 --> 00:05:06.244 335 | We just went through each level of this 336 | stacking scheme and then the student. 337 | 338 | 71 339 | 00:05:06.244 --> 00:05:08.390 340 | Why we need this kind of complexity? 341 | 342 | 72 343 | 00:05:08.390 --> 00:05:13.795 344 | Well, usually it's because different 345 | models utilize different patterns 346 | 347 | 73 348 | 00:05:13.795 --> 00:05:19.610 349 | in the data and we want to unite all 350 | of this patterns in one mighty model. 351 | 352 | 74 353 | 00:05:19.610 --> 00:05:22.930 354 | And stacking can do exactly that for us. 355 | 356 | 75 357 | 00:05:22.930 --> 00:05:24.970 358 | This may seem too complicated. 359 | 360 | 76 361 | 00:05:24.970 --> 00:05:29.480 362 | Of course, it takes time to move up to 363 | this kind of scheme in a competition. 
364 | 365 | 77 366 | 00:05:29.480 --> 00:05:32.630 367 | But be sure that after 368 | completion our course, 369 | 370 | 78 371 | 00:05:32.630 --> 00:05:37.310 372 | you already have enough 373 | knowledge about how to do this. 374 | 375 | 79 376 | 00:05:37.310 --> 00:05:41.580 377 | These schemes never appear in the final 378 | shape at the beginning of the competition. 379 | 380 | 80 381 | 00:05:41.580 --> 00:05:44.800 382 | Most work here usually is 383 | done on the first level. 384 | 385 | 81 386 | 00:05:44.800 --> 00:05:51.770 387 | So you try to create diverse meta features 388 | and unite them in one simple model. 389 | 390 | 82 391 | 00:05:51.770 --> 00:05:56.880 392 | Usually, you start to create the high 393 | grade second level of stacking, 394 | 395 | 83 396 | 00:05:56.880 --> 00:05:59.362 397 | when you have only a few days left. 398 | 399 | 84 400 | 00:05:59.362 --> 00:06:04.480 401 | And after that, you mostly work on 402 | the improvement of this scheme. 403 | 404 | 85 405 | 00:06:04.480 --> 00:06:08.262 406 | That said, you already have 407 | the required knowledge and 408 | 409 | 86 410 | 00:06:08.262 --> 00:06:11.570 411 | now you just need to get 412 | some practice out there. 413 | 414 | 87 415 | 00:06:11.570 --> 00:06:16.594 416 | Be diligent, and without a doubt, 417 | you will succeed. 
418 | 419 | 88 420 | 00:06:16.594 --> 00:06:18.964 421 | [SOUND] 422 | 423 | 89 424 | 00:06:18.964 --> 00:06:28.964 425 | [MUSIC] -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_5/3_Microsoft Malware Classification Challenge.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_5/4_Walmart Trip Type Classification.srt: -------------------------------------------------------------------------------- 1 | 1 2 | 00:00:03.010 --> 00:00:05.620 3 | こんにちは。このビデオでは、 4 | 5 | 2 6 | 00:00:05.620 --> 00:00:06.845 7 | 私は話をするつもりです 8 | 9 | 3 10 | 00:00:06.845 --> 00:00:12.380 11 | 年前の Kaggle カップルで開催され 12 | たウォルマートの旅行型分類の挑戦。 13 | 14 | 4 15 | 00:00:12.380 --> 00:00:14.795 16 | 私はその競技で1位を獲得した。 17 | 18 | 5 19 | 00:00:14.795 --> 00:00:17.990 20 | そして今、私はの最も興味深い部分について 21 | 教えてくれます 22 | 23 | 6 24 | 00:00:17.990 --> 00:00:21.704 25 | 問題と私の解決策について。 26 | 27 | 7 28 | 00:00:21.704 --> 00:00:25.814 29 | つまり、このプレゼンテーションは4つの部 30 | 分で構成されています。 31 | 32 | 8 33 | 00:00:25.814 --> 00:00:28.100 34 | まず、問題を述べます。 35 | 36 | 9 37 | 00:00:28.100 --> 00:00:31.699 38 | 第二に、我々はどのようなデータフォーマッ 39 | トとデータの再処理を理解する。 40 | 41 | 10 42 | 00:00:31.699 --> 00:00:35.570 43 | 第3に、モデルについてお話します。 44 | 45 | 11 46 | 00:00:35.570 --> 00:00:39.665 47 | その相対的な品質と一般のカシメスキームと 48 | の関係。 49 | 50 | 12 51 | 00:00:39.665 --> 00:00:44.285 52 | そして最後に、我々はここに新機能を生成す 53 | るいくつかの可能性を概説します。 54 | 55 | 13 56 | 00:00:44.285 --> 00:00:46.605 
57 | では、始めましょう。 58 | 59 | 14 60 | 00:00:46.605 --> 00:00:52.400 61 | 私たちのデータでは、我々はウォルマートで 62 | 2週間で自分の店を訪問した購入の人々があ 63 | った 64 | 65 | 15 66 | 00:00:52.400 --> 00:00:58.620 67 | そして、我々は38の訪問旅行の種類やクラ 68 | スにそれらを分類しなければならなかった。 69 | 70 | 16 71 | 00:00:58.620 --> 00:01:01.985 72 | のは、データ内の機能を簡単に見てみましょ 73 | う。 74 | 75 | 17 76 | 00:01:01.985 --> 00:01:04.947 77 | トリップタイプ列はターゲットを表し、 78 | 79 | 18 80 | 00:01:04.947 --> 00:01:08.110 81 | 訪問番号は、結合する ID を表します。 82 | 83 | 19 84 | 00:01:08.110 --> 00:01:12.075 85 | 1つのショッピング旅行で1つの顧客によっ 86 | て行われた購入。 87 | 88 | 20 89 | 00:01:12.075 --> 00:01:15.785 90 | たとえば、訪問番号7を作った顧客は、 91 | 92 | 21 93 | 00:01:15.785 --> 00:01:18.470 94 | に配置されている2つの項目を購入 95 | 96 | 22 97 | 00:01:18.470 --> 00:01:21.890 98 | このデータフレームの3番目の行にあります 99 | 。 100 | 101 | 23 102 | 00:01:21.890 --> 00:01:26.319 103 | 同じ訪問番号を持つすべての行に同じトリッ 104 | プタイプがあることに注意してください。 105 | 106 | 24 107 | 00:01:26.319 --> 00:01:32.383 108 | 重要な瞬間は、我々は訪問番号の旅行の種類 109 | を予測する必要があることです 110 | 111 | 25 112 | 00:01:32.383 --> 00:01:35.325 113 | そして列車データの各行のためではない。 114 | 115 | 26 116 | 00:01:35.325 --> 00:01:36.920 117 | 見ての通り 118 | 119 | 27 120 | 00:01:36.920 --> 00:01:42.850 121 | 電車の中で我々は約647000を持ってい 122 | る 123 | 124 | 28 125 | 00:01:42.850 --> 00:01:49.305 126 | 行とのみ95000の訪問。 127 | 128 | 29 129 | 00:01:49.305 --> 00:01:51.085 130 | 機能に戻る, 次の機能は、平日です 131 | 132 | 30 133 | 00:01:51.085 --> 00:01:54.030 134 | 明らかに訪問の平日を表しています。 135 | 136 | 31 137 | 00:01:54.030 --> 00:01:55.980 138 | 次は UPC です。 139 | 140 | 32 141 | 00:01:55.980 --> 00:01:59.635 142 | UPC は、購入したアイテムの正確な 143 | ID です。 144 | 145 | 33 146 | 00:01:59.635 --> 00:02:01.742 147 | 次に、スキャンカウント。 148 | 149 | 34 150 | 00:02:01.742 --> 00:02:05.408 151 | スキャン数は、購入したアイテムの正確な数 152 | です。 153 | 154 | 35 155 | 00:02:05.408 --> 00:02:10.420 156 | ここでマイナス1は購入ではなく返品を表し 157 | ていることに注意してください。 158 | 159 | 36 160 | 00:02:10.420 --> 00:02:13.285 161 | 次の機能、部門の説明、 162 | 163 | 37 164 | 00:02:13.285 --> 00:02:18.330 165 | と68ユニークな値は、項目のための広いカ 166 | テゴリです。 167 | 168 | 
38 169 | 00:02:18.330 --> 00:02:20.307 170 | そして最後に、fineline 番号、 171 | 172 | 39 173 | 00:02:20.307 --> 00:02:22.670 174 | 約5000のユニークな値で、 175 | 176 | 40 177 | 00:02:22.670 --> 00:02:25.950 178 | は、アイテムのより洗練されたカテゴリです 179 | 。 180 | 181 | 41 182 | 00:02:25.950 --> 00:02:29.095 183 | この機能が何を表しているかを理解した後、 184 | 185 | 42 186 | 00:02:29.095 --> 00:02:33.655 187 | 訪問番号ごとに1つの予測を行う必要がある 188 | ことを思い出してみましょう。 189 | 190 | 43 191 | 00:02:33.655 --> 00:02:37.255 192 | のは、訪問番号8のデータを見てみましょう 193 | 。 194 | 195 | 44 196 | 00:02:37.255 --> 00:02:39.565 197 | 我々はここを見ることができます 198 | 199 | 45 200 | 00:02:39.565 --> 00:02:44.315 201 | この特定の訪問は、カテゴリの塗料やアクセ 202 | サリーの購入がたくさんある 203 | 204 | 46 205 | 00:02:44.315 --> 00:02:47.920 206 | つまり、トリップタイプ番号26は、 207 | 208 | 47 209 | 00:02:47.920 --> 00:02:52.360 210 | そのカテゴリ内のほとんどの購入を訪問を表 211 | します。 212 | 213 | 48 214 | 00:02:52.360 --> 00:02:55.465 215 | さて、ここで鉄道模型にアプローチする方法 216 | 。 217 | 218 | 49 219 | 00:02:55.465 --> 00:02:59.525 220 | データをもう一度見て、可能性を評価してみ 221 | ましょう。 222 | 223 | 50 224 | 00:02:59.525 --> 00:03:05.027 225 | 我々は、リスト上の各項目のトリップタイプ 226 | を予測するか、我々は別の方法を選択する必 227 | 要がありますか? 
228 | 229 | 51 230 | 00:03:05.027 --> 00:03:07.637 231 | もちろん二人とも可能ですが、 232 | 233 | 52 234 | 00:03:07.637 --> 00:03:09.095 235 | でも最初のうちは 236 | 237 | 53 238 | 00:03:09.095 --> 00:03:13.428 239 | 各データセットを使用して各行のトリップタ 240 | イプを予測し、 241 | 242 | 54 243 | 00:03:13.428 --> 00:03:18.170 244 | 私達は同じ訪問に属する項目間の重要な相互 245 | 作用を逃す。 246 | 247 | 55 248 | 00:03:18.170 --> 00:03:22.223 249 | 例えば、トリップタイプは、26の数がある 250 | かもしれませんが、 251 | 252 | 56 253 | 00:03:22.223 --> 00:03:27.309 254 | その項目の半分以上の場合は、塗料やアクセ 255 | サリーからです。 256 | 257 | 57 258 | 00:03:27.309 --> 00:03:31.170 259 | しかし、我々はこれらの項目間の相互作用を 260 | 考慮しない場合は、 261 | 262 | 58 263 | 00:03:31.170 --> 00:03:33.580 264 | それはかなり予測するのは難しいことができ 265 | ます。 266 | 267 | 59 268 | 00:03:33.580 --> 00:03:38.155 269 | だから、訪問とメイキングですべての購入を 270 | 結合する2番目のオプション 271 | 272 | 60 273 | 00:03:38.155 --> 00:03:43.250 274 | 各行が完全な訪問を表すデータセットは、よ 275 | り合理的なようです。 276 | 277 | 61 278 | 00:03:43.250 --> 00:03:45.658 279 | と、期待できるように、 280 | 281 | 62 282 | 00:03:45.658 --> 00:03:51.375 283 | このアプローチは、競争の中でより重要な利 284 | 点につながります。 285 | 286 | 63 287 | 00:03:51.375 --> 00:03:56.330 288 | 私はあなたの目的の1つにデータ形式を変更 289 | する最も簡単な方法を示すつもりです。 290 | 291 | 64 292 | 00:03:56.330 --> 00:04:00.815 293 | 例を目的として、部署の説明機能を選択して 294 | みましょう。 295 | 296 | 65 297 | 00:04:00.815 --> 00:04:04.900 298 | まず、データフレームを訪問番号でグループ 299 | 化して、 300 | 301 | 66 302 | 00:04:04.900 --> 00:04:09.915 303 | 各部門の説明が訪問に何回存在するかを計算 304 | します。 305 | 306 | 67 307 | 00:04:09.915 --> 00:04:14.010 308 | では、最後のグループを unstack 309 | てみましょう。 310 | 311 | 68 312 | 00:04:14.010 --> 00:04:19.285 313 | 列ので、各部門の説明値の一意の列を取得し 314 | ます。 315 | 316 | 69 317 | 00:04:19.285 --> 00:04:22.210 318 | 今、これは我々が欲しかった形式です。 319 | 320 | 70 321 | 00:04:22.210 --> 00:04:27.645 322 | 各行は訪問を表し、各列はその訪問で説明さ 323 | れている機能です。 324 | 325 | 71 326 | 00:04:27.645 --> 00:04:32.710 327 | 我々は、部門の説明以外の他の機能のアプロ 328 | ーチでこのグループを使用することができま 329 | す。 330 | 331 | 72 332 | 00:04:32.710 --> 00:04:39.755 333 | また、訪問中の項目は、実際にはテキスト内 334 | の単語に非常に似ていることに注意してくだ 335 | さい。 336 
| 337 | 73 338 | 00:04:39.755 --> 00:04:44.680 339 | 我々の確認後、各機能は、ここでカウントを 340 | 表し、 341 | 342 | 74 343 | 00:04:44.680 --> 00:04:47.865 344 | だから、通常のテキストで動作するアイデア 345 | を適用することが、 346 | 347 | 75 348 | 00:04:47.865 --> 00:04:51.215 349 | たとえば、tf-idf 変換。 350 | 351 | 76 352 | 00:04:51.215 --> 00:04:55.565 353 | お察しのとおり、多くの可能性がここに出て 354 | くる。 355 | 356 | 77 357 | 00:04:55.565 --> 00:05:00.999 358 | すごい。これが行われ、我々は所望の形式で 359 | データを処理した後、 360 | 361 | 78 362 | 00:05:00.999 --> 00:05:03.100 363 | モデル選びに移りましょう。 364 | 365 | 79 366 | 00:05:03.100 --> 00:05:05.750 367 | すでに話し合ったことを踏まえて、 368 | 369 | 80 370 | 00:05:05.750 --> 00:05:08.620 371 | もし我々が大きな違いを期待する必要があり 372 | ます推測することができます 373 | 374 | 81 375 | 00:05:08.620 --> 00:05:12.485 376 | 線形モデルとツリーベースのモデルの間のス 377 | コアはここですか? 378 | 379 | 82 380 | 00:05:12.485 --> 00:05:15.715 381 | これについて少し考えてみてください。 382 | 383 | 83 384 | 00:05:15.715 --> 00:05:19.795 385 | たとえば、線形モデルが 386 | 387 | 84 388 | 00:05:19.795 --> 00:05:24.745 389 | ツリーベースのモデルと比較して実行します 390 | か?はい、あります。 391 | 392 | 85 393 | 00:05:24.745 --> 00:05:27.785 394 | 繰り返しますが、私はここでの相互作用につ 395 | いて話している。 396 | 397 | 86 398 | 00:05:27.785 --> 00:05:31.120 399 | 実際、ニューラルネットワークにおける木ベ 400 | ースのモデルは、 401 | 402 | 87 403 | 00:05:31.120 --> 00:05:36.315 404 | この非常に理由のためにこの競争の中で品質 405 | の重要な優位性。 406 | 407 | 88 408 | 00:05:36.315 --> 00:05:42.855 409 | しかし、それでも、1つは便利なメソッドの 410 | 機能をここに生成する線形モデルと TNN 411 | を使用することができます。 412 | 413 | 89 414 | 00:05:42.855 --> 00:05:46.430 415 | 彼らは相互作用を意味していないという事実 416 | にもかかわらず、 417 | 418 | 90 419 | 00:05:46.430 --> 00:05:50.230 420 | 彼らは私の一般的な賭けのスキームで貴重な 421 | 資産だった。 422 | 423 | 91 424 | 00:05:50.230 --> 00:05:54.295 425 | 私たちはここで賭けの詳細に行くことはあり 426 | ませんので、 427 | 428 | 92 429 | 00:05:54.295 --> 00:05:58.590 430 | 既に競争についての他のビデオのほとんどの 431 | 考えを覆った。 432 | 433 | 93 434 | 00:05:58.590 --> 00:06:03.165 435 | その代わりに、機能の生成について少し話し 436 | ます。 437 | 438 | 94 439 | 00:06:03.165 --> 00:06:07.798 440 | 1回の来店で購入した項目間のやりとり以外 441 | は、 442 | 443 | 95 444 | 00:06:07.798 --> 
00:06:10.975 445 | 1つは機能間の相互作用を利用することを試 446 | みることができる。 447 | 448 | 96 449 | 00:06:10.975 --> 00:06:15.290 450 | ここで面白いと予想外の結果は、 451 | 452 | 97 453 | 00:06:15.290 --> 00:06:19.993 454 | 1つの fineline 番号は複数の部 455 | 門の記述に属することができる 456 | 457 | 98 458 | 00:06:19.993 --> 00:06:23.345 459 | つまり、fineline 番号は 460 | 461 | 99 462 | 00:06:23.345 --> 00:06:28.195 463 | あなたが考えることができるように、より詳 464 | 細な部門の説明。 465 | 466 | 100 467 | 00:06:28.195 --> 00:06:33.200 468 | この相互作用を使用して、1つは彼のモデル 469 | を更に改善できる。 470 | 471 | 101 472 | 00:06:33.200 --> 00:06:35.536 473 | もう一つの興味深い特徴の生成の考え 474 | 475 | 102 476 | 00:06:35.536 --> 00:06:38.875 477 | データの時間構造に接続した。 478 | 479 | 103 480 | 00:06:38.875 --> 00:06:41.895 481 | このプロットを見てください, 482 | 483 | 104 484 | 00:06:41.895 --> 00:06:46.395 485 | これは、行番号に対する曜日機能の変更を表 486 | します。 487 | 488 | 105 489 | 00:06:46.395 --> 00:06:50.230 490 | これは、データが時間によってここに注文さ 491 | れているように見えます。 492 | 493 | 106 494 | 00:06:50.230 --> 00:06:54.210 495 | データは31日で構成されています。 496 | 497 | 107 498 | 00:06:54.210 --> 00:06:57.350 499 | しかし、列車のテストの分割時間ベースでは 500 | なかった。 501 | 502 | 108 503 | 00:06:57.350 --> 00:07:02.533 504 | そのため、データセットの日番号のような機 505 | 能を派生させることができ、 506 | 507 | 109 508 | 00:07:02.533 --> 00:07:04.940 509 | 1日の訪問数、 510 | 511 | 110 512 | 00:07:04.940 --> 00:07:08.645 513 | と一日の訪問の合計金額。 514 | 515 | 111 516 | 00:07:08.645 --> 00:07:10.970 517 | だから、これは。 518 | 519 | 112 520 | 00:07:10.970 --> 00:07:15.350 521 | 私達はちょうどこの競争の最も興味深い部分 522 | を論議した。 523 | 524 | 113 525 | 00:07:15.350 --> 00:07:18.425 526 | データ形式をより適切に変更すると、 527 | 528 | 114 529 | 00:07:18.425 --> 00:07:21.214 530 | 販売しながら機能を生成する、 531 | 532 | 115 533 | 00:07:21.214 --> 00:07:24.300 534 | 積み重ねをしながらモデルを操作する。 535 | 536 | 116 537 | 00:07:24.300 --> 00:07:28.365 538 | そして最後に、追加の機能エンジニアリング 539 | のいくつかを行う。 540 | 541 | 117 542 | 00:07:28.365 --> 00:07:32.395 543 | 挑戦自体は有用、興味深い証明した。 544 | 545 | 118 546 | 00:07:32.395 --> 00:07:38.770 547 | そして、私はそれをチェックアウトし、我々 548 | が話しているアプローチを試してみることを 549 | お勧めします。 550 | 551 | 
-------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_5/4_Walmart Trip Type Classification.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_5/5_Acquire Valued Shoppers Challenge part 1.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_5/6_Acquire Valued Shoppers Challenge part 2.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | 
# -*- coding: utf-8 -*-
"""Translate the cue text of a WebVTT subtitle file into Japanese.

Reads a ``.vtt`` file, sends each cue's English text to the Microsoft
Translator v3 REST API, and writes a SubRip ``.srt`` file next to the
input, plus a companion ``.srt.style`` file with an ASS style block
used by the local playback setup.

Usage:
    python translate.py -i path/to/subtitles.vtt
"""

import http.client, urllib.parse, uuid, json
import argparse
import sys
import textwrap

# **********************************************
# *** Update or verify the following values. ***
# **********************************************
# Replace the subscriptionKey string value with your valid subscription key.
subscriptionKey = ''
host = 'api.cognitive.microsofttranslator.com'
path = '/translate?api-version=3.0'
# Translate to Japanese
params = "&to=ja"


def translate(content):
    """POST ``content`` (a JSON-encoded request body, as bytes) to the
    Translator API and return the parsed JSON response.

    Returns the decoded response: a list with one entry per input text,
    each entry carrying a ``translations`` list.

    Raises RuntimeError when the service answers with a non-200 status,
    so callers fail with a clear message instead of a KeyError later.
    """
    headers = {
        'Ocp-Apim-Subscription-Key': subscriptionKey,
        'Content-type': 'application/json',
        'X-ClientTraceId': str(uuid.uuid4()),
    }
    conn = http.client.HTTPSConnection(host)
    try:
        conn.request("POST", path + params, content, headers)
        response = conn.getresponse()
        body = response.read()  # read fully before closing the connection
        if response.status != 200:
            raise RuntimeError(
                'translation request failed: HTTP %d %s'
                % (response.status, response.reason))
        return json.loads(body)
    finally:
        # Always release the socket; the original leaked one per cue.
        conn.close()


def _convert_vtt_to_srt(in_file, out_file):
    """Convert ``in_file`` (.vtt) into ``out_file`` (.srt), translating
    every cue's text to Japanese via :func:`translate`.

    Cues are located by their sequential number (1, 2, 3, ...) on a line
    of their own; the WEBVTT header and any metadata lines are skipped
    naturally because they never match the expected cue number.
    """
    comment_no = 1
    with open(in_file, 'r') as f, open(out_file, 'w') as out_f:
        for row in f:
            if row.strip() != str(comment_no):
                continue
            out_f.write(str(comment_no) + "\n")
            # Timestamp line: VTT uses '.' before milliseconds, the SRT
            # format requires ','.
            out_f.write(next(f).replace('.', ','))

            # Collect the cue text, which may span several lines, until a
            # blank separator line or the end of the file.
            eng_text = next(f).strip()
            try:
                work = next(f).strip()
                while work != '':
                    eng_text += " " + str(work)
                    work = next(f).strip()
            except StopIteration:
                pass  # last cue: file ended without a trailing blank line

            requestBody = [{
                'Text': eng_text,
            }]
            content = json.dumps(requestBody, ensure_ascii=False).encode('utf-8')
            result = translate(content)
            jpn_text = result[0]['translations'][0]['text']
            # Hard-wrap at 20 characters so subtitles fit on screen.
            jpn_text = "\n".join(textwrap.wrap(jpn_text, width=20))
            out_f.write(jpn_text + "\n")
            out_f.write('\n')

            comment_no = comment_no + 1


def write_style_file(style_file):
    """Write the companion ASS style definition file used for playback."""
    with open(style_file, 'w') as f:
        f.write("ScriptType: v4.00+\n")
        f.write("\n")
        f.write("[V4+ Styles]\n")
        f.write("Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding\n")
        f.write("Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1\n")


def main():
    """Parse the command line and run the VTT -> SRT translation."""
    parser = argparse.ArgumentParser(description='translate program')
    parser.add_argument('-i', '--in', action='store', dest='in_file',
                        required=True, help='translate input file')
    args = parser.parse_args()

    if not str(args.in_file).endswith('.vtt'):
        print('invalid in_file. not vtt file. file=' + str(args.in_file))
        sys.exit()

    # Swap only the trailing extension; the original str.replace() would
    # also corrupt any other occurrence of "vtt" in the path.
    out_file = str(args.in_file)[:-len('.vtt')] + '.srt'

    print("start srt file")
    _convert_vtt_to_srt(args.in_file, out_file)
    print("end srt file")

    print("start style file")
    write_style_file(out_file + ".style")
    print("end style file")


if __name__ == '__main__':
    main()
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/share/Week_1(T.Nakao).pptx -------------------------------------------------------------------------------- /cousera/share/Week_1(takagishi).pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/share/Week_1(takagishi).pdf -------------------------------------------------------------------------------- /cousera/share/Week_1(takagishi).pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/share/Week_1(takagishi).pptx -------------------------------------------------------------------------------- /cousera/share/week3_門脇_Concept of mean encoding.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/share/week3_門脇_Concept of mean encoding.pptx -------------------------------------------------------------------------------- /cousera/share/week3_門脇_Regularization.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/share/week3_門脇_Regularization.pptx -------------------------------------------------------------------------------- /wiki/cousera/3week_cls_image01.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/3week_cls_image01.jpg -------------------------------------------------------------------------------- 
/wiki/cousera/3week_cls_image02.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/3week_cls_image02.jpg -------------------------------------------------------------------------------- /wiki/cousera/3week_cls_image03.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/3week_cls_image03.jpg -------------------------------------------------------------------------------- /wiki/cousera/3week_cls_image04.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/3week_cls_image04.jpg -------------------------------------------------------------------------------- /wiki/cousera/3week_cls_image05.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/3week_cls_image05.jpg -------------------------------------------------------------------------------- /wiki/cousera/3week_cls_image06.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/3week_cls_image06.jpg -------------------------------------------------------------------------------- /wiki/cousera/3week_cls_image07.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/3week_cls_image07.jpg -------------------------------------------------------------------------------- /wiki/cousera/3week_cls_image08.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/3week_cls_image08.jpg -------------------------------------------------------------------------------- /wiki/cousera/3week_cls_image09.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/3week_cls_image09.jpg -------------------------------------------------------------------------------- /wiki/cousera/3week_cls_image10.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/3week_cls_image10.jpg -------------------------------------------------------------------------------- /wiki/cousera/3week_cls_image11.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/3week_cls_image11.jpg -------------------------------------------------------------------------------- /wiki/cousera/3week_cls_image12.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/3week_cls_image12.jpg -------------------------------------------------------------------------------- /wiki/cousera/3week_cls_image13.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/3week_cls_image13.jpg -------------------------------------------------------------------------------- /wiki/cousera/3week_image001.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/3week_image001.png -------------------------------------------------------------------------------- /wiki/cousera/3week_image004.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/3week_image004.png -------------------------------------------------------------------------------- /wiki/cousera/3week_image007.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/3week_image007.png -------------------------------------------------------------------------------- /wiki/cousera/3week_image011.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/3week_image011.png -------------------------------------------------------------------------------- /wiki/cousera/3week_image012.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/3week_image012.png -------------------------------------------------------------------------------- /wiki/cousera/3week_image022.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/3week_image022.png -------------------------------------------------------------------------------- /wiki/cousera/3week_image025.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/3week_image025.png -------------------------------------------------------------------------------- /wiki/cousera/3week_image026.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/3week_image026.png -------------------------------------------------------------------------------- /wiki/cousera/3week_image027.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/3week_image027.png -------------------------------------------------------------------------------- /wiki/cousera/3week_image028.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/3week_image028.png -------------------------------------------------------------------------------- /wiki/cousera/3week_image029.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/3week_image029.png -------------------------------------------------------------------------------- /wiki/cousera/3week_image030.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/3week_image030.png -------------------------------------------------------------------------------- /wiki/cousera/3week_mean-encoding001.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/3week_mean-encoding001.png -------------------------------------------------------------------------------- /wiki/cousera/3week_mean-encoding002.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/3week_mean-encoding002.png -------------------------------------------------------------------------------- /wiki/cousera/3week_mean-encoding003.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/3week_mean-encoding003.png -------------------------------------------------------------------------------- /wiki/cousera/3week_mean-encoding004.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/3week_mean-encoding004.png -------------------------------------------------------------------------------- /wiki/cousera/3week_mean-encoding005.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/3week_mean-encoding005.png -------------------------------------------------------------------------------- /wiki/cousera/3week_mean-encoding006.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/3week_mean-encoding006.png -------------------------------------------------------------------------------- /wiki/cousera/3week_mean-encoding007.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/3week_mean-encoding007.png -------------------------------------------------------------------------------- /wiki/cousera/3week_mean-encoding008.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/3week_mean-encoding008.png -------------------------------------------------------------------------------- /wiki/cousera/3week_mean-encoding009.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/3week_mean-encoding009.png -------------------------------------------------------------------------------- /wiki/cousera/3week_mean-encoding010.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/3week_mean-encoding010.png -------------------------------------------------------------------------------- /wiki/cousera/3week_mean-encoding011.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/3week_mean-encoding011.png -------------------------------------------------------------------------------- /wiki/cousera/3week_mean-encoding012.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/3week_mean-encoding012.png -------------------------------------------------------------------------------- 
/wiki/cousera/3week_mean-encoding013.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/3week_mean-encoding013.png -------------------------------------------------------------------------------- /wiki/cousera/3week_mean-encoding014.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/3week_mean-encoding014.png -------------------------------------------------------------------------------- /wiki/cousera/3week_mean-encoding015.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/3week_mean-encoding015.png -------------------------------------------------------------------------------- /wiki/cousera/3week_mean-encoding016.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/3week_mean-encoding016.png -------------------------------------------------------------------------------- /wiki/cousera/3week_mean-encoding017.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/3week_mean-encoding017.png -------------------------------------------------------------------------------- /wiki/cousera/3week_mean-encoding018.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/3week_mean-encoding018.png 
-------------------------------------------------------------------------------- /wiki/cousera/4week_AdvancedFeatures001.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/4week_AdvancedFeatures001.png -------------------------------------------------------------------------------- /wiki/cousera/4week_AdvancedFeatures002.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/4week_AdvancedFeatures002.png -------------------------------------------------------------------------------- /wiki/cousera/4week_AdvancedFeatures003.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/4week_AdvancedFeatures003.png -------------------------------------------------------------------------------- /wiki/cousera/4week_AdvancedFeatures004.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/4week_AdvancedFeatures004.png -------------------------------------------------------------------------------- /wiki/cousera/4week_AdvancedFeatures005.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/4week_AdvancedFeatures005.png -------------------------------------------------------------------------------- /wiki/cousera/4week_AdvancedFeatures006.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/4week_AdvancedFeatures006.png -------------------------------------------------------------------------------- /wiki/cousera/4week_AdvancedFeatures007.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/4week_AdvancedFeatures007.png -------------------------------------------------------------------------------- /wiki/cousera/4week_AdvancedFeatures008.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/4week_AdvancedFeatures008.png -------------------------------------------------------------------------------- /wiki/cousera/4week_AdvancedFeatures009.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/4week_AdvancedFeatures009.png -------------------------------------------------------------------------------- /wiki/cousera/4week_AdvancedFeatures010.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/4week_AdvancedFeatures010.png -------------------------------------------------------------------------------- /wiki/cousera/4week_Ensemble_Tips1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/4week_Ensemble_Tips1.png -------------------------------------------------------------------------------- /wiki/cousera/4week_Ensemble_Tips2.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/4week_Ensemble_Tips2.png -------------------------------------------------------------------------------- /wiki/cousera/4week_Ensemble_Tips3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/4week_Ensemble_Tips3.png -------------------------------------------------------------------------------- /wiki/cousera/4week_Ensemble_Tips4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/4week_Ensemble_Tips4.png -------------------------------------------------------------------------------- /wiki/cousera/4week_KNNfeatures/data/knn_feats_cosine_test.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/4week_KNNfeatures/data/knn_feats_cosine_test.npy -------------------------------------------------------------------------------- /wiki/cousera/4week_KNNfeatures/data/knn_feats_cosine_train.npy.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/4week_KNNfeatures/data/knn_feats_cosine_train.npy.zip -------------------------------------------------------------------------------- /wiki/cousera/4week_KNNfeatures/data/knn_feats_minkowski_test.npy: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/4week_KNNfeatures/data/knn_feats_minkowski_test.npy -------------------------------------------------------------------------------- /wiki/cousera/4week_KNNfeatures/data/knn_feats_minkowski_train.npy.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/4week_KNNfeatures/data/knn_feats_minkowski_train.npy.zip -------------------------------------------------------------------------------- /wiki/cousera/4week_NeuralNet001.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/4week_NeuralNet001.png -------------------------------------------------------------------------------- /wiki/cousera/4week_Practicalguide001.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/4week_Practicalguide001.png -------------------------------------------------------------------------------- /wiki/cousera/4week_Practicalguide002.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/4week_Practicalguide002.png -------------------------------------------------------------------------------- /wiki/cousera/4week_Practicalguide003.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/4week_Practicalguide003.png -------------------------------------------------------------------------------- 
/wiki/cousera/4week_Practicalguide004.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/4week_Practicalguide004.png -------------------------------------------------------------------------------- /wiki/cousera/4week_StackNet1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/4week_StackNet1.png -------------------------------------------------------------------------------- /wiki/cousera/4week_StackNet2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/4week_StackNet2.png -------------------------------------------------------------------------------- /wiki/cousera/4week_Stacking1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/4week_Stacking1.png -------------------------------------------------------------------------------- /wiki/cousera/4week_ensemble_bagging1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/4week_ensemble_bagging1.jpg -------------------------------------------------------------------------------- /wiki/cousera/4week_ensemble_bagging2.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/4week_ensemble_bagging2.PNG -------------------------------------------------------------------------------- 
/wiki/cousera/4week_ensemble_bagging3.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/4week_ensemble_bagging3.PNG -------------------------------------------------------------------------------- /wiki/cousera/4week_ensemble_bagging4.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/4week_ensemble_bagging4.PNG -------------------------------------------------------------------------------- /wiki/cousera/4week_ensemble_bagging5.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/4week_ensemble_bagging5.PNG -------------------------------------------------------------------------------- /wiki/cousera/4week_ensemble_boosting1.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/4week_ensemble_boosting1.PNG -------------------------------------------------------------------------------- /wiki/cousera/4week_ensemble_boosting2.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/4week_ensemble_boosting2.PNG -------------------------------------------------------------------------------- /wiki/cousera/4week_ensemble_boosting3.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/4week_ensemble_boosting3.PNG 
-------------------------------------------------------------------------------- /wiki/cousera/4week_ensemble_boosting4.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/4week_ensemble_boosting4.PNG -------------------------------------------------------------------------------- /wiki/cousera/4week_ensemble_boosting5.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/4week_ensemble_boosting5.PNG -------------------------------------------------------------------------------- /wiki/cousera/4week_ensemble_boosting6.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/4week_ensemble_boosting6.PNG -------------------------------------------------------------------------------- /wiki/cousera/4week_ensemble_boosting7.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/4week_ensemble_boosting7.PNG -------------------------------------------------------------------------------- /wiki/cousera/4week_ensemble_boosting8.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/4week_ensemble_boosting8.PNG -------------------------------------------------------------------------------- /wiki/cousera/4week_ensemble_intro1.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/4week_ensemble_intro1.png -------------------------------------------------------------------------------- /wiki/cousera/4week_ensemble_intro2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/4week_ensemble_intro2.png -------------------------------------------------------------------------------- /wiki/cousera/4week_ensemble_intro3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/4week_ensemble_intro3.png -------------------------------------------------------------------------------- /wiki/cousera/4week_ensemble_intro4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/4week_ensemble_intro4.png -------------------------------------------------------------------------------- /wiki/cousera/5week_image_tn_001.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/5week_image_tn_001.png -------------------------------------------------------------------------------- /wiki/cousera/5week_image_tn_002.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/5week_image_tn_002.png -------------------------------------------------------------------------------- /wiki/cousera/5week_image_tn_003.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/5week_image_tn_003.png -------------------------------------------------------------------------------- /wiki/cousera/5week_image_tn_004.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/5week_image_tn_004.png -------------------------------------------------------------------------------- /wiki/cousera/5week_image_tn_005.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/5week_image_tn_005.png -------------------------------------------------------------------------------- /wiki/cousera/5week_image_tn_006.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/5week_image_tn_006.png -------------------------------------------------------------------------------- /wiki/cousera/5week_image_tn_007.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/5week_image_tn_007.png -------------------------------------------------------------------------------- /wiki/cousera/5week_image_tn_008.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/5week_image_tn_008.png -------------------------------------------------------------------------------- /wiki/cousera/5week_image_tn_009.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/5week_image_tn_009.png -------------------------------------------------------------------------------- /wiki/cousera/5week_image_tn_101.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/5week_image_tn_101.png -------------------------------------------------------------------------------- /wiki/cousera/5week_image_tn_102.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/5week_image_tn_102.png -------------------------------------------------------------------------------- /wiki/cousera/5week_image_tn_103.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/5week_image_tn_103.png -------------------------------------------------------------------------------- /wiki/cousera/5week_image_tn_104.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/5week_image_tn_104.png -------------------------------------------------------------------------------- /wiki/cousera/5week_image_tn_105.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/5week_image_tn_105.png -------------------------------------------------------------------------------- /wiki/cousera/5week_image_tn_106.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/5week_image_tn_106.png -------------------------------------------------------------------------------- /wiki/cousera/5week_image_tn_107.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/5week_image_tn_107.png -------------------------------------------------------------------------------- /wiki/cousera/5week_image_tn_108.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/5week_image_tn_108.png -------------------------------------------------------------------------------- /wiki/cousera/5week_image_tn_109.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/5week_image_tn_109.png -------------------------------------------------------------------------------- /wiki/cousera/5week_image_tn_110.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/5week_image_tn_110.png -------------------------------------------------------------------------------- /wiki/cousera/5week_image_tn_111.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/5week_image_tn_111.png -------------------------------------------------------------------------------- /wiki/cousera/5week_image_tn_112.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/5week_image_tn_112.png -------------------------------------------------------------------------------- /wiki/cousera/5week_image_tn_113.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/5week_image_tn_113.png -------------------------------------------------------------------------------- /wiki/cousera/Programming assignment, week 4_ Ensembles/__pycache__/grader.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/Programming assignment, week 4_ Ensembles/__pycache__/grader.cpython-36.pyc -------------------------------------------------------------------------------- /wiki/cousera/Programming assignment, week 4_ Ensembles/grader.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | import numpy as np 4 | from collections import OrderedDict 5 | 6 | def array_to_hash(x): 7 | x_tupled = None 8 | if type(x) == list: 9 | x_tupled = tuple(x) 10 | elif type(x) == np.ndarray: 11 | x_tupled = tuple(list(x.flatten())) 12 | elif type(x) == tuple: 13 | x_tupled = x 14 | else: 15 | raise RuntimeError('unexpected type of input: {}'.format(type(x))) 16 | return hash(tuple(map(float, x_tupled))) 17 | 18 | def almostEqual(x, y): 19 | return abs(x - y) < 1e-5 20 | 21 | 22 | class Grader(object): 23 | def __init__(self): 24 | self.submission_page = 'https://www.coursera.org/api/onDemandProgrammingScriptSubmissions.v1' 25 | self.assignment_key = 'Lhay-55JEeet3xIBvGMumA' 26 | self.parts = OrderedDict([ 27 | ('EyiFH', 'best_alpha'), 28 | ('XH82R', 'r2_train_simple_mix'), 29 | ('BHeRs', 'r2_test_simple_mix'), 30 | ('MkwCS', 
'r2_train_stacking'), 31 | ('j4Adb', 'r2_test_stacking'), 32 | ]) 33 | self.answers = {key: None for key in self.parts} 34 | 35 | @staticmethod 36 | def ravel_output(output): 37 | ''' 38 | If student accedentally submitted np.array with one 39 | element instead of number, this function will submit 40 | this number instead 41 | ''' 42 | if isinstance(output, np.ndarray) and output.size == 1: 43 | output = output.item(0) 44 | return output 45 | 46 | def submit(self, email, token): 47 | submission = { 48 | "assignmentKey": self.assignment_key, 49 | "submitterEmail": email, 50 | "secret": token, 51 | "parts": {} 52 | } 53 | for part, output in self.answers.items(): 54 | if output is not None: 55 | submission["parts"][part] = {"output": output} 56 | else: 57 | submission["parts"][part] = dict() 58 | request = requests.post(self.submission_page, data=json.dumps(submission)) 59 | response = request.json() 60 | if request.status_code == 201: 61 | print('Submitted to Coursera platform. See results on assignment page!') 62 | elif u'details' in response and u'learnerMessage' in response[u'details']: 63 | print(response[u'details'][u'learnerMessage']) 64 | else: 65 | print("Unknown response from Coursera: {}".format(request.status_code)) 66 | print(response) 67 | 68 | def status(self): 69 | print("You want to submit these numbers:") 70 | for part_id, part_name in self.parts.items(): 71 | answer = self.answers[part_id] 72 | if answer is None: 73 | answer = '-'*10 74 | print("Task {}: {}".format(part_name, answer)) 75 | 76 | def submit_part(self, part, output): 77 | self.answers[part] = output 78 | print("Current answer for task {} is: {}".format(self.parts[part], output)) 79 | 80 | def submit_tag(self, tag, output): 81 | part_id = [k for k, v in self.parts.items() if v == tag] 82 | if len(part_id)!=1: 83 | raise RuntimeError('cannot match tag with part_id: found {} matches'.format(len(part_id))) 84 | part_id = part_id[0] 85 | self.submit_part(part_id, 
str(self.ravel_output(output))) -------------------------------------------------------------------------------- /wiki/cousera/clone1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/clone1.png -------------------------------------------------------------------------------- /wiki/cousera/translate_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/translate_1.png -------------------------------------------------------------------------------- /wiki/cousera/week1_program_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/week1_program_plot.png -------------------------------------------------------------------------------- /wiki/cousera/week5_cf1.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/week5_cf1.PNG -------------------------------------------------------------------------------- /wiki/cousera/week5_cf2.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/week5_cf2.PNG -------------------------------------------------------------------------------- /wiki/cousera/week5_cf3.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/week5_cf3.PNG -------------------------------------------------------------------------------- 
/wiki/cousera/week5_cf4.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/week5_cf4.PNG -------------------------------------------------------------------------------- /wiki/cousera/week5_cf5.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/week5_cf5.PNG -------------------------------------------------------------------------------- /wiki/cousera/week5_cf6.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/week5_cf6.PNG -------------------------------------------------------------------------------- /wiki/cousera/week5_cf7.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/week5_cf7.PNG -------------------------------------------------------------------------------- /wiki/cousera/week5_mm1.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/week5_mm1.PNG -------------------------------------------------------------------------------- /wiki/cousera/week5_mm2.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/week5_mm2.PNG -------------------------------------------------------------------------------- /wiki/cousera/week5_sl1.PNG: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/week5_sl1.PNG --------------------------------------------------------------------------------