├── .gitignore ├── README.md ├── cousera ├── howtowin_kaggle │ ├── week_1 │ │ ├── 1_1_introduction.srt │ │ ├── 1_1_introduction.srt.style │ │ ├── 1_1_introduction.vtt │ │ ├── 1_2_meet_your_lecturers.srt │ │ ├── 1_2_meet_your_lecturers.srt.style │ │ ├── 1_2_meet_your_lecturers.vtt │ │ ├── 1_3_cource_overview.srt │ │ ├── 1_3_cource_overview.srt.style │ │ ├── 1_3_cource_overview.vtt │ │ ├── 2_1_competition_mechanics.srt │ │ ├── 2_1_competition_mechanics.srt.style │ │ ├── 2_1_competition_mechanics.vtt │ │ ├── 2_2_kaggle_overview.srt │ │ ├── 2_2_kaggle_overview.srt.style │ │ ├── 2_2_kaggle_overview.vtt │ │ ├── 2_3_real_world.srt │ │ ├── 2_3_real_world.srt.style │ │ ├── 2_3_real_world.vtt │ │ ├── 3_1_recap.srt │ │ ├── 3_1_recap.srt.style │ │ ├── 3_1_recap.vtt │ │ ├── 4_1_Software_Hardware_Requirements.srt │ │ ├── 4_1_Software_Hardware_Requirements.srt.style │ │ ├── 4_1_Software_Hardware_Requirements.vtt │ │ ├── 5_1_overview.srt │ │ ├── 5_1_overview.srt.style │ │ ├── 5_1_overview.vtt │ │ ├── 5_2_numeric_feature.srt │ │ ├── 5_2_numeric_feature.srt.style │ │ ├── 5_2_numeric_feature.vtt │ │ ├── 5_3_categorical_and_ordinal.srt │ │ ├── 5_3_categorical_and_ordinal.srt.style │ │ ├── 5_3_categorical_and_ordinal.vtt │ │ ├── 5_4_datetime_and_coordinates.srt │ │ ├── 5_4_datetime_and_coordinates.srt.style │ │ ├── 5_4_datetime_and_coordinates.vtt │ │ ├── 5_5_handling_missing_values.srt │ │ ├── 5_5_handling_missing_values.srt.style │ │ ├── 5_5_handling_missing_values.vtt │ │ ├── 6_1_bag_of_words.srt │ │ ├── 6_1_bag_of_words.srt.style │ │ ├── 6_1_bag_of_words.vtt │ │ ├── 6_2_word2vec.srt │ │ ├── 6_2_word2vec.srt.style │ │ ├── 6_2_word2vec.vtt │ │ ├── 7_1_final_project_overview.srt │ │ ├── 7_1_final_project_overview.srt.style │ │ ├── 7_1_final_project_overview.vtt │ │ └── ここにmp4をおいて再生してね │ ├── week_2 │ │ ├── 1_1_Exploratory_data analysis.srt │ │ ├── 1_1_Exploratory_data analysis.srt.style │ │ ├── 1_1_Exploratory_data analysis.vtt │ │ ├── 1_2_Building_intuition_about_the_data.srt │ │ ├── 
1_2_Building_intuition_about_the_data.srt.style │ │ ├── 1_2_Building_intuition_about_the_data.vtt │ │ ├── 1_3_Exploring_anonymized_data.srt │ │ ├── 1_3_Exploring_anonymized_data.srt.style │ │ ├── 1_3_Exploring_anonymized_data.vtt │ │ ├── 1_4_Visualizations.srt │ │ ├── 1_4_Visualizations.srt.style │ │ ├── 1_4_Visualizations.vtt │ │ ├── 1_5_Dataset_cleaning_and_other_things_to_check.srt │ │ ├── 1_5_Dataset_cleaning_and_other_things_to_check.srt.style │ │ ├── 1_5_Dataset_cleaning_and_other_things_to_check.vtt │ │ ├── 1_6_Springleaf_competition_EDA_I.srt │ │ ├── 1_6_Springleaf_competition_EDA_I.srt.style │ │ ├── 1_6_Springleaf_competition_EDA_I.vtt │ │ ├── 1_7_Springleaf_competition_EDA_II.srt │ │ ├── 1_7_Springleaf_competition_EDA_II.srt.style │ │ ├── 1_7_Springleaf_competition_EDA_II.vtt │ │ ├── 1_8_Numerai_competition_EDA.srt │ │ ├── 1_8_Numerai_competition_EDA.srt.style │ │ ├── 1_8_Numerai_competition_EDA.vtt │ │ ├── 2_1_Validation and overfitting.srt │ │ ├── 2_1_Validation and overfitting.srt.style │ │ ├── 2_1_Validation and overfitting.vtt │ │ ├── 2_2_Validation strategies.srt │ │ ├── 2_2_Validation strategies.srt.style │ │ ├── 2_2_Validation strategies.vtt │ │ ├── 2_3_Data splitting strategies.srt │ │ ├── 2_3_Data splitting strategies.srt.style │ │ ├── 2_3_Data splitting strategies.vtt │ │ ├── 2_4_Problems occurring during validation.srt │ │ ├── 2_4_Problems occurring during validation.srt.style │ │ ├── 2_4_Problems occurring during validation.vtt │ │ ├── 3_1_Basic data leaks.srt │ │ ├── 3_1_Basic data leaks.srt.style │ │ ├── 3_1_Basic data leaks.vtt │ │ ├── 3_2_Leaderboard probing and examples of rare data leaks.srt │ │ ├── 3_2_Leaderboard probing and examples of rare data leaks.srt.style │ │ ├── 3_2_Leaderboard probing and examples of rare data leaks.vtt │ │ ├── 3_3_Expedia challenge.srt │ │ ├── 3_3_Expedia challenge.srt.style │ │ └── 3_3_Expedia challenge.vtt │ ├── week_3 │ │ ├── 1_1_Motivation.srt │ │ ├── 1_1_Motivation.srt.style │ │ ├── 1_1_Motivation.vtt │ 
│ ├── 1_2_Regression_metrics_review1.srt │ │ ├── 1_2_Regression_metrics_review1.srt.style │ │ ├── 1_2_Regression_metrics_review1.vtt │ │ ├── 1_3_Regression_metrics_review2.srt │ │ ├── 1_3_Regression_metrics_review2.srt.style │ │ ├── 1_3_Regression_metrics_review2.vtt │ │ ├── 1_4_Classification_metrics_review.srt │ │ ├── 1_4_Classification_metrics_review.srt.style │ │ ├── 1_4_Classification_metrics_review.vtt │ │ ├── 1_5_General_approaches.srt │ │ ├── 1_5_General_approaches.srt.style │ │ ├── 1_5_General_approaches.vtt │ │ ├── 1_6_Regression_metrics_optimization.srt │ │ ├── 1_6_Regression_metrics_optimization.srt.style │ │ ├── 1_6_Regression_metrics_optimization.vtt │ │ ├── 1_7_Classification_metrics_optimization_1.srt │ │ ├── 1_7_Classification_metrics_optimization_1.srt.style │ │ ├── 1_7_Classification_metrics_optimization_1.vtt │ │ ├── 1_8_Classification_metrics_optimization_2.srt │ │ ├── 1_8_Classification_metrics_optimization_2.srt.style │ │ ├── 1_8_Classification_metrics_optimization_2.vtt │ │ ├── 2_1_Concept_of_mean_encoding.srt │ │ ├── 2_1_Concept_of_mean_encoding.srt.style │ │ ├── 2_1_Concept_of_mean_encoding.vtt │ │ ├── 2_2_Regularization.srt │ │ ├── 2_2_Regularization.srt.style │ │ ├── 2_2_Regularization.vtt │ │ ├── 2_3_Extensions_and_generalizations.srt │ │ ├── 2_3_Extensions_and_generalizations.srt.style │ │ └── 2_3_Extensions_and_generalizations.vtt │ ├── week_4 │ │ ├── 1_1_Hyperparameter_tuning_1.srt │ │ ├── 1_1_Hyperparameter_tuning_1.srt.style │ │ ├── 1_1_Hyperparameter_tuning_1.vtt │ │ ├── 1_2_Hyperparameter_tuning_2.srt │ │ ├── 1_2_Hyperparameter_tuning_2.srt.style │ │ ├── 1_2_Hyperparameter_tuning_2.vtt │ │ ├── 1_3_Hyperparameter_tuning_3.srt │ │ ├── 1_3_Hyperparameter_tuning_3.srt.style │ │ ├── 1_3_Hyperparameter_tuning_3.vtt │ │ ├── 1_4_Practical_guide.srt │ │ ├── 1_4_Practical_guide.srt.style │ │ ├── 1_4_Practical_guide.vtt │ │ ├── 1_5_KazAnova's competition pipeline, part 1.srt │ │ ├── 1_5_KazAnova's competition pipeline, part 1.srt.style │ │ 
├── 1_5_KazAnova's competition pipeline, part 1.vtt │ │ ├── 1_6_KazAnova's competition pipeline, part 2.srt │ │ ├── 1_6_KazAnova's competition pipeline, part 2.srt.style │ │ ├── 1_6_KazAnova's competition pipeline, part 2.vtt │ │ ├── 2_1_Statistics and distance based features.srt │ │ ├── 2_1_Statistics and distance based features.srt.style │ │ ├── 2_1_Statistics and distance based features.vtt │ │ ├── 2_2_Matrix factorizations.srt │ │ ├── 2_2_Matrix factorizations.srt.style │ │ ├── 2_2_Matrix factorizations.vtt │ │ ├── 2_3_Feature Interactions.srt │ │ ├── 2_3_Feature Interactions.srt.style │ │ ├── 2_3_Feature Interactions.vtt │ │ ├── 2_4_t-SNE.srt │ │ ├── 2_4_t-SNE.srt.style │ │ ├── 2_4_t-SNE.vtt │ │ ├── 3_1_Introduction into ensemble methods.srt │ │ ├── 3_1_Introduction into ensemble methods.srt.style │ │ ├── 3_1_Introduction into ensemble methods.vtt │ │ ├── 3_2_Bagging.srt │ │ ├── 3_2_Bagging.srt.style │ │ ├── 3_2_Bagging.vtt │ │ ├── 3_3_Boosting.srt │ │ ├── 3_3_Boosting.srt.style │ │ ├── 3_3_Boosting.vtt │ │ ├── 3_4_Stacking.srt │ │ ├── 3_4_Stacking.srt.style │ │ ├── 3_4_Stacking.vtt │ │ ├── 3_5_StackNet.srt │ │ ├── 3_5_StackNet.srt.style │ │ ├── 3_5_StackNet.vtt │ │ ├── 3_6_Ensembling Tips and Tricks.srt │ │ ├── 3_6_Ensembling Tips and Tricks.srt.style │ │ └── 3_6_Ensembling Tips and Tricks.vtt │ └── week_5 │ │ ├── 1_Crowdflower Competition.srt │ │ ├── 1_Crowdflower Competition.srt.style │ │ ├── 1_Crowdflower Competition.vtt │ │ ├── 1_Crowdflower\ Competition.srt │ │ ├── 2_Springleaf Marketing Response.srt │ │ ├── 2_Springleaf Marketing Response.srt.style │ │ ├── 2_Springleaf Marketing Response.vtt │ │ ├── 3_Microsoft Malware Classification Challenge.srt │ │ ├── 3_Microsoft Malware Classification Challenge.srt.style │ │ ├── 3_Microsoft Malware Classification Challenge.vtt │ │ ├── 4_Walmart Trip Type Classification.srt │ │ ├── 4_Walmart Trip Type Classification.srt.style │ │ ├── 4_Walmart Trip Type Classification.vtt │ │ ├── 5_Acquire Valued Shoppers Challenge 
part 1.srt │ │ ├── 5_Acquire Valued Shoppers Challenge part 1.srt.style │ │ ├── 5_Acquire Valued Shoppers Challenge part 1.vtt │ │ ├── 6_Acquire Valued Shoppers Challenge part 2.srt │ │ ├── 6_Acquire Valued Shoppers Challenge part 2.srt.style │ │ └── 6_Acquire Valued Shoppers Challenge part 2.vtt ├── script │ └── translate.py └── share │ ├── Week1_(T.Shimano).pdf │ ├── Week_1(T.Nakao).pdf │ ├── Week_1(T.Nakao).pptx │ ├── Week_1(takagishi).pdf │ ├── Week_1(takagishi).pptx │ ├── week3_門脇_Concept of mean encoding.pptx │ └── week3_門脇_Regularization.pptx └── wiki └── cousera ├── 3week_cls_image01.jpg ├── 3week_cls_image02.jpg ├── 3week_cls_image03.jpg ├── 3week_cls_image04.jpg ├── 3week_cls_image05.jpg ├── 3week_cls_image06.jpg ├── 3week_cls_image07.jpg ├── 3week_cls_image08.jpg ├── 3week_cls_image09.jpg ├── 3week_cls_image10.jpg ├── 3week_cls_image11.jpg ├── 3week_cls_image12.jpg ├── 3week_cls_image13.jpg ├── 3week_image001.png ├── 3week_image004.png ├── 3week_image007.png ├── 3week_image011.png ├── 3week_image012.png ├── 3week_image022.png ├── 3week_image025.png ├── 3week_image026.png ├── 3week_image027.png ├── 3week_image028.png ├── 3week_image029.png ├── 3week_image030.png ├── 3week_mean-encoding001.png ├── 3week_mean-encoding002.png ├── 3week_mean-encoding003.png ├── 3week_mean-encoding004.png ├── 3week_mean-encoding005.png ├── 3week_mean-encoding006.png ├── 3week_mean-encoding007.png ├── 3week_mean-encoding008.png ├── 3week_mean-encoding009.png ├── 3week_mean-encoding010.png ├── 3week_mean-encoding011.png ├── 3week_mean-encoding012.png ├── 3week_mean-encoding013.png ├── 3week_mean-encoding014.png ├── 3week_mean-encoding015.png ├── 3week_mean-encoding016.png ├── 3week_mean-encoding017.png ├── 3week_mean-encoding018.png ├── 4week_AdvancedFeatures001.png ├── 4week_AdvancedFeatures002.png ├── 4week_AdvancedFeatures003.png ├── 4week_AdvancedFeatures004.png ├── 4week_AdvancedFeatures005.png ├── 4week_AdvancedFeatures006.png ├── 4week_AdvancedFeatures007.png ├── 
4week_AdvancedFeatures008.png ├── 4week_AdvancedFeatures009.png ├── 4week_AdvancedFeatures010.png ├── 4week_Ensemble_Tips1.png ├── 4week_Ensemble_Tips2.png ├── 4week_Ensemble_Tips3.png ├── 4week_Ensemble_Tips4.png ├── 4week_KNNfeatures ├── compute_KNN_features.ipynb ├── data │ ├── knn_feats_cosine_test.npy │ ├── knn_feats_cosine_train.npy.zip │ ├── knn_feats_minkowski_test.npy │ └── knn_feats_minkowski_train.npy.zip └── grader.py ├── 4week_NeuralNet001.png ├── 4week_Practicalguide001.png ├── 4week_Practicalguide002.png ├── 4week_Practicalguide003.png ├── 4week_Practicalguide004.png ├── 4week_StackNet1.png ├── 4week_StackNet2.png ├── 4week_Stacking1.png ├── 4week_ensemble_bagging1.jpg ├── 4week_ensemble_bagging2.PNG ├── 4week_ensemble_bagging3.PNG ├── 4week_ensemble_bagging4.PNG ├── 4week_ensemble_bagging5.PNG ├── 4week_ensemble_boosting1.PNG ├── 4week_ensemble_boosting2.PNG ├── 4week_ensemble_boosting3.PNG ├── 4week_ensemble_boosting4.PNG ├── 4week_ensemble_boosting5.PNG ├── 4week_ensemble_boosting6.PNG ├── 4week_ensemble_boosting7.PNG ├── 4week_ensemble_boosting8.PNG ├── 4week_ensemble_intro1.png ├── 4week_ensemble_intro2.png ├── 4week_ensemble_intro3.png ├── 4week_ensemble_intro4.png ├── 5week_image_tn_001.png ├── 5week_image_tn_002.png ├── 5week_image_tn_003.png ├── 5week_image_tn_004.png ├── 5week_image_tn_005.png ├── 5week_image_tn_006.png ├── 5week_image_tn_007.png ├── 5week_image_tn_008.png ├── 5week_image_tn_009.png ├── 5week_image_tn_101.png ├── 5week_image_tn_102.png ├── 5week_image_tn_103.png ├── 5week_image_tn_104.png ├── 5week_image_tn_105.png ├── 5week_image_tn_106.png ├── 5week_image_tn_107.png ├── 5week_image_tn_108.png ├── 5week_image_tn_109.png ├── 5week_image_tn_110.png ├── 5week_image_tn_111.png ├── 5week_image_tn_112.png ├── 5week_image_tn_113.png ├── Programming assignment, week 4_ Ensembles ├── Programming_assignment_week_4-Copy1.ipynb ├── Programming_assignment_week_4.ipynb ├── __pycache__ │ └── grader.cpython-36.pyc └── grader.py ├── 
clone1.png ├── translate_1.png ├── week1_program_plot.png ├── week5_cf1.PNG ├── week5_cf2.PNG ├── week5_cf3.PNG ├── week5_cf4.PNG ├── week5_cf5.PNG ├── week5_cf6.PNG ├── week5_cf7.PNG ├── week5_mm1.PNG ├── week5_mm2.PNG └── week5_sl1.PNG /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 
103 | # mypy 104 | .mypy_cache/ 105 | cousera/*/*.mp4 106 | cousera/howtowin_kaggle/*/*mp4 107 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # spade 2 | 勉強会用にリポジトリを作成しました 3 | 4 | ## 使用用途 5 | 各種情報共有(ドキュメント・wiki) 6 | 7 | ## その他 8 | spadeって名前はあんまり気にしないで下さい 9 | 単なるproject code的な 10 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_1/1_1_introduction.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_1/1_1_introduction.srt -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_1/1_1_introduction.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_1/1_1_introduction.vtt: -------------------------------------------------------------------------------- 1 | WEBVTT 2 | 3 | 1 4 | 00:00:00.390 --> 00:00:05.447 5 | [MUSIC] 6 | 7 | 2 8 | 00:00:05.447 --> 00:00:07.250 9 | Hello, and welcome. 10 | 11 | 3 12 | 00:00:07.250 --> 00:00:08.170 13 | My name is Dimitri, and 14 | 15 | 4 16 | 00:00:08.170 --> 00:00:12.060 17 | I'm happy to see you are interested 18 | in competitive data science. 
19 | 20 | 5 21 | 00:00:12.060 --> 00:00:14.710 22 | Data science is all about 23 | machine learning applications. 24 | 25 | 6 26 | 00:00:14.710 --> 00:00:17.640 27 | And in data science, like everywhere else, 28 | people are looking for 29 | 30 | 7 31 | 00:00:17.640 --> 00:00:20.380 32 | the very best solutions to their problems. 33 | 34 | 8 35 | 00:00:20.380 --> 00:00:24.240 36 | They're looking for the models that 37 | have the best predictive capabilities, 38 | 39 | 9 40 | 00:00:24.240 --> 00:00:27.850 41 | the models that make as 42 | few mistakes as possible. 43 | 44 | 10 45 | 00:00:27.850 --> 00:00:31.830 46 | And the competition for one becomes 47 | an essential way to find such solutions. 48 | 49 | 11 50 | 00:00:31.830 --> 00:00:35.340 51 | Competing for the prize, 52 | participants push through the limits, 53 | 54 | 12 55 | 00:00:35.340 --> 00:00:37.680 56 | come up with novel ideas. 57 | 58 | 13 59 | 00:00:37.680 --> 00:00:41.250 60 | Companies organize data science 61 | competitions to get top quality models for 62 | 63 | 14 64 | 00:00:41.250 --> 00:00:42.940 65 | not so high price. 66 | 67 | 15 68 | 00:00:42.940 --> 00:00:45.980 69 | And for data scientists, 70 | competitions become a truly unique 71 | 72 | 16 73 | 00:00:45.980 --> 00:00:49.215 74 | opportunity to learn, well, 75 | and of course win a prize. 76 | 77 | 17 78 | 00:00:50.360 --> 00:00:54.090 79 | This course is a chance for you to catch 80 | up on the trends in competitive data 81 | 82 | 18 83 | 00:00:54.090 --> 00:00:58.060 84 | science and learn what we, 85 | competition addicts and at the same time, 86 | 87 | 19 88 | 00:00:58.060 --> 00:01:01.239 89 | lecturers of this course, 90 | have already learned while competing. 
91 | 92 | 20 93 | 00:01:02.390 --> 00:01:05.976 94 | In this course, we will go through 95 | competition solving process step by 96 | 97 | 21 98 | 00:01:05.976 --> 00:01:09.982 99 | step and tell you about exploratory data 100 | analysis, basic and advanced feature 101 | 102 | 22 103 | 00:01:09.982 --> 00:01:13.712 104 | generation and preprocessing, 105 | various model validation techniques. 106 | 107 | 23 108 | 00:01:13.712 --> 00:01:18.498 109 | Data leakages, competition's metric 110 | optimization, model ensembling, 111 | 112 | 24 113 | 00:01:18.498 --> 00:01:20.370 114 | and hyperparameter tuning. 115 | 116 | 25 117 | 00:01:20.370 --> 00:01:25.050 118 | We've put together all our experience and 119 | created this course for you. 120 | 121 | 26 122 | 00:01:25.050 --> 00:01:26.520 123 | We've also designed quizzes and 124 | 125 | 27 126 | 00:01:26.520 --> 00:01:31.000 127 | programming assignments to let you 128 | apply your newly acquired skills. 129 | 130 | 28 131 | 00:01:31.000 --> 00:01:34.570 132 | Moreover, as a final project, you will 133 | have an opportunity to compete with 134 | 135 | 29 136 | 00:01:34.570 --> 00:01:37.545 137 | other students and 138 | participate in a special competition, 139 | 140 | 30 141 | 00:01:37.545 --> 00:01:43.460 142 | hosted on the world's largest platform for 143 | data science challenges called Kaggle. 144 | 145 | 31 146 | 00:01:43.460 --> 00:01:46.354 147 | Now, let's meet other lecturers and 148 | get started. 
-------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_1/1_2_meet_your_lecturers.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_1/1_2_meet_your_lecturers.srt -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_1/1_2_meet_your_lecturers.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_1/1_2_meet_your_lecturers.vtt: -------------------------------------------------------------------------------- 1 | WEBVTT 2 | 3 | 1 4 | 00:00:04.210 --> 00:00:09.390 5 | And now, I want to introduce other lecturers of this course. 6 | 7 | 2 8 | 00:00:09.390 --> 00:00:13.190 9 | Alexander, Dmitry, Mikhail, and Marios. 10 | 11 | 3 12 | 00:00:13.190 --> 00:00:15.520 13 | Mikhail is aka Cassanova, 14 | 15 | 4 16 | 00:00:15.520 --> 00:00:20.180 17 | the person who reached the very top of competitive data science. 18 | 19 | 5 20 | 00:00:20.180 --> 00:00:22.925 21 | I will tell you a couple of thoughts about the origins of the course. 22 | 23 | 6 24 | 00:00:22.925 --> 00:00:27.660 25 | In year 2014, we started our win in data science by joining competitions. 
26 | 27 | 7 28 | 00:00:27.660 --> 00:00:30.880 29 | We've been meeting every week and discussing the past competitions, solutions, 30 | 31 | 8 32 | 00:00:30.880 --> 00:00:33.885 33 | ideas and tweaks what worked and what did not, 34 | 35 | 9 36 | 00:00:33.885 --> 00:00:36.760 37 | this exchange of knowledge and experience helped us 38 | 39 | 10 40 | 00:00:36.760 --> 00:00:39.915 41 | to learn quickly from each other and improve our skills. 42 | 43 | 11 44 | 00:00:39.915 --> 00:00:41.680 45 | Initially our community was small, 46 | 47 | 12 48 | 00:00:41.680 --> 00:00:44.545 49 | but over time more and more people were joining. 50 | 51 | 13 52 | 00:00:44.545 --> 00:00:47.230 53 | From the format of groups of discussion. 54 | 55 | 14 56 | 00:00:47.230 --> 00:00:49.550 57 | We moved on to the format of well organized meetings. 58 | 59 | 15 60 | 00:00:49.550 --> 00:00:54.185 61 | Where a speaker makes an overview of his approach and ideas in front of 50 people. 62 | 63 | 16 64 | 00:00:54.185 --> 00:00:56.585 65 | These meetings are called machine learning trainings. 66 | 67 | 17 68 | 00:00:56.585 --> 00:01:00.110 69 | Now with the help and support of Yandex and get a hundred of participants. 70 | 71 | 18 72 | 00:01:00.110 --> 00:01:06.720 73 | Thus we started from zero and learned everything by hard work and collaboration. 74 | 75 | 19 76 | 00:01:06.720 --> 00:01:08.240 77 | We had an excellent teacher, 78 | 79 | 20 80 | 00:01:08.240 --> 00:01:11.010 81 | Alexander D'yakonov who was top one on Kaggle, 82 | 83 | 21 84 | 00:01:11.010 --> 00:01:13.870 85 | he took the course on critical data analysis. 86 | 87 | 22 88 | 00:01:13.870 --> 00:01:18.235 89 | In Moscow state university and there we're grateful to him. 90 | 91 | 23 92 | 00:01:18.235 --> 00:01:21.160 93 | At some point we started to share our knowledge with 94 | 95 | 24 96 | 00:01:21.160 --> 00:01:25.925 97 | other people and some of us even started to read lectures at the university. 
98 | 99 | 25 100 | 00:01:25.925 --> 00:01:31.630 101 | So now we have decided to summarize everything and make it available for everyone. 102 | 103 | 26 104 | 00:01:31.630 --> 00:01:35.835 105 | Together. We've finished and procesed in about 20 different competitions 106 | 107 | 27 108 | 00:01:35.835 --> 00:01:40.585 109 | only on Kaggle and just as many on other not so famous platforms. 110 | 111 | 28 112 | 00:01:40.585 --> 00:01:44.050 113 | All of us have a tremendous amount of skill and experience in 114 | 115 | 29 116 | 00:01:44.050 --> 00:01:48.250 117 | competitive data science and now we want to share this experience with you. 118 | 119 | 30 120 | 00:01:48.250 --> 00:01:49.500 121 | For all of us, 122 | 123 | 31 124 | 00:01:49.500 --> 00:01:52.555 125 | competitive data science opened a number of opportunities 126 | 127 | 32 128 | 00:01:52.555 --> 00:01:56.745 129 | as the competitions we took part were dedicated to a large variety of tasks. 130 | 131 | 33 132 | 00:01:56.745 --> 00:01:59.065 133 | Mikhail works in e-commerce. 134 | 135 | 34 136 | 00:01:59.065 --> 00:02:02.140 137 | Alexander builds predictive model for taxi services, 138 | 139 | 35 140 | 00:02:02.140 --> 00:02:04.180 141 | Dmitri works with financial data, 142 | 143 | 36 144 | 00:02:04.180 --> 00:02:08.725 145 | Mario develops machinery learning frameworks and I am a deep learning researcher. 146 | 147 | 37 148 | 00:02:08.725 --> 00:02:10.660 149 | Competitions, without a doubt, 150 | 151 | 38 152 | 00:02:10.660 --> 00:02:14.140 153 | became a stepping stone for our careers and believe me, 154 | 155 | 39 156 | 00:02:14.140 --> 00:02:18.040 157 | good comparative record will bring success to you as well. 158 | 159 | 40 160 | 00:02:18.040 --> 00:02:23.330 161 | We hope you will find something interesting in this course and wish you good luck. 
-------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_1/1_3_cource_overview.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_1/1_3_cource_overview.srt -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_1/1_3_cource_overview.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_1/2_1_competition_mechanics.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_1/2_1_competition_mechanics.srt -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_1/2_1_competition_mechanics.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: 
Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_1/2_2_kaggle_overview.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_1/2_2_kaggle_overview.srt -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_1/2_2_kaggle_overview.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_1/2_3_real_world.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_1/2_3_real_world.srt -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_1/2_3_real_world.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: 
Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_1/3_1_recap.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_1/3_1_recap.srt -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_1/3_1_recap.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_1/4_1_Software_Hardware_Requirements.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_1/4_1_Software_Hardware_Requirements.srt -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_1/4_1_Software_Hardware_Requirements.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | 
Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_1/4_1_Software_Hardware_Requirements.vtt: -------------------------------------------------------------------------------- 1 | WEBVTT 2 | 3 | 1 4 | 00:00:04.460 --> 00:00:05.670 5 | Hi, everyone. 6 | 7 | 2 8 | 00:00:05.670 --> 00:00:10.490 9 | In this video, I want to do an overview 10 | of hardware and software requirements. 11 | 12 | 3 13 | 00:00:10.490 --> 00:00:14.520 14 | You will know what is typical stuff for 15 | data science competitions. 16 | 17 | 4 18 | 00:00:14.520 --> 00:00:17.900 19 | I want to start from 20 | hardware related things. 21 | 22 | 5 23 | 00:00:17.900 --> 00:00:19.399 24 | Participating in competitions, 25 | 26 | 6 27 | 00:00:19.399 --> 00:00:22.890 28 | you generally don't need a lot 29 | of computation resources. 30 | 31 | 7 32 | 00:00:22.890 --> 00:00:23.890 33 | A lot of competitions, 34 | 35 | 8 36 | 00:00:23.890 --> 00:00:28.650 37 | except imaged based, 38 | have under several gigabytes of data. 39 | 40 | 9 41 | 00:00:28.650 --> 00:00:33.350 42 | It's not very huge and can be processed on 43 | a high level laptop with 16 gigabyte ram 44 | 45 | 10 46 | 00:00:33.350 --> 00:00:34.500 47 | and four physical cores. 48 | 49 | 11 50 | 00:00:35.600 --> 00:00:39.480 51 | Quite a good setup is a tower 52 | PC with 32 gigabyte of ram and 53 | 54 | 12 55 | 00:00:39.480 --> 00:00:42.570 56 | six physical cores, 57 | this is what I personally use. 58 | 59 | 13 60 | 00:00:43.630 --> 00:00:45.660 61 | You have a choice of hardware to use. 62 | 63 | 14 64 | 00:00:45.660 --> 00:00:48.910 65 | I suggest you to pay attention 66 | to the following things. 67 | 68 | 15 69 | 00:00:48.910 --> 00:00:52.190 70 | First is RAM, for this more is better. 
71 | 72 | 16 73 | 00:00:52.190 --> 00:00:56.150 74 | If you can keep your data in memory, 75 | your life will be much, much easier. 76 | 77 | 17 78 | 00:00:56.150 --> 00:00:59.600 79 | Personally, I found 64 80 | gigabytes is quite enough, but 81 | 82 | 18 83 | 00:00:59.600 --> 00:01:03.569 84 | some programmers prefer to have 85 | 128 gigabytes or even more. 86 | 87 | 19 88 | 00:01:04.618 --> 00:01:10.020 89 | Next are cores, the more core you have 90 | the more or faster experiments you can do. 91 | 92 | 20 93 | 00:01:10.020 --> 00:01:12.910 94 | I find it comfortable to 95 | work with fixed cores, but 96 | 97 | 21 98 | 00:01:12.910 --> 00:01:14.990 99 | sometimes even 32 are not enough. 100 | 101 | 22 102 | 00:01:16.270 --> 00:01:19.910 103 | Next thing to pay attention for 104 | is storage. 105 | 106 | 23 107 | 00:01:19.910 --> 00:01:23.570 108 | If you work with large datasets 109 | that don't fit into the memory, 110 | 111 | 24 112 | 00:01:23.570 --> 00:01:27.530 113 | it's crucial to have fast disk to read and 114 | write chunks of data. 115 | 116 | 25 117 | 00:01:27.530 --> 00:01:32.070 118 | SSD is especially important if you train 119 | narrowness or large number of images. 120 | 121 | 26 122 | 00:01:33.270 --> 00:01:35.660 123 | In case you really need 124 | computational resources. 125 | 126 | 27 127 | 00:01:35.660 --> 00:01:38.640 128 | For example, if you are part of team or 129 | 130 | 28 131 | 00:01:38.640 --> 00:01:43.260 132 | have a computational heavy approach, 133 | you can rent it on cloud platforms. 134 | 135 | 29 136 | 00:01:43.260 --> 00:01:47.530 137 | They offer machines with a lot of RAMs, 138 | cores, and GPUs. 139 | 140 | 30 141 | 00:01:47.530 --> 00:01:49.150 142 | There are several cloud providers, 143 | 144 | 31 145 | 00:01:49.150 --> 00:01:54.520 146 | most famous are Amazon AWS, 147 | Microsoft's Azure, and Google Cloud. 
148 | 149 | 32 150 | 00:01:54.520 --> 00:01:56.335 151 | Each one has its own pricing, so 152 | 153 | 33 154 | 00:01:56.335 --> 00:01:59.840 155 | we can choose which one best 156 | fits your needs and budget. 157 | 158 | 34 159 | 00:01:59.840 --> 00:02:04.150 160 | I especially want to draw your 161 | attention to AWS spot option. 162 | 163 | 35 164 | 00:02:04.150 --> 00:02:07.800 165 | Spot instances enable you 166 | to be able to use instance, 167 | 168 | 36 169 | 00:02:07.800 --> 00:02:09.400 170 | which can lower your cost significantly. 171 | 172 | 37 173 | 00:02:09.400 --> 00:02:13.590 174 | The higher your price for 175 | spot instance is set by Amazon and 176 | 177 | 38 178 | 00:02:13.590 --> 00:02:18.090 179 | fluctuates depending on supply and 180 | demand for spot instances. 181 | 182 | 39 183 | 00:02:18.090 --> 00:02:22.630 184 | Your spot instance run whenever you 185 | bid exceeds the current market price. 186 | 187 | 40 188 | 00:02:22.630 --> 00:02:25.450 189 | Generally, it's much 190 | cheaper than other options. 191 | 192 | 41 193 | 00:02:25.450 --> 00:02:29.640 194 | But you always have risk that your bid 195 | will get under current market price, and 196 | 197 | 42 198 | 00:02:29.640 --> 00:02:30.820 199 | your source will be terminated. 200 | 201 | 43 202 | 00:02:31.840 --> 00:02:33.450 203 | Tutorials about how to setup and 204 | 205 | 44 206 | 00:02:33.450 --> 00:02:36.500 207 | configure cloud resources you may 208 | find in additional materials. 209 | 210 | 45 211 | 00:02:37.500 --> 00:02:39.948 212 | Another important thing I 213 | want to discuss is software. 214 | 215 | 46 216 | 00:02:39.948 --> 00:02:44.260 217 | Usually, rules in competitions 218 | prohibit to use commercial software, 219 | 220 | 47 221 | 00:02:44.260 --> 00:02:47.910 222 | since it requires to buy 223 | a license to reproduce results. 224 | 225 | 48 226 | 00:02:47.910 --> 00:02:50.770 227 | Some competitors prefer 228 | R as basic language. 
229 | 230 | 49 231 | 00:02:50.770 --> 00:02:53.960 232 | But we will describe Python's tech 233 | as more common and more general. 234 | 235 | 50 236 | 00:02:55.290 --> 00:02:58.310 237 | Python is quite a good language for 238 | fast prototyping. 239 | 240 | 51 241 | 00:02:58.310 --> 00:03:02.090 242 | It has a huge amount of high quality and 243 | open source libraries. 244 | 245 | 52 246 | 00:03:02.090 --> 00:03:03.850 247 | And I want to reuse several of them. 248 | 249 | 53 250 | 00:03:05.060 --> 00:03:07.430 251 | Let's start with NumPy. 252 | 253 | 54 254 | 00:03:07.430 --> 00:03:11.210 255 | It's a linear algebra library 256 | to work with dimensional arrays, 257 | 258 | 55 259 | 00:03:11.210 --> 00:03:15.380 260 | which contains useful linear algebra 261 | routines and random number capabilities. 262 | 263 | 56 264 | 00:03:16.550 --> 00:03:20.660 265 | Pandas is a library providing fast, 266 | flexible, and expressive way to work with 267 | 268 | 57 269 | 00:03:20.660 --> 00:03:24.520 270 | a relational or table of data, 271 | both easily and intuitive. 272 | 273 | 58 274 | 00:03:24.520 --> 00:03:27.585 275 | It allows you to process your 276 | data in a way similar to SQL. 277 | 278 | 59 279 | 00:03:27.585 --> 00:03:32.190 280 | Scikit-learn is a library of classic 281 | machine learning algorithms. 282 | 283 | 60 284 | 00:03:32.190 --> 00:03:36.320 285 | It features various classification, 286 | regression, and clustering algorithms, 287 | 288 | 61 289 | 00:03:36.320 --> 00:03:40.750 290 | including support vector machines, 291 | random forests, and a lot more. 292 | 293 | 62 294 | 00:03:41.950 --> 00:03:44.030 295 | Matplotlib is a plotting library. 296 | 297 | 63 298 | 00:03:44.030 --> 00:03:47.070 299 | It allows you to do 300 | a variety of visualization, 301 | 302 | 64 303 | 00:03:47.070 --> 00:03:50.980 304 | like line plots, histograms, 305 | scatter plots and a lot more. 
306 | 307 | 65 308 | 00:03:52.050 --> 00:03:56.460 309 | As IDE, I suggest you to use 310 | IPython with Jupyter notebooks, 311 | 312 | 66 313 | 00:03:56.460 --> 00:04:00.190 314 | since they allow you to work 315 | interactively and remotely. 316 | 317 | 67 318 | 00:04:00.190 --> 00:04:03.390 319 | The last property is especially 320 | useful if you use cloud resources. 321 | 322 | 68 323 | 00:04:04.490 --> 00:04:08.380 324 | Additional packages contain 325 | implementation of more specific tools. 326 | 327 | 69 328 | 00:04:08.380 --> 00:04:11.685 329 | Usually, single packages 330 | implement single algorithm. 331 | 332 | 70 333 | 00:04:11.685 --> 00:04:15.900 334 | XGBoost and LightGBM packages implement 335 | gradient-boosted decision trees 336 | 337 | 71 338 | 00:04:15.900 --> 00:04:18.320 339 | in a very efficient and optimized way. 340 | 341 | 72 342 | 00:04:18.320 --> 00:04:20.230 343 | You definitely should 344 | know about such tools. 345 | 346 | 73 347 | 00:04:21.370 --> 00:04:25.100 348 | Keras is a user-friendly framework for 349 | neural nets. 350 | 351 | 74 352 | 00:04:25.100 --> 00:04:28.000 353 | This new package is an efficient 354 | implementation of this new projection 355 | 356 | 75 357 | 00:04:28.000 --> 00:04:29.990 358 | method which we will 359 | discuss in our course. 360 | 361 | 76 362 | 00:04:31.050 --> 00:04:34.890 363 | Also, I want to say a few words about 364 | external tools which usually don't have 365 | 366 | 77 367 | 00:04:34.890 --> 00:04:38.670 368 | any connection despite, but 369 | still very used for computations. 370 | 371 | 78 372 | 00:04:38.670 --> 00:04:41.120 373 | One such tool is Vowpal Wabbit. 374 | 375 | 79 376 | 00:04:41.120 --> 00:04:44.020 377 | It is a tool designed to 378 | provide blazing speed and 379 | 380 | 80 381 | 00:04:44.020 --> 00:04:48.060 382 | handle really large data sets, 383 | which don't fit into memory. 
384 | 385 | 81 386 | 00:04:48.060 --> 00:04:52.860 387 | Libfm and libffm implement different 388 | types of factorization machines, and 389 | 390 | 82 391 | 00:04:52.860 --> 00:04:57.810 392 | often used for sparse data like 393 | click-through rate prediction. 394 | 395 | 83 396 | 00:04:57.810 --> 00:05:02.910 397 | Rgf is an alternative base method, 398 | which I suggest you to use in ensembles. 399 | 400 | 84 401 | 00:05:02.910 --> 00:05:05.220 402 | You can install these packages one by one. 403 | 404 | 85 405 | 00:05:05.220 --> 00:05:07.250 406 | But as alternative, you can use a Python 407 | 408 | 86 409 | 00:05:07.250 --> 00:05:11.230 410 | distribution like Anaconda, which already 411 | contains a lot of mentioned packages. 412 | 413 | 87 414 | 00:05:12.260 --> 00:05:13.927 415 | And then, through this video, 416 | 417 | 88 418 | 00:05:13.927 --> 00:05:17.953 419 | I want to emphasize the proposed setup 420 | is the most common but not the only one. 421 | 422 | 89 423 | 00:05:17.953 --> 00:05:22.799 424 | Don't overestimate the role of hardware 425 | and software, since they are just tools. 426 | 427 | 90 428 | 00:05:22.799 --> 00:05:24.964 429 | Thank you for your attention. 
430 | 431 | 91 432 | 00:05:24.964 --> 00:05:34.964 433 | [MUSIC] -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_1/5_1_overview.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_1/5_1_overview.srt -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_1/5_1_overview.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_1/5_2_numeric_feature.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_1/5_2_numeric_feature.srt -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_1/5_2_numeric_feature.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: 
Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_1/5_3_categorical_and_ordinal.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_1/5_3_categorical_and_ordinal.srt -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_1/5_3_categorical_and_ordinal.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_1/5_4_datetime_and_coordinates.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_1/5_4_datetime_and_coordinates.srt -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_1/5_4_datetime_and_coordinates.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, 
MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_1/5_5_handling_missing_values.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_1/5_5_handling_missing_values.srt -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_1/5_5_handling_missing_values.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_1/6_1_bag_of_words.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_1/6_1_bag_of_words.srt -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_1/6_1_bag_of_words.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, 
Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_1/6_2_word2vec.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_1/6_2_word2vec.srt -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_1/6_2_word2vec.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_1/7_1_final_project_overview.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_1/7_1_final_project_overview.srt -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_1/7_1_final_project_overview.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, 
Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_1/7_1_final_project_overview.vtt: -------------------------------------------------------------------------------- 1 | WEBVTT 2 | 3 | 1 4 | 00:00:02.420 --> 00:00:05.126 5 | Hello, everyone. In this video, 6 | 7 | 2 8 | 00:00:05.126 --> 00:00:08.550 9 | we will talk a little bit about the main assignment of this course, 10 | 11 | 3 12 | 00:00:08.550 --> 00:00:11.880 13 | the competition, which plays the role of the final project. 14 | 15 | 4 16 | 00:00:11.880 --> 00:00:14.800 17 | Now, let's briefly discuss the data. 18 | 19 | 5 20 | 00:00:14.800 --> 00:00:18.860 21 | For more details, see the competition web page on Kaggle. 22 | 23 | 6 24 | 00:00:18.860 --> 00:00:21.880 25 | The data in this competition is quite challenging. 26 | 27 | 7 28 | 00:00:21.880 --> 00:00:26.400 29 | You can work with a time series data set consisting of daily sales data, 30 | 31 | 8 32 | 00:00:26.400 --> 00:00:30.153 33 | kindly provided by one of the largest Russian software company. 34 | 35 | 9 36 | 00:00:30.153 --> 00:00:31.975 37 | It's called 1C. 38 | 39 | 10 40 | 00:00:31.975 --> 00:00:35.860 41 | The training data consists of records with information that 42 | 43 | 11 44 | 00:00:35.860 --> 00:00:39.550 45 | a particular item had been sold in a particular shop, 46 | 47 | 12 48 | 00:00:39.550 --> 00:00:42.560 49 | in a particular day, in the training period. 50 | 51 | 13 52 | 00:00:42.560 --> 00:00:48.630 53 | The task is to forecast the sales for every item in every shop in the testing period. 54 | 55 | 14 56 | 00:00:48.630 --> 00:00:51.952 57 | There are about 6 million such records in the training set, 58 | 59 | 15 60 | 00:00:51.952 --> 00:00:57.430 61 | collected over 30 shops selling 20,000 unique items. 
62 | 63 | 16 64 | 00:00:57.430 --> 00:00:59.770 65 | But don't be afraid of these numbers. 66 | 67 | 17 68 | 00:00:59.770 --> 00:01:03.580 69 | This is the moderate-sized competition data set nowadays. 70 | 71 | 18 72 | 00:01:03.580 --> 00:01:07.150 73 | The training period is about one and a half year, 74 | 75 | 19 76 | 00:01:07.150 --> 00:01:11.515 77 | and the testing period is the month that falls on training period. 78 | 79 | 20 80 | 00:01:11.515 --> 00:01:14.500 81 | Note that you provide these daily sales in training period. 82 | 83 | 21 84 | 00:01:14.500 --> 00:01:19.370 85 | Well, you need to predict aggregated sales for testing period. 86 | 87 | 22 88 | 00:01:19.370 --> 00:01:24.055 89 | That is, you need to predict monthly sales for every possible shop item pair. 90 | 91 | 23 92 | 00:01:24.055 --> 00:01:27.382 93 | In fact, correct aggregation of 94 | 95 | 24 96 | 00:01:27.382 --> 00:01:32.880 97 | overall daily sales and generation of appropriate features is a part of this challenge. 98 | 99 | 25 100 | 00:01:32.880 --> 00:01:35.632 101 | As in the majority of competitions, 102 | 103 | 26 104 | 00:01:35.632 --> 00:01:38.945 105 | that this data is split into public and private parts. 106 | 107 | 27 108 | 00:01:38.945 --> 00:01:42.975 109 | You can submit your test predictions up to five times every day on 110 | 111 | 28 112 | 00:01:42.975 --> 00:01:45.790 113 | Kaggle platform and up to five times every 114 | 115 | 29 116 | 00:01:45.790 --> 00:01:49.105 117 | week to Coursera's programming assignment grader. 118 | 119 | 30 120 | 00:01:49.105 --> 00:01:54.885 121 | Kaggle will evaluate the quality of your predictions on the public part of test set, 122 | 123 | 31 124 | 00:01:54.885 --> 00:01:57.825 125 | while Coursera's grader will report quality, 126 | 127 | 32 128 | 00:01:57.825 --> 00:02:00.730 129 | both in public and private parts. 130 | 131 | 33 132 | 00:02:00.730 --> 00:02:04.390 133 | That is, you can rarely peek at your private score. 
134 | 135 | 34 136 | 00:02:04.390 --> 00:02:08.295 137 | Remember, the earlier you start working on the competition, 138 | 139 | 35 140 | 00:02:08.295 --> 00:02:11.500 141 | the more private score feedback you can get. 142 | 143 | 36 144 | 00:02:11.500 --> 00:02:13.915 145 | We encourage you to get familiar with the data 146 | 147 | 37 148 | 00:02:13.915 --> 00:02:17.105 149 | right away and not to wait until the very end. 150 | 151 | 38 152 | 00:02:17.105 --> 00:02:22.160 153 | Start simple and then improve your solution every week. 154 | 155 | 39 156 | 00:02:22.160 --> 00:02:26.830 157 | Remember, your final grades will depend on how would you have performed on 158 | 159 | 40 160 | 00:02:26.830 --> 00:02:32.135 161 | the private part of the leaderboard and on the quality of your solution report, 162 | 163 | 41 164 | 00:02:32.135 --> 00:02:34.550 165 | which will be graded by your peers. 166 | 167 | 42 168 | 00:02:34.550 --> 00:02:40.050 169 | You can read more about this in the reading material in the end of this week. 170 | 171 | 43 172 | 00:02:40.050 --> 00:02:45.290 173 | And, finally, the goal of the competition is to learn as much as possible, 174 | 175 | 44 176 | 00:02:45.290 --> 00:02:48.370 177 | so we strongly encourage you to participate in teams. 178 | 179 | 45 180 | 00:02:48.370 --> 00:02:50.740 181 | It is always fun and engaging. 182 | 183 | 46 184 | 00:02:50.740 --> 00:02:54.005 185 | In teams, you can discuss ideas and get feedback. 186 | 187 | 47 188 | 00:02:54.005 --> 00:02:56.842 189 | You can share a code and learn new tricks, 190 | 191 | 48 192 | 00:02:56.842 --> 00:02:59.380 193 | and you can get help if you're stuck. 194 | 195 | 49 196 | 00:02:59.380 --> 00:03:01.523 197 | If you don't have any teammates yet, 198 | 199 | 50 200 | 00:03:01.523 --> 00:03:04.845 201 | you can find them and meet them on forums. 
202 | 203 | 51 204 | 00:03:04.845 --> 00:03:09.030 205 | Please never, never share your code on forums, 206 | 207 | 52 208 | 00:03:09.030 --> 00:03:11.240 209 | neither on Coursera forums, 210 | 211 | 53 212 | 00:03:11.240 --> 00:03:13.195 213 | nor on Kaggle's forums. 214 | 215 | 54 216 | 00:03:13.195 --> 00:03:16.810 217 | Sharing codes outside of the teams is strictly forbidden. 218 | 219 | 55 220 | 00:03:16.810 --> 00:03:19.925 221 | You are encouraged to share and discuss interesting ideas, 222 | 223 | 56 224 | 00:03:19.925 --> 00:03:23.750 225 | thoughts, even small quote snippets held by the learners, 226 | 227 | 57 228 | 00:03:23.750 --> 00:03:27.950 229 | but do not even share the complete code for your solution 230 | 231 | 58 232 | 00:03:27.950 --> 00:03:30.560 233 | because many people will blindly copy 234 | 235 | 59 236 | 00:03:30.560 --> 00:03:33.930 237 | paste your code without even trying to understand it. 238 | 239 | 60 240 | 00:03:33.930 --> 00:03:38.960 241 | It will reduce quality of skills acquired by fellow students, 242 | 243 | 61 244 | 00:03:38.960 --> 00:03:41.255 245 | it will ruin the fun of the fair competition. 246 | 247 | 62 248 | 00:03:41.255 --> 00:03:44.175 249 | On the other hand, every time you're stuck, 250 | 251 | 63 252 | 00:03:44.175 --> 00:03:48.335 253 | go in forums, and you will definitely find some inspiration there. 254 | 255 | 64 256 | 00:03:48.335 --> 00:03:53.560 257 | Good luck with the project and have fun. 
-------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_1/ここにmp4をおいて再生してね: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_1/ここにmp4をおいて再生してね -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_2/1_1_Exploratory_data analysis.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_2/1_1_Exploratory_data analysis.srt -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_2/1_1_Exploratory_data analysis.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_2/1_2_Building_intuition_about_the_data.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_2/1_2_Building_intuition_about_the_data.srt -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_2/1_2_Building_intuition_about_the_data.srt.style: 
-------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_2/1_3_Exploring_anonymized_data.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_2/1_3_Exploring_anonymized_data.srt -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_2/1_3_Exploring_anonymized_data.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_2/1_4_Visualizations.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_2/1_4_Visualizations.srt -------------------------------------------------------------------------------- 
/cousera/howtowin_kaggle/week_2/1_4_Visualizations.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_2/1_5_Dataset_cleaning_and_other_things_to_check.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_2/1_5_Dataset_cleaning_and_other_things_to_check.srt -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_2/1_5_Dataset_cleaning_and_other_things_to_check.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_2/1_6_Springleaf_competition_EDA_I.srt: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_2/1_6_Springleaf_competition_EDA_I.srt -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_2/1_6_Springleaf_competition_EDA_I.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_2/1_7_Springleaf_competition_EDA_II.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_2/1_7_Springleaf_competition_EDA_II.srt -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_2/1_7_Springleaf_competition_EDA_II.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_2/1_8_Numerai_competition_EDA.srt: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_2/1_8_Numerai_competition_EDA.srt -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_2/1_8_Numerai_competition_EDA.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_2/1_8_Numerai_competition_EDA.vtt: -------------------------------------------------------------------------------- 1 | WEBVTT 2 | 3 | 1 4 | 00:00:02.790 --> 00:00:06.285 5 | Hi, everyone. In this video, 6 | 7 | 2 8 | 00:00:06.285 --> 00:00:08.290 9 | I will tell you about the specifics of 10 | 11 | 3 12 | 00:00:08.290 --> 00:00:12.731 13 | Numerai Competition that was held throughout year 2016. 14 | 15 | 4 16 | 00:00:12.731 --> 00:00:17.980 17 | Note that Numerai organizers changed the format in 2017. 18 | 19 | 5 20 | 00:00:17.980 --> 00:00:22.335 21 | So, the findings I'm going to read will not work on new data. 22 | 23 | 6 24 | 00:00:22.335 --> 00:00:24.860 25 | Let's state the problem. 26 | 27 | 7 28 | 00:00:24.860 --> 00:00:28.250 29 | Participants were solving a binary classification task on 30 | 31 | 8 32 | 00:00:28.250 --> 00:00:32.320 33 | a data set with 21 anonymized numeric features. 
34 | 35 | 9 36 | 00:00:32.320 --> 00:00:38.305 37 | Unusual part is that both train and test data sets have been updating every week. 38 | 39 | 10 40 | 00:00:38.305 --> 00:00:41.660 41 | Data sets were also shuffled column-wise. 42 | 43 | 11 44 | 00:00:41.660 --> 00:00:44.215 45 | So it was like a new task every week. 46 | 47 | 12 48 | 00:00:44.215 --> 00:00:47.405 49 | Pretty challenging. As it turned out, 50 | 51 | 13 52 | 00:00:47.405 --> 00:00:50.210 53 | this competition had a data leak. 54 | 55 | 14 56 | 00:00:50.210 --> 00:00:55.320 57 | Organizers did not disclose any information about the nature of data set. 58 | 59 | 15 60 | 00:00:55.320 --> 00:00:59.120 61 | But allegedly, it was some time series data with target 62 | 63 | 16 64 | 00:00:59.120 --> 00:01:03.770 65 | variable highly dependent on transitions between time points. 66 | 67 | 17 68 | 00:01:03.770 --> 00:01:07.910 69 | Think of something like predicting price change in stock market here. 70 | 71 | 18 72 | 00:01:07.910 --> 00:01:13.165 73 | Means that, if we knew true order or had timestamp variable, 74 | 75 | 19 76 | 00:01:13.165 --> 00:01:15.890 77 | we could easily get nearly perfect score. 78 | 79 | 20 80 | 00:01:15.890 --> 00:01:20.140 81 | And therefore, we had to somehow reconstruct this order. 82 | 83 | 21 84 | 00:01:20.140 --> 00:01:21.805 85 | Of course, approximately. 86 | 87 | 22 88 | 00:01:21.805 --> 00:01:27.440 89 | But even a rough approximation was giving a huge advantage over other participants. 90 | 91 | 23 92 | 00:01:27.440 --> 00:01:30.725 93 | The first and most important step is to find 94 | 95 | 24 96 | 00:01:30.725 --> 00:01:33.995 97 | a nearest neighbor for every point in a data set, 98 | 99 | 25 100 | 00:01:33.995 --> 00:01:39.230 101 | and add all 21 features from that neighbor to original point. 
102 | 103 | 26 104 | 00:01:39.230 --> 00:01:43.160 105 | Simple logistic regression of those 42 features, 106 | 107 | 27 108 | 00:01:43.160 --> 00:01:46.610 109 | 21 from original, and 21 from neighboring points, 110 | 111 | 28 112 | 00:01:46.610 --> 00:01:50.285 113 | allowed to get into top 10 on the leader board. 114 | 115 | 29 116 | 00:01:50.285 --> 00:01:54.945 117 | Of course, we can get better scores with some Hardcore EDA. 118 | 119 | 30 120 | 00:01:54.945 --> 00:01:59.500 121 | Let's start exploring correlation metrics of new 21 features. 122 | 123 | 31 124 | 00:01:59.500 --> 00:02:03.943 125 | If group features with highest correlation coefficient next to each other, 126 | 127 | 32 128 | 00:02:03.943 --> 00:02:06.735 129 | we'll get a right picture. 130 | 131 | 33 132 | 00:02:06.735 --> 00:02:10.340 133 | This picture can help us in two different ways. 134 | 135 | 34 136 | 00:02:10.340 --> 00:02:13.810 137 | First, we can actually fix some column order. 138 | 139 | 35 140 | 00:02:13.810 --> 00:02:17.735 141 | So, weekly column shuffling won't affect our models. 142 | 143 | 36 144 | 00:02:17.735 --> 00:02:20.480 145 | And second, we can clearly notice 146 | 147 | 37 148 | 00:02:20.480 --> 00:02:25.115 149 | seven groups with three highly correlated features in each of them. 150 | 151 | 38 152 | 00:02:25.115 --> 00:02:29.600 153 | So, the data actually has some non-trivial structure. 154 | 155 | 39 156 | 00:02:29.600 --> 00:02:35.615 157 | Now, let's remember that we get new data sets every week. What is more? 158 | 159 | 40 160 | 00:02:35.615 --> 00:02:40.110 161 | Each week, train data sets have the same number of points. 162 | 163 | 41 164 | 00:02:40.110 --> 00:02:45.170 165 | We can assume that there is some connection between consecutive data sets. 166 | 167 | 42 168 | 00:02:45.170 --> 00:02:49.360 169 | This is a little strange because we already have a time series. 
170 | 171 | 43 172 | 00:02:49.360 --> 00:02:53.200 173 | So, what's the connection between the data from different weeks? 174 | 175 | 44 176 | 00:02:53.200 --> 00:02:56.480 177 | Well, if we find nearest neighbors from 178 | 179 | 45 180 | 00:02:56.480 --> 00:03:00.065 181 | every point in current data set from previous data set, 182 | 183 | 46 184 | 00:03:00.065 --> 00:03:02.195 185 | and plot distance distributions, 186 | 187 | 47 188 | 00:03:02.195 --> 00:03:04.910 189 | we can notice that first neighbor is much, 190 | 191 | 48 192 | 00:03:04.910 --> 00:03:07.370 193 | much closer than the second. 194 | 195 | 49 196 | 00:03:07.370 --> 00:03:11.585 197 | So, we indeed have some connection between consecutive data sets. 198 | 199 | 50 200 | 00:03:11.585 --> 00:03:16.000 201 | And it looks like we can build a bijective mapping between them. 202 | 203 | 51 204 | 00:03:16.000 --> 00:03:21.470 205 | But let's not quickly jump into conclusions and do more exploration. 206 | 207 | 52 208 | 00:03:21.470 --> 00:03:25.650 209 | Okay. We found a nearest neighbor in previous data set. 210 | 211 | 53 212 | 00:03:25.650 --> 00:03:28.070 213 | What if we examine the distances between 214 | 215 | 54 216 | 00:03:28.070 --> 00:03:32.793 217 | the neighboring objects at the level of individual features? 218 | 219 | 55 220 | 00:03:32.793 --> 00:03:36.735 221 | We clearly have three different groups of seven features. 222 | 223 | 56 224 | 00:03:36.735 --> 00:03:40.090 225 | Now remember, the sorted correlation matrix? 226 | 227 | 57 228 | 00:03:40.090 --> 00:03:46.470 229 | It turns out that each of three highly correlated features belong to a different group. 230 | 231 | 58 232 | 00:03:46.470 --> 00:03:48.140 233 | A perfect match. 
234 | 235 | 59 236 | 00:03:48.140 --> 00:03:52.245 237 | And if we multiply seven features from the first group by three, 238 | 239 | 60 240 | 00:03:52.245 --> 00:03:56.565 241 | and seven features from the second group by two in the original data set, 242 | 243 | 61 244 | 00:03:56.565 --> 00:04:01.500 245 | recalculate nearest neighbor-based features within the data sets, 246 | 247 | 62 248 | 00:04:01.500 --> 00:04:03.165 249 | and re-train our models, 250 | 251 | 63 252 | 00:04:03.165 --> 00:04:06.020 253 | we'll get a nice improvement. 254 | 255 | 64 256 | 00:04:06.020 --> 00:04:09.650 257 | So, after this magic multiplications, of course, 258 | 259 | 65 260 | 00:04:09.650 --> 00:04:11.445 261 | I'd tried other constants, 262 | 263 | 66 264 | 00:04:11.445 --> 00:04:15.450 265 | our true order approximation became a little better. 266 | 267 | 67 268 | 00:04:15.450 --> 00:04:20.840 269 | Great. Now, let's move to the true relation. 270 | 271 | 68 272 | 00:04:20.840 --> 00:04:23.835 273 | New data, weekly updates, 274 | 275 | 69 276 | 00:04:23.835 --> 00:04:25.955 277 | all of it was a lie. 278 | 279 | 70 280 | 00:04:25.955 --> 00:04:31.290 281 | Remember, how we were calculating neighbors between consecutive data sets? 282 | 283 | 71 284 | 00:04:31.290 --> 00:04:33.685 285 | Well, we can forget about consecutiveness. 286 | 287 | 72 288 | 00:04:33.685 --> 00:04:36.750 289 | Calculate neighbors between current data set, 290 | 291 | 73 292 | 00:04:36.750 --> 00:04:40.550 293 | and the data set from two weeks ago or two months ago. 294 | 295 | 74 296 | 00:04:40.550 --> 00:04:45.350 297 | No matter what, we will be getting pretty much the same distances. 298 | 299 | 75 300 | 00:04:45.350 --> 00:04:51.535 301 | Why? The simplest answer is that the data actually didn't change. 302 | 303 | 76 304 | 00:04:51.535 --> 00:04:54.505 305 | And every week, we were getting the same data, 306 | 307 | 77 308 | 00:04:54.505 --> 00:04:56.275 309 | plus a little bit of noise. 
310 | 311 | 78 312 | 00:04:56.275 --> 00:05:00.750 313 | And thus, we could find nearest neighbor in each of previous data sets, 314 | 315 | 79 316 | 00:05:00.750 --> 00:05:02.305 317 | and average them all, 318 | 319 | 80 320 | 00:05:02.305 --> 00:05:05.770 321 | successfully reducing the variance of added noise. 322 | 323 | 81 324 | 00:05:05.770 --> 00:05:10.720 325 | After averaging, true order approximation became even better. 326 | 327 | 82 328 | 00:05:10.720 --> 00:05:16.115 329 | I have to say that a little bit of test data actually did change from time to time. 330 | 331 | 83 332 | 00:05:16.115 --> 00:05:20.765 333 | But nonetheless, most of the rows migrated from week to week. 334 | 335 | 84 336 | 00:05:20.765 --> 00:05:23.320 337 | Because of that, it was possible to probe 338 | 339 | 85 340 | 00:05:23.320 --> 00:05:26.395 341 | the whole public leader board which helped even further, 342 | 343 | 86 344 | 00:05:26.395 --> 00:05:28.150 345 | and so on, and so on. 346 | 347 | 87 348 | 00:05:28.150 --> 00:05:31.495 349 | Of course, there are more details regarding that competition, 350 | 351 | 88 352 | 00:05:31.495 --> 00:05:33.715 353 | but they aren't very interesting. 354 | 355 | 89 356 | 00:05:33.715 --> 00:05:37.745 357 | I wanted to focus on the process of reverse engineering. 358 | 359 | 90 360 | 00:05:37.745 --> 00:05:41.875 361 | Anyway, I hope you like this kind of detective story 362 | 363 | 91 364 | 00:05:41.875 --> 00:05:46.880 365 | and realize how important exploratory data analysis could be. 366 | 367 | 92 368 | 00:05:46.880 --> 00:05:51.710 369 | Thank you for your attention and always pay respect to EDA. 
-------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_2/2_1_Validation and overfitting.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_2/2_1_Validation and overfitting.srt -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_2/2_1_Validation and overfitting.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_2/2_2_Validation strategies.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_2/2_2_Validation strategies.srt -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_2/2_2_Validation strategies.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: 
Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_2/2_3_Data splitting strategies.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_2/2_3_Data splitting strategies.srt -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_2/2_3_Data splitting strategies.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_2/2_4_Problems occurring during validation.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_2/2_4_Problems occurring during validation.srt -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_2/2_4_Problems occurring during validation.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, 
BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_2/3_1_Basic data leaks.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_2/3_1_Basic data leaks.srt -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_2/3_1_Basic data leaks.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_2/3_1_Basic data leaks.vtt: -------------------------------------------------------------------------------- 1 | WEBVTT 2 | 3 | 1 4 | 00:00:03.640 --> 00:00:06.470 5 | Hi everyone. In this section, 6 | 7 | 2 8 | 00:00:06.470 --> 00:00:12.570 9 | we will talk about a very sensitive topic data leakage or more simply, leaks. 10 | 11 | 3 12 | 00:00:12.570 --> 00:00:16.870 13 | We'll define leakage in a very general sense as 14 | 15 | 4 16 | 00:00:16.870 --> 00:00:19.450 17 | an unexpected information in the data that 18 | 19 | 5 20 | 00:00:19.450 --> 00:00:22.615 21 | allows us to make unrealistically good predictions. 
22 | 23 | 6 24 | 00:00:22.615 --> 00:00:24.080 25 | For the time being, 26 | 27 | 7 28 | 00:00:24.080 --> 00:00:26.485 29 | you may think of it as directly or 30 | 31 | 8 32 | 00:00:26.485 --> 00:00:31.490 33 | indirectly adding ground truths into the test data. 34 | 35 | 9 36 | 00:00:31.490 --> 00:00:33.765 37 | Data leaks are very, very bad. 38 | 39 | 10 40 | 00:00:33.765 --> 00:00:36.835 41 | They are completely unusable in real world. 42 | 43 | 11 44 | 00:00:36.835 --> 00:00:43.245 45 | They usually provide way too much signal and thus make competitions lose their main point, 46 | 47 | 12 48 | 00:00:43.245 --> 00:00:47.560 49 | and quickly turn them into a leak hunt instead. 50 | 51 | 13 52 | 00:00:47.560 --> 00:00:50.915 53 | People often are very sensitive about this matter. 54 | 55 | 14 56 | 00:00:50.915 --> 00:00:52.960 57 | They tend to overreact. 58 | 59 | 15 60 | 00:00:52.960 --> 00:00:54.755 61 | That's completely understandable. 62 | 63 | 16 64 | 00:00:54.755 --> 00:00:57.825 65 | After spending a lot of time on solving the problem, 66 | 67 | 17 68 | 00:00:57.825 --> 00:01:01.845 69 | a sudden data leak may render all of that useless. 70 | 71 | 18 72 | 00:01:01.845 --> 00:01:04.640 73 | It is not a pleasant position to be in. 74 | 75 | 19 76 | 00:01:04.640 --> 00:01:09.100 77 | I cannot force you to turn a blind eye but keep in mind, 78 | 79 | 20 80 | 00:01:09.100 --> 00:01:13.030 81 | there is no ill intent whatsoever. 82 | 83 | 21 84 | 00:01:13.030 --> 00:01:17.515 85 | Data leaks are the result of unintentional errors, accidents. 86 | 87 | 22 88 | 00:01:17.515 --> 00:01:19.270 89 | Even if you find yourself in 90 | 91 | 23 92 | 00:01:19.270 --> 00:01:23.770 93 | a competition with an unexpected data leak close to the deadline, 94 | 95 | 24 96 | 00:01:23.770 --> 00:01:26.520 97 | please be more tolerant. 
98 | 99 | 25 100 | 00:01:26.520 --> 00:01:29.295 101 | The question of whether to exploit the data leak 102 | 103 | 26 104 | 00:01:29.295 --> 00:01:33.055 105 | or not is exclusive to machine learning competitions. 106 | 107 | 27 108 | 00:01:33.055 --> 00:01:37.875 109 | In real world, the answer is obviously a no, nothing to discuss. 110 | 111 | 28 112 | 00:01:37.875 --> 00:01:39.400 113 | But in a competition, 114 | 115 | 29 116 | 00:01:39.400 --> 00:01:43.325 117 | the ultimate goal is to get a higher leaderboard position. 118 | 119 | 30 120 | 00:01:43.325 --> 00:01:45.355 121 | And if you truly pursue that goal, 122 | 123 | 31 124 | 00:01:45.355 --> 00:01:49.045 125 | then exploit the leak in every way possible. 126 | 127 | 32 128 | 00:01:49.045 --> 00:01:50.440 129 | Further in this section, 130 | 131 | 33 132 | 00:01:50.440 --> 00:01:53.285 133 | I will show you the main types of data leaks 134 | 135 | 34 136 | 00:01:53.285 --> 00:01:56.790 137 | that could appear during solving a machine learning problem. 138 | 139 | 35 140 | 00:01:56.790 --> 00:02:03.550 141 | Also focus on a competition specific leak exploitation technique leaderboard probing. 142 | 143 | 36 144 | 00:02:03.550 --> 00:02:06.190 145 | Finally, you will find special videos 146 | 147 | 37 148 | 00:02:06.190 --> 00:02:11.040 149 | dedicated to the most interesting and non-trivial data leaks. 150 | 151 | 38 152 | 00:02:11.040 --> 00:02:17.910 153 | I will start with the most typical data leaks that may occur in almost every problem. 154 | 155 | 39 156 | 00:02:17.910 --> 00:02:20.125 157 | Time series is our first target. 158 | 159 | 40 160 | 00:02:20.125 --> 00:02:23.015 161 | Typically, future peeking. 162 | 163 | 41 164 | 00:02:23.015 --> 00:02:26.780 165 | It is common sense not to peek into the future like, 166 | 167 | 42 168 | 00:02:26.780 --> 00:02:32.570 169 | can we use stock market's price from day after tomorrow to predict price for tomorrow? 
170 | 171 | 43 172 | 00:02:32.570 --> 00:02:36.215 173 | Of course not. However, direct usage of 174 | 175 | 44 176 | 00:02:36.215 --> 00:02:41.240 177 | future information in incorrect time splits still exist. 178 | 179 | 45 180 | 00:02:41.240 --> 00:02:44.830 181 | When you enter a time series competition at first, 182 | 183 | 46 184 | 00:02:44.830 --> 00:02:48.005 185 | check train, public, and private splits. 186 | 187 | 47 188 | 00:02:48.005 --> 00:02:50.630 189 | If even one of them is not on time, 190 | 191 | 48 192 | 00:02:50.630 --> 00:02:53.105 193 | then you found a data leak. 194 | 195 | 49 196 | 00:02:53.105 --> 00:03:01.165 197 | In such case, unrealistic features like prices next week will be the most important. 198 | 199 | 50 200 | 00:03:01.165 --> 00:03:03.210 201 | But even when split by time, 202 | 203 | 51 204 | 00:03:03.210 --> 00:03:06.245 205 | data still contains information about future. 206 | 207 | 52 208 | 00:03:06.245 --> 00:03:09.800 209 | We still can access the rows from the test set. 210 | 211 | 53 212 | 00:03:09.800 --> 00:03:13.790 213 | We can have future user history in CTR task, 214 | 215 | 54 216 | 00:03:13.790 --> 00:03:20.145 217 | some fundamental indicators in stock market predictions tasks, and so on. 218 | 219 | 55 220 | 00:03:20.145 --> 00:03:24.510 221 | There are only two ways to eliminate the possibility of data leakage. 222 | 223 | 56 224 | 00:03:24.510 --> 00:03:29.090 225 | It's called competitions, where one can not access 226 | 227 | 57 228 | 00:03:29.090 --> 00:03:34.150 229 | rows from future or a test set with no features at all, only IDs. 230 | 231 | 58 232 | 00:03:34.150 --> 00:03:39.740 233 | For example, just the number and instrument ID in stock market prediction, 234 | 235 | 59 236 | 00:03:39.740 --> 00:03:45.420 237 | so participants create features based on past and join them themselves. 238 | 239 | 60 240 | 00:03:45.420 --> 00:03:48.610 241 | Now, let's discuss something more unusual. 
242 | 243 | 61 244 | 00:03:48.610 --> 00:03:52.820 245 | Those types of data leaks are much harder to find. 246 | 247 | 62 248 | 00:03:52.820 --> 00:03:56.810 249 | We often have more than just train and test files. 250 | 251 | 63 252 | 00:03:56.810 --> 00:04:01.140 253 | For example, a lot of images or text in archive. 254 | 255 | 64 256 | 00:04:01.140 --> 00:04:04.970 257 | In such case, we can access some meta information, 258 | 259 | 65 260 | 00:04:04.970 --> 00:04:08.950 261 | file creation date, image resolution etcetera. 262 | 263 | 66 264 | 00:04:08.950 --> 00:04:13.890 265 | It turns out that this meta information may be connected to target variable. 266 | 267 | 67 268 | 00:04:13.890 --> 00:04:18.535 269 | Imagine classic cats versus dogs classification. 270 | 271 | 68 272 | 00:04:18.535 --> 00:04:20.640 273 | What if cat pictures were taken before dogs? 274 | 275 | 69 276 | 00:04:20.640 --> 00:04:24.010 277 | Or taken with a different camera? 278 | 279 | 70 280 | 00:04:24.010 --> 00:04:29.510 281 | Because of that, a good practice from organizers is to erase the meta data, 282 | 283 | 71 284 | 00:04:29.510 --> 00:04:32.750 285 | resize the pictures, and change creation date. 286 | 287 | 72 288 | 00:04:32.750 --> 00:04:36.195 289 | Unfortunately, sometimes they forget about it. 290 | 291 | 73 292 | 00:04:36.195 --> 00:04:39.210 293 | A good example is Truly Native competition, 294 | 295 | 74 296 | 00:04:39.210 --> 00:04:44.505 297 | where one could get nearly perfect scores using just the dates from zip archives. 298 | 299 | 75 300 | 00:04:44.505 --> 00:04:48.380 301 | Another type of leakage could be found in IDs. 302 | 303 | 76 304 | 00:04:48.380 --> 00:04:54.285 305 | IDs are unique identifiers of every row usually used for convenience. 306 | 307 | 77 308 | 00:04:54.285 --> 00:04:57.410 309 | It makes no sense to include them into the model. 310 | 311 | 78 312 | 00:04:57.410 --> 00:05:00.905 313 | It is assumed that they are automatically generated. 
314 | 315 | 79 316 | 00:05:00.905 --> 00:05:04.060 317 | In reality, that's not always true. 318 | 319 | 80 320 | 00:05:04.060 --> 00:05:06.510 321 | ID may be a hash of something, 322 | 323 | 81 324 | 00:05:06.510 --> 00:05:09.295 325 | probably not intended for disclosure. 326 | 327 | 82 328 | 00:05:09.295 --> 00:05:14.075 329 | It may contain traces of information connected to target variable. 330 | 331 | 83 332 | 00:05:14.075 --> 00:05:16.605 333 | It was a case in Caterpillar competition. 334 | 335 | 84 336 | 00:05:16.605 --> 00:05:20.270 337 | Adding ID as a feature slightly improved the result. 338 | 339 | 85 340 | 00:05:20.270 --> 00:05:23.930 341 | So I advise you to pay close attention to IDs and 342 | 343 | 86 344 | 00:05:23.930 --> 00:05:27.875 345 | always check whether they are useful or not. 346 | 347 | 87 348 | 00:05:27.875 --> 00:05:29.965 349 | Next is row order. 350 | 351 | 88 352 | 00:05:29.965 --> 00:05:35.230 353 | In trivial case, data may be shuffled by target variable. 354 | 355 | 89 356 | 00:05:35.230 --> 00:05:39.040 357 | Sometimes simply adding row number or relative number, 358 | 359 | 90 360 | 00:05:39.040 --> 00:05:41.200 361 | suddenly improves the score. 362 | 363 | 91 364 | 00:05:41.200 --> 00:05:44.680 365 | Like, in Telstra Network Disruptions competition. 366 | 367 | 92 368 | 00:05:44.680 --> 00:05:47.995 369 | It's also possible to find something way more interesting 370 | 371 | 93 372 | 00:05:47.995 --> 00:05:52.420 373 | like in TalkingData Mobile User Demographics competition. 374 | 375 | 94 376 | 00:05:52.420 --> 00:05:55.220 377 | There was some kind of row duplication, 378 | 379 | 95 380 | 00:05:55.220 --> 00:05:59.610 381 | rows next to each other usually have the same label. 382 | 383 | 96 384 | 00:05:59.610 --> 00:06:02.500 385 | This is it with a regular type of leaks. 
386 | 387 | 97 388 | 00:06:02.500 --> 00:06:05.050 389 | To sum things up, in this video, 390 | 391 | 98 392 | 00:06:05.050 --> 00:06:12.780 393 | we embrace the concept of data leak and cover data leaks from future picking, 394 | 395 | 99 396 | 00:06:12.780 --> 00:06:16.380 397 | meta data, IDs, and row order. -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_2/3_2_Leaderboard probing and examples of rare data leaks.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_2/3_2_Leaderboard probing and examples of rare data leaks.srt -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_2/3_2_Leaderboard probing and examples of rare data leaks.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_2/3_3_Expedia challenge.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_2/3_3_Expedia challenge.srt -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_2/3_3_Expedia challenge.srt.style: 
-------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_3/1_1_Motivation.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_3/1_1_Motivation.srt -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_3/1_1_Motivation.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_3/1_2_Regression_metrics_review1.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_3/1_2_Regression_metrics_review1.srt -------------------------------------------------------------------------------- 
/cousera/howtowin_kaggle/week_3/1_2_Regression_metrics_review1.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_3/1_3_Regression_metrics_review2.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_3/1_3_Regression_metrics_review2.srt -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_3/1_3_Regression_metrics_review2.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_3/1_4_Classification_metrics_review.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_3/1_4_Classification_metrics_review.srt 
-------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_3/1_4_Classification_metrics_review.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_3/1_5_General_approaches.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_3/1_5_General_approaches.srt -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_3/1_5_General_approaches.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_3/1_5_General_approaches.vtt: -------------------------------------------------------------------------------- 1 | WEBVTT 2 | 3 | 1 4 | 00:00:03.210 --> 00:00:09.520 5 | In this video, we will discuss what is the loss and what is a metric, 
6 | 7 | 2 8 | 00:00:09.520 --> 00:00:11.905 9 | and what is the difference between them. 10 | 11 | 3 12 | 00:00:11.905 --> 00:00:18.155 13 | And then we'll overview what are the general approaches to metric optimization. 14 | 15 | 4 16 | 00:00:18.155 --> 00:00:23.545 17 | Let's start with a comparison between two notions, loss and metric. 18 | 19 | 5 20 | 00:00:23.545 --> 00:00:27.460 21 | The metric or target metric is a function which we 22 | 23 | 6 24 | 00:00:27.460 --> 00:00:31.690 25 | want to use to evaluate the quality of our model. 26 | 27 | 7 28 | 00:00:31.690 --> 00:00:34.390 29 | For example, for a classification task, 30 | 31 | 8 32 | 00:00:34.390 --> 00:00:38.097 33 | we may want to maximize accuracy of our predictions, 34 | 35 | 9 36 | 00:00:38.097 --> 00:00:41.765 37 | how frequently the model outputs the correct label. 38 | 39 | 10 40 | 00:00:41.765 --> 00:00:47.415 41 | But the problem is that no one really knows how to optimize accuracy efficiently. 42 | 43 | 11 44 | 00:00:47.415 --> 00:00:51.770 45 | Instead, people come up with the proxy loss functions. 46 | 47 | 12 48 | 00:00:51.770 --> 00:00:57.105 49 | They are such evaluation functions that are easy to optimize for a given model. 50 | 51 | 13 52 | 00:00:57.105 --> 00:01:02.325 53 | For example, logarithmic loss is widely used as an optimization loss, 54 | 55 | 14 56 | 00:01:02.325 --> 00:01:07.530 57 | while the accuracy score is how the solution is eventually evaluated. 58 | 59 | 15 60 | 00:01:07.530 --> 00:01:11.220 61 | So, once again, the loss function is a function 62 | 63 | 16 64 | 00:01:11.220 --> 00:01:15.205 65 | that our model optimizes and uses to evaluate the solution, 66 | 67 | 17 68 | 00:01:15.205 --> 00:01:20.455 69 | and the target metric is how we want the solution to be evaluated. 70 | 71 | 18 72 | 00:01:20.455 --> 00:01:24.365 73 | This is kind of expectation versus reality thing. 
74 | 75 | 19 76 | 00:01:24.365 --> 00:01:30.670 77 | Sometimes we are lucky and the model can optimize our target metric directly. 78 | 79 | 20 80 | 00:01:30.670 --> 00:01:34.360 81 | For example, for mean square error metric, 82 | 83 | 21 84 | 00:01:34.360 --> 00:01:39.960 85 | most libraries can optimize it from the outset, from the box. 86 | 87 | 22 88 | 00:01:39.960 --> 00:01:43.745 89 | So the loss function is the same as the target metric. 90 | 91 | 23 92 | 00:01:43.745 --> 00:01:46.690 93 | And sometimes we want to optimize metrics that 94 | 95 | 24 96 | 00:01:46.690 --> 00:01:50.845 97 | are really hard or even impossible to optimize directly. 98 | 99 | 25 100 | 00:01:50.845 --> 00:01:53.420 101 | In this case, we usually set the model to optimize 102 | 103 | 26 104 | 00:01:53.420 --> 00:01:56.545 105 | a loss that is different to a target metric, 106 | 107 | 27 108 | 00:01:56.545 --> 00:01:58.420 109 | but after a model is trained, 110 | 111 | 28 112 | 00:01:58.420 --> 00:02:02.290 113 | we use hacks and heuristics to negate the discrepancy 114 | 115 | 29 116 | 00:02:02.290 --> 00:02:07.520 117 | and adjust the model to better fit the target metric. 118 | 119 | 30 120 | 00:02:07.520 --> 00:02:11.810 121 | We will see the examples for both cases in the following videos. 122 | 123 | 31 124 | 00:02:11.810 --> 00:02:14.935 125 | And the last thing to mention is that loss metric, 126 | 127 | 32 128 | 00:02:14.935 --> 00:02:22.055 129 | cost objective and other notions are more or less used as synonyms. 130 | 131 | 33 132 | 00:02:22.055 --> 00:02:26.680 133 | It is completely okay to say target loss and optimization metric, 134 | 135 | 34 136 | 00:02:26.680 --> 00:02:29.895 137 | but we will fix the wording for the clarity now. 138 | 139 | 35 140 | 00:02:29.895 --> 00:02:33.495 141 | Okay, so far, we've understood 142 | 143 | 36 144 | 00:02:33.495 --> 00:02:38.745 145 | why it's important to optimize a metric given in a competition. 
146 | 147 | 37 148 | 00:02:38.745 --> 00:02:44.395 149 | And we have discussed the difference between optimization loss and target metric. 150 | 151 | 38 152 | 00:02:44.395 --> 00:02:50.305 153 | Now, let's overview the approaches to target metrics optimization in general. 154 | 155 | 39 156 | 00:02:50.305 --> 00:02:54.600 157 | The approaches can be broadly divided into several categories, 158 | 159 | 40 160 | 00:02:54.600 --> 00:02:57.300 161 | depending on the metric we need to optimize. 162 | 163 | 41 164 | 00:02:57.300 --> 00:03:01.050 165 | Some metrics can be optimized directly. 166 | 167 | 42 168 | 00:03:01.050 --> 00:03:06.825 169 | That is, we should just find a model that optimizes this metric and run it. 170 | 171 | 43 172 | 00:03:06.825 --> 00:03:13.200 173 | In fact, all we need to do is to set the model's loss function to this metric. 174 | 175 | 44 176 | 00:03:13.200 --> 00:03:16.055 177 | The most common metrics like MSE, 178 | 179 | 45 180 | 00:03:16.055 --> 00:03:22.470 181 | Logloss are implemented as loss functions in almost every library. 182 | 183 | 46 184 | 00:03:22.470 --> 00:03:26.090 185 | For some of the metrics that cannot be optimized directly, 186 | 187 | 47 188 | 00:03:26.090 --> 00:03:29.610 189 | we can somehow pre-process the train set and use 190 | 191 | 48 192 | 00:03:29.610 --> 00:03:34.245 193 | a model with a metric or loss function which is easy to optimize. 194 | 195 | 49 196 | 00:03:34.245 --> 00:03:40.265 197 | For example, while MSPE metric cannot be optimized directly with XGBoost, 198 | 199 | 50 200 | 00:03:40.265 --> 00:03:46.539 201 | we will see later that we can resample the train set and optimize MSE loss instead, 202 | 203 | 51 204 | 00:03:46.539 --> 00:03:48.930 205 | which XGBoost can optimize. 
206 | 207 | 52 208 | 00:03:48.930 --> 00:03:52.470 209 | Sometimes, we'll optimize an incorrect metric, 210 | 211 | 53 212 | 00:03:52.470 --> 00:03:58.890 213 | but we'll post-process the predictions to fit classification, 214 | 215 | 54 216 | 00:03:58.890 --> 00:04:01.850 217 | to fit the competition metric better. 218 | 219 | 55 220 | 00:04:01.850 --> 00:04:03.810 221 | For some models and frameworks, 222 | 223 | 56 224 | 00:04:03.810 --> 00:04:06.765 225 | it's possible to define a custom loss function, 226 | 227 | 57 228 | 00:04:06.765 --> 00:04:10.320 229 | and sometimes it's possible to implement a loss function which will 230 | 231 | 58 232 | 00:04:10.320 --> 00:04:14.345 233 | serve as a nice proxy for the desired metric. 234 | 235 | 59 236 | 00:04:14.345 --> 00:04:19.715 237 | For example, it can be done for quadratic-weighted Kappa, as we will see later. 238 | 239 | 60 240 | 00:04:19.715 --> 00:04:24.750 241 | It's actually quite easy to define a custom loss function for XGBoost. 242 | 243 | 61 244 | 00:04:24.750 --> 00:04:27.735 245 | We only need to implement a single function that 246 | 247 | 62 248 | 00:04:27.735 --> 00:04:30.910 249 | takes predictions and the target values and 250 | 251 | 63 252 | 00:04:30.910 --> 00:04:34.090 253 | computes first and second-order derivatives 254 | 255 | 64 256 | 00:04:34.090 --> 00:04:37.890 257 | of the loss function with respect to the model's predictions. 258 | 259 | 65 260 | 00:04:37.890 --> 00:04:41.275 261 | For example, here you see one for the Logloss. 262 | 263 | 66 264 | 00:04:41.275 --> 00:04:47.485 265 | Of course, the loss function should be smooth enough and have well-behaved derivatives, 266 | 267 | 67 268 | 00:04:47.485 --> 00:04:50.455 269 | otherwise XGBoost will go crazy. 270 | 271 | 68 272 | 00:04:50.455 --> 00:04:53.965 273 | In this course, we consider only a small set of metrics, 274 | 275 | 69 276 | 00:04:53.965 --> 00:04:56.300 277 | but there are plenty of them in fact. 
278 | 279 | 70 280 | 00:04:56.300 --> 00:04:57.960 281 | And for some of them, 282 | 283 | 71 284 | 00:04:57.960 --> 00:05:00.110 285 | it is really hard to come up with 286 | 287 | 72 288 | 00:05:00.110 --> 00:05:05.155 289 | a neat optimization procedure or write a custom loss function. 290 | 291 | 73 292 | 00:05:05.155 --> 00:05:09.020 293 | Thankfully, there is a method that always works. 294 | 295 | 74 296 | 00:05:09.020 --> 00:05:10.955 297 | It is called early stopping, 298 | 299 | 75 300 | 00:05:10.955 --> 00:05:13.310 301 | and it is very simple. 302 | 303 | 76 304 | 00:05:13.310 --> 00:05:16.290 305 | You set a model to optimize any loss function it can 306 | 307 | 77 308 | 00:05:16.290 --> 00:05:21.225 309 | optimize and you monitor the desired metric on a validation set. 310 | 311 | 78 312 | 00:05:21.225 --> 00:05:25.820 313 | And you stop the training when the model starts to fit according to 314 | 315 | 79 316 | 00:05:25.820 --> 00:05:30.815 317 | the desired metric and not according to the metric the model is truly optimizing. 318 | 319 | 80 320 | 00:05:30.815 --> 00:05:33.155 321 | That is important. Of course, 322 | 323 | 81 324 | 00:05:33.155 --> 00:05:36.615 325 | some metrics cannot be even easily evaluated. 326 | 327 | 82 328 | 00:05:36.615 --> 00:05:40.730 329 | For example, if the metric is based on a human assessor's opinions, 330 | 331 | 83 332 | 00:05:40.730 --> 00:05:44.500 333 | you cannot evaluate it on every iteration. 334 | 335 | 84 336 | 00:05:44.500 --> 00:05:47.730 337 | For such metrics, we cannot use early stopping, 338 | 339 | 85 340 | 00:05:47.730 --> 00:05:51.370 341 | but we will never find such metrics in a competition. 342 | 343 | 86 344 | 00:05:51.370 --> 00:05:53.050 345 | So, in this video, 346 | 347 | 87 348 | 00:05:53.050 --> 00:05:56.080 349 | we have discussed the discrepancy between our target 350 | 351 | 88 352 | 00:05:56.080 --> 00:06:00.055 353 | metric and the loss function that our model optimizes. 
354 | 355 | 89 356 | 00:06:00.055 --> 00:06:04.150 357 | We've reviewed several approaches to target metric optimization and, 358 | 359 | 90 360 | 00:06:04.150 --> 00:06:06.880 361 | in particular, discussed early stopping. 362 | 363 | 91 364 | 00:06:06.880 --> 00:06:11.480 365 | In the following videos, we will go through the regression and 366 | 367 | 92 368 | 00:06:11.480 --> 00:06:17.390 369 | classification metrics and see the hacks we can use to optimize them. -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_3/1_6_Regression_metrics_optimization.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_3/1_6_Regression_metrics_optimization.srt -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_3/1_6_Regression_metrics_optimization.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_3/1_7_Classification_metrics_optimization_1.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_3/1_7_Classification_metrics_optimization_1.srt 
-------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_3/1_7_Classification_metrics_optimization_1.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_3/1_8_Classification_metrics_optimization_2.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_3/1_8_Classification_metrics_optimization_2.srt -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_3/1_8_Classification_metrics_optimization_2.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_3/2_1_Concept_of_mean_encoding.srt: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_3/2_1_Concept_of_mean_encoding.srt -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_3/2_1_Concept_of_mean_encoding.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_3/2_2_Regularization.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_3/2_2_Regularization.srt -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_3/2_2_Regularization.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_3/2_3_Extensions_and_generalizations.srt: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_3/2_3_Extensions_and_generalizations.srt -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_3/2_3_Extensions_and_generalizations.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_4/1_1_Hyperparameter_tuning_1.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_4/1_1_Hyperparameter_tuning_1.srt -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_4/1_1_Hyperparameter_tuning_1.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- 
/cousera/howtowin_kaggle/week_4/1_2_Hyperparameter_tuning_2.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_4/1_2_Hyperparameter_tuning_2.srt -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_4/1_2_Hyperparameter_tuning_2.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_4/1_3_Hyperparameter_tuning_3.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_4/1_3_Hyperparameter_tuning_3.srt -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_4/1_3_Hyperparameter_tuning_3.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | 
-------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_4/1_4_Practical_guide.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_4/1_4_Practical_guide.srt -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_4/1_4_Practical_guide.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_4/1_5_KazAnova's competition pipeline, part 1.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_4/1_5_KazAnova's competition pipeline, part 1.srt -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_4/1_5_KazAnova's competition pipeline, part 1.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: 
Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_4/1_6_KazAnova's competition pipeline, part 2.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_4/1_6_KazAnova's competition pipeline, part 2.srt -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_4/1_6_KazAnova's competition pipeline, part 2.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_4/2_1_Statistics and distance based features.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_4/2_1_Statistics and distance based features.srt -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_4/2_1_Statistics and distance based features.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, 
StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_4/2_1_Statistics and distance based features.vtt: -------------------------------------------------------------------------------- 1 | WEBVTT 2 | 3 | 1 4 | 00:00:02.670 --> 00:00:05.630 5 | Hi everyone. 6 | 7 | 2 8 | 00:00:05.630 --> 00:00:10.770 9 | This video is dedicated to the following advanced feature engineering techniques. 10 | 11 | 3 12 | 00:00:10.770 --> 00:00:14.400 13 | Calculating various statistics of one feature grouped by 14 | 15 | 4 16 | 00:00:14.400 --> 00:00:19.425 17 | another and features derived from neighborhood analysis of a given point. 18 | 19 | 5 20 | 00:00:19.425 --> 00:00:21.660 21 | To make it a little bit clearer, 22 | 23 | 6 24 | 00:00:21.660 --> 00:00:24.000 25 | let's consider a simple example. 26 | 27 | 7 28 | 00:00:24.000 --> 00:00:27.590 29 | Here we have a chunk of data for some CTR task. 30 | 31 | 8 32 | 00:00:27.590 --> 00:00:31.735 33 | Let's forget about target variable and focus on human features. 34 | 35 | 9 36 | 00:00:31.735 --> 00:00:35.865 37 | Namely, User_ID, unique identifier of a user, 38 | 39 | 10 40 | 00:00:35.865 --> 00:00:40.120 41 | Page_ID, an identifier of a page user visited, 42 | 43 | 11 44 | 00:00:40.120 --> 00:00:43.890 45 | Ad_price, item prices in the ad, 46 | 47 | 12 48 | 00:00:43.890 --> 00:00:49.645 49 | and Ad_position, relative position of an ad on the web page. 50 | 51 | 13 52 | 00:00:49.645 --> 00:00:53.310 53 | The most straightforward way to solve this problem is to label 54 | 55 | 14 56 | 00:00:53.310 --> 00:00:57.450 57 | and call the Ad_position and feed some classifier. 
58 | 59 | 15 60 | 00:00:57.450 --> 00:01:00.480 61 | It would be a very good classifier that could take into 62 | 63 | 16 64 | 00:01:00.480 --> 00:01:04.395 65 | account all the hidden relations between variables. 66 | 67 | 17 68 | 00:01:04.395 --> 00:01:06.720 69 | But no matter how good it is, 70 | 71 | 18 72 | 00:01:06.720 --> 00:01:10.670 73 | it still treats all the data points independently. 74 | 75 | 19 76 | 00:01:10.670 --> 00:01:13.855 77 | And this is where we can apply feature engineering. 78 | 79 | 20 80 | 00:01:13.855 --> 00:01:16.460 81 | We can imply that an ad with 82 | 83 | 21 84 | 00:01:16.460 --> 00:01:20.580 85 | the lowest price on the page will catch most of the attention. 86 | 87 | 22 88 | 00:01:20.580 --> 00:01:24.450 89 | The rest of the ads on the page won't be very attractive. 90 | 91 | 23 92 | 00:01:24.450 --> 00:01:29.165 93 | It's pretty easy to calculate the features relevant to such an implication. 94 | 95 | 24 96 | 00:01:29.165 --> 00:01:34.930 97 | We can add lowest and highest prices for every user and page per ad. 98 | 99 | 25 100 | 00:01:34.930 --> 00:01:40.115 101 | Position of an ad with the lowest price could also be of use in such case. 102 | 103 | 26 104 | 00:01:40.115 --> 00:01:44.753 105 | Here's one of the ways to implement statistical features with pandas. 106 | 107 | 27 108 | 00:01:44.753 --> 00:01:48.615 109 | If our data is stored in the data frame df, 110 | 111 | 28 112 | 00:01:48.615 --> 00:01:55.550 113 | we call groupby method like this to get maximum and minimum price values. 114 | 115 | 29 116 | 00:01:55.550 --> 00:01:59.160 117 | Then store this object in gb variable, 118 | 119 | 30 120 | 00:01:59.160 --> 00:02:04.627 121 | and then join it back to the data frame df. This is it. 122 | 123 | 31 124 | 00:02:04.627 --> 00:02:09.325 125 | I want to emphasize that you should not stop at this point. 
126 | 127 | 32 128 | 00:02:09.325 --> 00:02:12.210 129 | It's possible to add other useful features not 130 | 131 | 33 132 | 00:02:12.210 --> 00:02:16.200 133 | necessarily calculated within user and page per. 134 | 135 | 34 136 | 00:02:16.200 --> 00:02:19.410 137 | It could be how many pages user has visited, 138 | 139 | 35 140 | 00:02:19.410 --> 00:02:23.455 141 | how many pages user has visited during the given session, 142 | 143 | 36 144 | 00:02:23.455 --> 00:02:26.280 145 | and ID of the most visited page, 146 | 147 | 37 148 | 00:02:26.280 --> 00:02:28.965 149 | how many users have visited that page, 150 | 151 | 38 152 | 00:02:28.965 --> 00:02:31.670 153 | and many, many more features. 154 | 155 | 39 156 | 00:02:31.670 --> 00:02:35.215 157 | The main idea is to introduce new information. 158 | 159 | 40 160 | 00:02:35.215 --> 00:02:40.210 161 | By that means, we can drastically increase the quality of the models. 162 | 163 | 41 164 | 00:02:40.210 --> 00:02:44.090 165 | But what if there is no features to use groupby on? 166 | 167 | 42 168 | 00:02:44.090 --> 00:02:45.960 169 | Well, in such case, 170 | 171 | 43 172 | 00:02:45.960 --> 00:02:50.535 173 | we can replace grouping operations with finding the nearest neighbors. 174 | 175 | 44 176 | 00:02:50.535 --> 00:02:56.370 177 | On the one hand, it's much harder to implement and collect useful information. 178 | 179 | 45 180 | 00:02:56.370 --> 00:02:59.455 181 | On the other hand, the method is more flexible. 182 | 183 | 46 184 | 00:02:59.455 --> 00:03:05.370 185 | We can fine tune things like the size of relevant neighborhood or metric. 186 | 187 | 47 188 | 00:03:05.370 --> 00:03:07.740 189 | The most common and natural example of 190 | 191 | 48 192 | 00:03:07.740 --> 00:03:12.050 193 | neighborhood analysis arises from purposive pricing. 194 | 195 | 49 196 | 00:03:12.050 --> 00:03:14.970 197 | Imagine that you need to predict rental prices. 
198 | 199 | 50 200 | 00:03:14.970 --> 00:03:19.150 201 | You would probably have some characteristics like floor space, 202 | 203 | 51 204 | 00:03:19.150 --> 00:03:22.050 205 | number of rooms, presence of a bus stop. 206 | 207 | 52 208 | 00:03:22.050 --> 00:03:26.665 209 | But you need something more than that to create a really good model. 210 | 211 | 53 212 | 00:03:26.665 --> 00:03:30.090 213 | It could be the number of other houses in 214 | 215 | 54 216 | 00:03:30.090 --> 00:03:35.370 217 | different neighborhoods like in 500 meters, 1,000 meters, 218 | 219 | 55 220 | 00:03:35.370 --> 00:03:41.080 221 | or 1,500 meters, or average price per square meter in such neighborhoods, 222 | 223 | 56 224 | 00:03:41.080 --> 00:03:43.140 225 | or the number of schools, 226 | 227 | 57 228 | 00:03:43.140 --> 00:03:47.190 229 | supermarkets, and parking lots in such neighborhoods. 230 | 231 | 58 232 | 00:03:47.190 --> 00:03:50.835 233 | The distances to the closest objects of interest 234 | 235 | 59 236 | 00:03:50.835 --> 00:03:54.950 237 | like subway stations or gyms could also be of use. 238 | 239 | 60 240 | 00:03:54.950 --> 00:03:56.835 241 | I think you've got the idea. 242 | 243 | 61 244 | 00:03:56.835 --> 00:04:00.705 245 | In the example, we've used a very simple case, 246 | 247 | 62 248 | 00:04:00.705 --> 00:04:04.980 249 | where neighborhoods were calculated in geographical space. 250 | 251 | 63 252 | 00:04:04.980 --> 00:04:08.040 253 | But don't be afraid to apply this method to 254 | 255 | 64 256 | 00:04:08.040 --> 00:04:11.710 257 | some abstract or even anonymized feature space. 258 | 259 | 65 260 | 00:04:11.710 --> 00:04:14.055 261 | It still could be very useful. 262 | 263 | 66 264 | 00:04:14.055 --> 00:04:18.350 265 | My team and I used this method in Spring Leaf competition. 266 | 267 | 67 268 | 00:04:18.350 --> 00:04:22.910 269 | Furthermore, we did it in supervised fashion. 
270 | 271 | 68 272 | 00:04:22.910 --> 00:04:24.405 273 | Here is how we have done it. 274 | 275 | 69 276 | 00:04:24.405 --> 00:04:28.260 277 | First of all, we applied mean encoding to all variables. 278 | 279 | 70 280 | 00:04:28.260 --> 00:04:32.940 281 | By doing so, we created homogeneous feature space so we 282 | 283 | 71 284 | 00:04:32.940 --> 00:04:38.325 285 | did not worry about scaling and importance of each particular feature. 286 | 287 | 72 288 | 00:04:38.325 --> 00:04:44.595 289 | After that, we calculated 2,000 nearest neighbors with Bray-Curtis metric. 290 | 291 | 73 292 | 00:04:44.595 --> 00:04:48.810 293 | Then we evaluated various features from 294 | 295 | 74 296 | 00:04:48.810 --> 00:04:53.740 297 | those neighbors like mean target of nearest 5, 10, 15, 500, 298 | 299 | 75 300 | 00:04:53.740 --> 00:04:59.540 301 | 2,000 neighbors, mean distance to 10 closest neighbors, 302 | 303 | 76 304 | 00:04:59.540 --> 00:05:03.713 305 | mean distance to 10 closest neighbors with target 1, 306 | 307 | 77 308 | 00:05:03.713 --> 00:05:08.240 309 | and mean distance to 10 closest neighbors with target 0, 310 | 311 | 78 312 | 00:05:08.240 --> 00:05:10.845 313 | and, it worked great. 314 | 315 | 79 316 | 00:05:10.845 --> 00:05:16.125 317 | In conclusion, I hope you embrace the main ideas of 318 | 319 | 80 320 | 00:05:16.125 --> 00:05:20.085 321 | both groupby and nearest neighbor methods 322 | 323 | 81 324 | 00:05:20.085 --> 00:05:24.935 325 | and you would be able to apply them in practice. 326 | 327 | 82 328 | 00:05:24.935 --> 00:05:28.510 329 | Thank you for your attention. 
-------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_4/2_2_Matrix factorizations.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_4/2_2_Matrix factorizations.srt -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_4/2_2_Matrix factorizations.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_4/2_3_Feature Interactions.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_4/2_3_Feature Interactions.srt -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_4/2_3_Feature Interactions.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: 
Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_4/2_3_Feature Interactions.vtt: -------------------------------------------------------------------------------- 1 | WEBVTT 2 | 3 | 1 4 | 00:00:02.990 --> 00:00:05.183 5 | Hi, everyone. 6 | 7 | 2 8 | 00:00:05.183 --> 00:00:08.535 9 | The main topic of this video is Feature Interactions. 10 | 11 | 3 12 | 00:00:08.535 --> 00:00:12.040 13 | You will learn how to construct them and use in problem solving. 14 | 15 | 4 16 | 00:00:12.040 --> 00:00:16.405 17 | Additionally, we will discuss them for feature extraction from decision trees. 18 | 19 | 5 20 | 00:00:16.405 --> 00:00:18.100 21 | Let's start with an example. 22 | 23 | 6 24 | 00:00:18.100 --> 00:00:20.160 25 | Suppose that we are building a model to predict 26 | 27 | 7 28 | 00:00:20.160 --> 00:00:23.245 29 | the best advertisement banner to display on a website. 30 | 31 | 8 32 | 00:00:23.245 --> 00:00:27.760 33 | Among available features, there are two categorical ones that we will concentrate on. 34 | 35 | 9 36 | 00:00:27.760 --> 00:00:30.810 37 | The category of the advertising banner itself and 38 | 39 | 10 40 | 00:00:30.810 --> 00:00:34.150 41 | the category of the site the banner will be showing on. 42 | 43 | 11 44 | 00:00:34.150 --> 00:00:37.603 45 | Certainly, we can use the features as two independent ones, 46 | 47 | 12 48 | 00:00:37.603 --> 00:00:41.525 49 | but a really important feature is indeed the combination of them. 50 | 51 | 13 52 | 00:00:41.525 --> 00:00:43.770 53 | We can explicitly construct the combination in 54 | 55 | 14 56 | 00:00:43.770 --> 00:00:47.015 57 | order to incorporate our knowledge into a model. 58 | 59 | 15 60 | 00:00:47.015 --> 00:00:52.195 61 | Let's construct new feature named ad_site that represents the combination. 
62 | 63 | 16 64 | 00:00:52.195 --> 00:00:54.551 65 | It will be categorical as the old ones, 66 | 67 | 17 68 | 00:00:54.551 --> 00:01:00.270 69 | but set of its values will be all possible combinations of two original values. 70 | 71 | 18 72 | 00:01:00.270 --> 00:01:01.905 73 | From a technical point of view, 74 | 75 | 19 76 | 00:01:01.905 --> 00:01:04.785 77 | there are two ways to construct such interaction. 78 | 79 | 20 80 | 00:01:04.785 --> 00:01:07.170 81 | Let's look at a simple example. 82 | 83 | 21 84 | 00:01:07.170 --> 00:01:08.700 85 | Consider our first feature, 86 | 87 | 22 88 | 00:01:08.700 --> 00:01:10.610 89 | f1, has values A or B. 90 | 91 | 23 92 | 00:01:10.610 --> 00:01:13.714 93 | Another feature, f2, has values X or Y or Z, 94 | 95 | 24 96 | 00:01:13.714 --> 00:01:17.870 97 | and our data set consists of four data points. 98 | 99 | 25 100 | 00:01:17.870 --> 00:01:21.810 101 | The first approach is to concatenate the text values of f1 and f2, 102 | 103 | 26 104 | 00:01:21.810 --> 00:01:25.710 105 | and use the result as a new categorical feature f_join. 106 | 107 | 27 108 | 00:01:25.710 --> 00:01:28.520 109 | We can then apply the OneHot encoding to it. 110 | 111 | 28 112 | 00:01:28.520 --> 00:01:30.840 113 | The second approach consists of two steps. 114 | 115 | 29 116 | 00:01:30.840 --> 00:01:35.025 117 | Firstly, apply OneHot encoding to features f1 and f2. 118 | 119 | 30 120 | 00:01:35.025 --> 00:01:38.940 121 | Secondly, construct a new matrix by multiplying each column from 122 | 123 | 31 124 | 00:01:38.940 --> 00:01:43.390 125 | f1 encoded matrix to each column from f2 encoded matrix. 126 | 127 | 32 128 | 00:01:43.390 --> 00:01:46.068 129 | It is worth noting that both methods result in 130 | 131 | 33 132 | 00:01:46.068 --> 00:01:49.410 133 | practically the same new feature representations. 
134 | 135 | 34 136 | 00:01:49.410 --> 00:01:51.075 137 | In the above example, 138 | 139 | 35 140 | 00:01:51.075 --> 00:01:54.570 141 | we considered interactions between categorical features, 142 | 143 | 36 144 | 00:01:54.570 --> 00:01:58.060 145 | but similar ideas can be applied to real valued features. 146 | 147 | 37 148 | 00:01:58.060 --> 00:02:01.230 149 | For example, having two real valued features f1 and f2, 150 | 151 | 38 152 | 00:02:01.230 --> 00:02:07.375 153 | interactions between them can be obtained by multiplications of f1 and f2. 154 | 155 | 39 156 | 00:02:07.375 --> 00:02:11.035 157 | In fact, we are not limited to use only multiply operation. 158 | 159 | 40 160 | 00:02:11.035 --> 00:02:14.070 161 | Any function taking two arguments like sum, 162 | 163 | 41 164 | 00:02:14.070 --> 00:02:16.735 165 | difference, or division is okay. 166 | 167 | 42 168 | 00:02:16.735 --> 00:02:19.320 169 | The following transformations significantly enlarge 170 | 171 | 43 172 | 00:02:19.320 --> 00:02:22.695 173 | feature space and makes learning easier, 174 | 175 | 44 176 | 00:02:22.695 --> 00:02:26.205 177 | but keep in mind that it makes overfitting easier too. 178 | 179 | 45 180 | 00:02:26.205 --> 00:02:29.610 181 | It should be emphasized that for tree-based algorithms such as 182 | 183 | 46 184 | 00:02:29.610 --> 00:02:32.280 185 | the random forest or gradient boosted decision trees 186 | 187 | 47 188 | 00:02:32.280 --> 00:02:35.530 189 | it's difficult to extract such kind of dependencies. 190 | 191 | 48 192 | 00:02:35.530 --> 00:02:40.265 193 | That's why the above transformations are very efficient for tree-based methods. 194 | 195 | 49 196 | 00:02:40.265 --> 00:02:42.755 197 | Let's discuss practical details now. 198 | 199 | 50 200 | 00:02:42.755 --> 00:02:47.520 201 | Pairwise feature generation approaches greatly increase the number of the features. 
202 | 203 | 51 204 | 00:02:47.520 --> 00:02:49.190 205 | If there were n original features, 206 | 207 | 52 208 | 00:02:49.190 --> 00:02:51.150 209 | there will be n squared. 210 | 211 | 53 212 | 00:02:51.150 --> 00:02:55.240 213 | And will be even more features if several types of interaction are used. 214 | 215 | 54 216 | 00:02:55.240 --> 00:02:57.550 217 | There are two ways to moderate this, 218 | 219 | 55 220 | 00:02:57.550 --> 00:03:01.100 221 | either do feature selection or dimensionality reduction. 222 | 223 | 56 224 | 00:03:01.100 --> 00:03:03.060 225 | I prefer doing the selection since 226 | 227 | 57 228 | 00:03:03.060 --> 00:03:05.615 229 | not all but only a few interactions often 230 | 231 | 58 232 | 00:03:05.615 --> 00:03:09.000 233 | achieve the same quality as all combinations of features. 234 | 235 | 59 236 | 00:03:09.000 --> 00:03:10.830 237 | For each type of interaction, 238 | 239 | 60 240 | 00:03:10.830 --> 00:03:13.555 241 | I construct all pairwise feature interactions. 242 | 243 | 61 244 | 00:03:13.555 --> 00:03:18.150 245 | Fit random forests over them and select several most important features. 246 | 247 | 62 248 | 00:03:18.150 --> 00:03:22.265 249 | Because number of resulting features for each type is relatively small. 250 | 251 | 63 252 | 00:03:22.265 --> 00:03:25.800 253 | It's possible to join them together along with original features and 254 | 255 | 64 256 | 00:03:25.800 --> 00:03:29.975 257 | use as input for any machine learning algorithm, usually a tree-based method. 258 | 259 | 65 260 | 00:03:29.975 --> 00:03:34.660 261 | During the video, we have examined the method to construct second order interactions. 262 | 263 | 66 264 | 00:03:34.660 --> 00:03:38.750 265 | But you can similarly produce third order or higher. 
266 | 267 | 67 268 | 00:03:38.750 --> 00:03:42.680 269 | Due to the fact that number of features grow rapidly with order, 270 | 271 | 68 272 | 00:03:42.680 --> 00:03:45.225 273 | it has become difficult to work with them. 274 | 275 | 69 276 | 00:03:45.225 --> 00:03:49.440 277 | Therefore high order interactions are often constructed semi-manually. 278 | 279 | 70 280 | 00:03:49.440 --> 00:03:52.165 281 | And this is an art in some ways. 282 | 283 | 71 284 | 00:03:52.165 --> 00:03:54.690 285 | Additionally, I would like to talk about methods to 286 | 287 | 72 288 | 00:03:54.690 --> 00:03:57.880 289 | construct categorical features from decision trees. 290 | 291 | 73 292 | 00:03:57.880 --> 00:03:59.840 293 | Take a look at the decision tree. 294 | 295 | 74 296 | 00:03:59.840 --> 00:04:03.475 297 | Let's map each leaf into a binary feature. 298 | 299 | 75 300 | 00:04:03.475 --> 00:04:09.215 301 | The index of the object's leaf can be used as a value for a new categorical feature. 302 | 303 | 76 304 | 00:04:09.215 --> 00:04:12.565 305 | If we use not a single tree but an ensemble of them. 306 | 307 | 77 308 | 00:04:12.565 --> 00:04:14.260 309 | For example, a random forest, 310 | 311 | 78 312 | 00:04:14.260 --> 00:04:18.070 313 | then such operation can be applied to each of the trees. 314 | 315 | 79 316 | 00:04:18.070 --> 00:04:22.270 317 | This is a powerful way to extract high order interactions. 318 | 319 | 80 320 | 00:04:22.270 --> 00:04:24.895 321 | This technique is quite simple to implement. 322 | 323 | 81 324 | 00:04:24.895 --> 00:04:27.970 325 | Tree-based models from sklearn library have 326 | 327 | 82 328 | 00:04:27.970 --> 00:04:30.190 329 | an apply method which takes as 330 | 331 | 83 332 | 00:04:30.190 --> 00:04:33.830 333 | input feature matrix and returns corresponding indices of leaves. 334 | 335 | 84 336 | 00:04:33.830 --> 00:04:39.840 337 | Xgboost also supports it via a parameter pred_leaf in predict method. 
338 | 339 | 85 340 | 00:04:39.840 --> 00:04:42.730 341 | I suggest we need to collaborate documentations in order to 342 | 343 | 86 344 | 00:04:42.730 --> 00:04:46.420 345 | get more information about these methods and IPIs. 346 | 347 | 87 348 | 00:04:46.420 --> 00:04:48.210 349 | In the end of this video, 350 | 351 | 88 352 | 00:04:48.210 --> 00:04:50.250 353 | I will tackle the main points. 354 | 355 | 89 356 | 00:04:50.250 --> 00:04:54.960 357 | We examined method to construct an interactions of categorical features. 358 | 359 | 90 360 | 00:04:54.960 --> 00:04:58.135 361 | Also, we extend the approach to real-valued features. 362 | 363 | 91 364 | 00:04:58.135 --> 00:05:00.610 365 | And we have learned how to use trees to extract 366 | 367 | 92 368 | 00:05:00.610 --> 00:05:04.510 369 | high order interactions. Thank you for your attention. -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_4/2_4_t-SNE.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_4/2_4_t-SNE.srt -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_4/2_4_t-SNE.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_4/2_4_t-SNE.vtt: 
-------------------------------------------------------------------------------- 1 | WEBVTT 2 | 3 | 1 4 | 00:00:03.080 --> 00:00:05.268 5 | Hi, everyone. 6 | 7 | 2 8 | 00:00:05.268 --> 00:00:10.095 9 | Today, we will discuss this new method for visualizing data integrating features. 10 | 11 | 3 12 | 00:00:10.095 --> 00:00:11.540 13 | At the end of this video, 14 | 15 | 4 16 | 00:00:11.540 --> 00:00:14.190 17 | you will be able to use tSNE in your products. 18 | 19 | 5 20 | 00:00:14.190 --> 00:00:15.745 21 | In the previous video, 22 | 23 | 6 24 | 00:00:15.745 --> 00:00:20.930 25 | we learned about metaphysician technique that is predatory very close to linear models. 26 | 27 | 7 28 | 00:00:20.930 --> 00:00:22.980 29 | In this video, we will touch 30 | 31 | 8 32 | 00:00:22.980 --> 00:00:26.355 33 | the subject of non-linear methods of dimensionality reduction. 34 | 35 | 9 36 | 00:00:26.355 --> 00:00:29.180 37 | That says in general are called manifold learning. 38 | 39 | 10 40 | 00:00:29.180 --> 00:00:34.225 41 | For example, look at the data in form of letter S on the left side. 42 | 43 | 11 44 | 00:00:34.225 --> 00:00:36.380 45 | On the right, we can see results of running 46 | 47 | 12 48 | 00:00:36.380 --> 00:00:39.255 49 | different manifold learning algorithm on the data. 50 | 51 | 13 52 | 00:00:39.255 --> 00:00:43.560 53 | This new result is placed at the right bottom corner on the slide. 54 | 55 | 14 56 | 00:00:43.560 --> 00:00:46.803 57 | This new algorithm is the main topic of the lecture, 58 | 59 | 15 60 | 00:00:46.803 --> 00:00:50.170 61 | as it tells of how this really works won't be explained here. 62 | 63 | 16 64 | 00:00:50.170 --> 00:00:54.090 65 | But you will come to look at additional materials for the details. 
66 | 67 | 17 68 | 00:00:54.090 --> 00:00:58.295 69 | Let's just say that this is a method that tries to project points from 70 | 71 | 18 72 | 00:00:58.295 --> 00:01:01.340 73 | high dimensional space into small dimensional space 74 | 75 | 19 76 | 00:01:01.340 --> 00:01:05.075 77 | so that the distances between points are approximately preserved. 78 | 79 | 20 80 | 00:01:05.075 --> 00:01:09.500 81 | Let's look at the example of the tSNE on the MNIST dataset. 82 | 83 | 21 84 | 00:01:09.500 --> 00:01:15.225 85 | Here are points from 700 dimensional space that are projected into two dimensional space. 86 | 87 | 22 88 | 00:01:15.225 --> 00:01:19.235 89 | You can see that such projection forms explicit clusters. 90 | 91 | 23 92 | 00:01:19.235 --> 00:01:22.240 93 | Coolest shows that these clusters are meaningful and 94 | 95 | 24 96 | 00:01:22.240 --> 00:01:25.785 97 | corresponds to the target numbers well. 98 | 99 | 25 100 | 00:01:25.785 --> 00:01:29.400 101 | Moreover, neighbor clusters corresponds to a visually similar numbers. 102 | 103 | 26 104 | 00:01:29.400 --> 00:01:32.730 105 | For example, cluster of three is located next to the cluster of 106 | 107 | 27 108 | 00:01:32.730 --> 00:01:37.490 109 | five which in chance is adjustment to the cluster of six and eight. 110 | 111 | 28 112 | 00:01:37.490 --> 00:01:41.535 113 | If data has explicit structure as in case of MNIST dataset, 114 | 115 | 29 116 | 00:01:41.535 --> 00:01:44.460 117 | it's likely to be reflected on tSNE plot. 118 | 119 | 30 120 | 00:01:44.460 --> 00:01:49.410 121 | For the reason tSNE is widely used in exploratory data analysis. 122 | 123 | 31 124 | 00:01:49.410 --> 00:01:53.875 125 | However, do not assume that tSNE is a magic want that always helps. 126 | 127 | 32 128 | 00:01:53.875 --> 00:01:58.640 129 | For example, a misfortune choice of hyperparameters may lead to poor results. 
130 | 131 | 33 132 | 00:01:58.640 --> 00:02:02.095 133 | Consider an example, in the center is the least presented 134 | 135 | 34 136 | 00:02:02.095 --> 00:02:06.590 137 | a tSNE projection of exactly the same MNIST data as in previous example, 138 | 139 | 35 140 | 00:02:06.590 --> 00:02:09.340 141 | only perplexity parameter has been changed. 142 | 143 | 36 144 | 00:02:09.340 --> 00:02:11.110 145 | On the left, for comparison, 146 | 147 | 37 148 | 00:02:11.110 --> 00:02:13.225 149 | we have plots from previous right. 150 | 151 | 38 152 | 00:02:13.225 --> 00:02:17.190 153 | On the right, so it present a tSNE projection of random data. 154 | 155 | 39 156 | 00:02:17.190 --> 00:02:20.790 157 | We can see as a choice of hybrid parameters change projection of 158 | 159 | 40 160 | 00:02:20.790 --> 00:02:24.500 161 | MNIST data significantly so that we cannot see clusters. 162 | 163 | 41 164 | 00:02:24.500 --> 00:02:30.775 165 | Moreover, new projection become more similar to random data rather than to the original. 166 | 167 | 42 168 | 00:02:30.775 --> 00:02:34.615 169 | Let's find out what depends on the perplexity hyperparameter value. 170 | 171 | 43 172 | 00:02:34.615 --> 00:02:36.426 173 | On the left, we have perplexity=3, 174 | 175 | 44 176 | 00:02:36.426 --> 00:02:42.805 177 | in the center=10, and on the right= 150. 178 | 179 | 45 180 | 00:02:42.805 --> 00:02:47.910 181 | I want to emphasize that these projections are all made for the same data. 182 | 183 | 46 184 | 00:02:47.910 --> 00:02:52.875 185 | The illustration shows that these new results strongly depends on its parameters, 186 | 187 | 47 188 | 00:02:52.875 --> 00:02:57.270 189 | and the interpretation of the results is not a simple task. 190 | 191 | 48 192 | 00:02:57.270 --> 00:02:59.500 193 | In particular, one cannot infer the size of 194 | 195 | 49 196 | 00:02:59.500 --> 00:03:02.855 197 | original clusters using the size of projected clusters. 
198 | 199 | 50 200 | 00:03:02.855 --> 00:03:06.050 201 | Similar proposition is valid for a distance between clusters. 202 | 203 | 51 204 | 00:03:06.050 --> 00:03:09.417 205 | Blog distill.pub contain a post 206 | 207 | 52 208 | 00:03:09.417 --> 00:03:13.595 209 | about how to understand and interpret the results of tSNE. 210 | 211 | 53 212 | 00:03:13.595 --> 00:03:16.220 213 | Also, it contains a great interactive demo 214 | 215 | 54 216 | 00:03:16.220 --> 00:03:19.575 217 | that will help you to get into issues of how tSNE works. 218 | 219 | 55 220 | 00:03:19.575 --> 00:03:21.980 221 | I strongly advise you to take a look at it. 222 | 223 | 56 224 | 00:03:21.980 --> 00:03:24.690 225 | In addition to exploratory data analysis, 226 | 227 | 57 228 | 00:03:24.690 --> 00:03:28.770 229 | tSNE can be considered as a method to obtain new features from data. 230 | 231 | 58 232 | 00:03:28.770 --> 00:03:33.235 233 | You should just concatenate the transformers coordinates to the original feature matrix. 234 | 235 | 59 236 | 00:03:33.235 --> 00:03:35.680 237 | Now if you've heard this about practical details, 238 | 239 | 60 240 | 00:03:35.680 --> 00:03:37.270 241 | as it has been shown earlier, 242 | 243 | 61 244 | 00:03:37.270 --> 00:03:38.490 245 | the results of tSNE algorithm, 246 | 247 | 62 248 | 00:03:38.490 --> 00:03:41.480 249 | it strongly depends on hyperparameters. 250 | 251 | 63 252 | 00:03:41.480 --> 00:03:45.690 253 | It is good practice to use several projections with different perplexities. 254 | 255 | 64 256 | 00:03:45.690 --> 00:03:49.110 257 | In addition, because of stochastic of this methods results in 258 | 259 | 65 260 | 00:03:49.110 --> 00:03:52.660 261 | different projections even with the same data and hyperparameters. 262 | 263 | 66 264 | 00:03:52.660 --> 00:03:58.490 265 | This means the train and test sets should be projected together rather than separately. 
266 | 267 | 67 268 | 00:03:58.490 --> 00:04:02.575 269 | Also, tSNE will run for a long time if you have a lot of features. 270 | 271 | 68 272 | 00:04:02.575 --> 00:04:05.290 273 | If the number of features is greater than 500, 274 | 275 | 69 276 | 00:04:05.290 --> 00:04:09.165 277 | you should use one of dimensionality reduction approach and reduce number of features, 278 | 279 | 70 280 | 00:04:09.165 --> 00:04:11.585 281 | for example, to 100. 282 | 283 | 71 284 | 00:04:11.585 --> 00:04:15.700 285 | Implementation of tSNE can be found in the sklearn library. 286 | 287 | 72 288 | 00:04:15.700 --> 00:04:17.255 289 | But personally, I prefer to use 290 | 291 | 73 292 | 00:04:17.255 --> 00:04:20.975 293 | another implementation from a separate Python package called tSNE, 294 | 295 | 74 296 | 00:04:20.975 --> 00:04:24.830 297 | since it provide a way more efficient implementation. 298 | 299 | 75 300 | 00:04:24.830 --> 00:04:28.570 301 | In conclusion, I want to remind you the basic points of the lecture. 302 | 303 | 76 304 | 00:04:28.570 --> 00:04:31.630 305 | TSNE is an excellent tool for visualizing data. 306 | 307 | 77 308 | 00:04:31.630 --> 00:04:33.785 309 | If data has an explicit structure, 310 | 311 | 78 312 | 00:04:33.785 --> 00:04:37.318 313 | then it likely be [inaudible] on tSNE projection. 314 | 315 | 79 316 | 00:04:37.318 --> 00:04:41.615 317 | However, it requires to be cautious with interpretation of tSNE results. 318 | 319 | 80 320 | 00:04:41.615 --> 00:04:46.145 321 | Sometimes you can see structure where it does not exist or vice versa, 322 | 323 | 81 324 | 00:04:46.145 --> 00:04:48.785 325 | see none where structure is actually present. 326 | 327 | 82 328 | 00:04:48.785 --> 00:04:53.530 329 | It's a good practice to do several tSNE projections with different perplexities. 
330 | 331 | 83 332 | 00:04:53.530 --> 00:04:55.035 333 | And in addition to EJ, 334 | 335 | 84 336 | 00:04:55.035 --> 00:04:59.125 337 | tSNE is working very well as a feature for feeding models. 338 | 339 | 85 340 | 00:04:59.125 --> 00:05:01.800 341 | Thank you for your attention. -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_4/3_1_Introduction into ensemble methods.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_4/3_1_Introduction into ensemble methods.srt -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_4/3_1_Introduction into ensemble methods.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_4/3_1_Introduction into ensemble methods.vtt: -------------------------------------------------------------------------------- 1 | WEBVTT 2 | 3 | 1 4 | 00:00:01.030 --> 00:00:05.942 5 | Hello everyone, this is Marios 6 | Michailidis, and this will be the first 7 | 8 | 2 9 | 00:00:05.942 --> 00:00:10.452 10 | video in a series that we will be 11 | discussing on ensemble methods for 12 | 13 | 3 14 | 00:00:10.452 --> 00:00:11.835 15 | machine learning. 
16 | 17 | 4 18 | 00:00:11.835 --> 00:00:18.165 19 | To tell you a bit about me, I work as 20 | Research Data Scientist for H2Oai. 21 | 22 | 5 23 | 00:00:18.165 --> 00:00:21.976 24 | In fact, 25 | my PhD is about assemble methods, and 26 | 27 | 6 28 | 00:00:21.976 --> 00:00:25.501 29 | they used to be ranked 30 | number one in cargo and 31 | 32 | 7 33 | 00:00:25.501 --> 00:00:30.600 34 | ensemble methods have greatly 35 | helped me to achieve this spot. 36 | 37 | 8 38 | 00:00:30.600 --> 00:00:32.800 39 | So you might find the course interesting. 40 | 41 | 9 42 | 00:00:34.480 --> 00:00:37.077 43 | So what is ensemble modelling? 44 | 45 | 10 46 | 00:00:37.077 --> 00:00:41.947 47 | I think with this term, we refer to 48 | combining many different machine learning 49 | 50 | 11 51 | 00:00:41.947 --> 00:00:45.620 52 | models in order to get 53 | a more powerful prediction. 54 | 55 | 12 56 | 00:00:45.620 --> 00:00:48.997 57 | And later on we will see 58 | examples that this happens, 59 | 60 | 13 61 | 00:00:48.997 --> 00:00:53.386 62 | that we combine different models and 63 | we do get better predictions. 64 | 65 | 14 66 | 00:00:53.386 --> 00:00:56.175 67 | There are various ensemble methods. 68 | 69 | 15 70 | 00:00:56.175 --> 00:01:01.240 71 | Here we'll discuss a few, those that 72 | we encounter quite often, in predictive 73 | 74 | 16 75 | 00:01:01.240 --> 00:01:06.471 76 | modelling competitions, and they tend 77 | to be, in general, quite competitive. 78 | 79 | 17 80 | 00:01:06.471 --> 00:01:10.924 81 | We will start with simple averaging 82 | methods, then we'll go to weighted 83 | 84 | 18 85 | 00:01:10.924 --> 00:01:15.311 86 | averaging methods, and we will also 87 | examine conditional averaging. 
88 | 89 | 19 90 | 00:01:15.311 --> 00:01:19.950 91 | And then we will move to some more 92 | typical ones like bagging, or 93 | 94 | 20 95 | 00:01:19.950 --> 00:01:24.942 96 | the very, very popular, boosting, 97 | then stacking and StackNet, 98 | 99 | 21 100 | 00:01:24.942 --> 00:01:27.590 101 | which is the result of my research. 102 | 103 | 22 104 | 00:01:30.350 --> 00:01:34.160 105 | But as I said, 106 | these will be a series of videos, and 107 | 108 | 23 109 | 00:01:34.160 --> 00:01:38.163 110 | we will initially start 111 | with the averaging methods. 112 | 113 | 24 114 | 00:01:41.060 --> 00:01:45.366 115 | So, in order to help you understand 116 | a bit more about the averaging methods, 117 | 118 | 25 119 | 00:01:45.366 --> 00:01:46.791 120 | let's take an example. 121 | 122 | 26 123 | 00:01:46.791 --> 00:01:51.622 124 | Let's say we have a variable called age, 125 | as in age years, 126 | 127 | 27 128 | 00:01:51.622 --> 00:01:54.150 129 | and we try to predict this. 130 | 131 | 28 132 | 00:01:54.150 --> 00:01:57.241 133 | We have a model that yields prediction for 134 | age. 135 | 136 | 29 137 | 00:01:57.241 --> 00:02:01.386 138 | Let's assume that 139 | the relationship between the two, 140 | 141 | 30 142 | 00:02:01.386 --> 00:02:08.010 143 | the actual age in our prediction, 144 | looks like in the graph, as in the graph. 145 | 146 | 31 147 | 00:02:08.010 --> 00:02:15.660 148 | So you can see that the model boasts 149 | quite a higher square of a value of 0.91, 150 | 151 | 32 152 | 00:02:15.660 --> 00:02:19.980 153 | but it doesn't do so 154 | well in the whole range of values. 155 | 156 | 33 157 | 00:02:19.980 --> 00:02:25.680 158 | So when age is less than 50, 159 | the model actually does quite well. 160 | 161 | 34 162 | 00:02:25.680 --> 00:02:28.856 163 | But when age is more than 50, 164 | 165 | 35 166 | 00:02:28.856 --> 00:02:33.505 167 | you can see that the average 168 | error is higher. 
169 | 170 | 36 171 | 00:02:33.505 --> 00:02:35.960 172 | Now let's take another example. 173 | 174 | 37 175 | 00:02:35.960 --> 00:02:40.962 176 | Let's assume we have a second model 177 | that also tries to predict age, 178 | 179 | 38 180 | 00:02:40.962 --> 00:02:43.167 181 | but this one looks like that. 182 | 183 | 39 184 | 00:02:43.167 --> 00:02:48.988 185 | As you can see, this model does quite 186 | well when age is higher than 50, 187 | 188 | 40 189 | 00:02:48.988 --> 00:02:56.020 190 | but not so well when age is less than 50, 191 | nevertheless, it scores again 0.91. 192 | 193 | 41 194 | 00:02:56.020 --> 00:03:01.200 195 | So we have two models that have 196 | a similar predictive power, 197 | 198 | 42 199 | 00:03:01.200 --> 00:03:04.007 200 | but they look quite different. 201 | 202 | 43 203 | 00:03:04.007 --> 00:03:08.682 204 | It's quite obvious that they do 205 | better in different parts of 206 | 207 | 44 208 | 00:03:08.682 --> 00:03:10.707 209 | the distribution of age. 210 | 211 | 45 212 | 00:03:10.707 --> 00:03:14.394 213 | So what will happen if we 214 | were to try to combine 215 | 216 | 46 217 | 00:03:14.394 --> 00:03:19.148 218 | this two with a simple averaging method, 219 | in other words, 220 | 221 | 47 222 | 00:03:19.148 --> 00:03:25.540 223 | just say (model 1 + model two) / 2, 224 | so a simple averaging method. 225 | 226 | 48 227 | 00:03:25.540 --> 00:03:28.920 228 | The end result will look 229 | as in the new graph. 230 | 231 | 49 232 | 00:03:28.920 --> 00:03:34.592 233 | So, our square has moved to 0.95, 234 | which is a considerable 235 | 236 | 50 237 | 00:03:34.592 --> 00:03:40.692 238 | improvement versus the 0.91 we had before, 239 | and as you can see, 240 | 241 | 51 242 | 00:03:40.692 --> 00:03:46.059 243 | on average, the points tend to 244 | be closer with the reality. 245 | 246 | 52 247 | 00:03:46.059 --> 00:03:49.723 248 | So the average error is smaller. 
249 | 250 | 53 251 | 00:03:49.723 --> 00:03:56.052 252 | However, as you can see, the model doesn't 253 | do better as an individual models for 254 | 255 | 54 256 | 00:03:56.052 --> 00:03:59.998 257 | the areas where the models 258 | were doing really well, 259 | 260 | 55 261 | 00:03:59.998 --> 00:04:03.410 262 | nevertheless, it does better on average. 263 | 264 | 56 265 | 00:04:03.410 --> 00:04:06.584 266 | This is something we need to understand, 267 | 268 | 57 269 | 00:04:06.584 --> 00:04:12.195 270 | that there is potentially a better 271 | way to combine these models. 272 | 273 | 58 274 | 00:04:12.195 --> 00:04:15.354 275 | We could try to take a weighting average. 276 | 277 | 59 278 | 00:04:15.354 --> 00:04:19.976 279 | So say, I'm going to take 70% of 280 | the first model prediction and 281 | 282 | 60 283 | 00:04:19.976 --> 00:04:22.893 284 | 30% of the second model prediction. 285 | 286 | 61 287 | 00:04:22.893 --> 00:04:28.853 288 | In other words, 289 | (model 1x0.7 + model 2x0.3), 290 | 291 | 62 292 | 00:04:28.853 --> 00:04:33.393 293 | and the end result would 294 | look as in the graph. 295 | 296 | 63 297 | 00:04:33.393 --> 00:04:38.849 298 | So you can see their square is no better 299 | and that makes sense, because the models 300 | 301 | 64 302 | 00:04:38.849 --> 00:04:44.560 303 | have quite similar predictive power and 304 | it doesn't make sense to rely more in one. 305 | 306 | 65 307 | 00:04:46.280 --> 00:04:51.215 308 | And also it is quite clear that 309 | it looks more with model 1, 310 | 311 | 66 312 | 00:04:51.215 --> 00:04:56.452 313 | because it has better predictions 314 | when age is less than 50, 315 | 316 | 67 317 | 00:04:56.452 --> 00:05:00.699 318 | and worse predictions 319 | when age is more than 50. 320 | 321 | 68 322 | 00:05:00.699 --> 00:05:08.250 323 | As a theoretical exercise, what is the 324 | theoretical best we could get out of this? 
325 | 326 | 69 327 | 00:05:08.250 --> 00:05:13.250 328 | We know we have a model that scores 329 | really well when age is less than 50, 330 | 331 | 70 332 | 00:05:13.250 --> 00:05:17.820 333 | and another model that scores really 334 | well when age is more than 50. 335 | 336 | 71 337 | 00:05:17.820 --> 00:05:21.776 338 | So ideally, we would like to 339 | get to something like that. 340 | 341 | 72 342 | 00:05:21.776 --> 00:05:26.420 343 | This is how we leverage the two 344 | models in the best possible way 345 | 346 | 73 347 | 00:05:26.420 --> 00:05:29.891 348 | here by using a simple 349 | conditioning method. 350 | 351 | 74 352 | 00:05:29.891 --> 00:05:35.187 353 | So if age is less than 50 I'll use one, 354 | otherwise the other, and we will see later 355 | 356 | 75 357 | 00:05:35.187 --> 00:05:40.310 358 | on that there are ensemble methods 359 | that are very good at finding these 360 | 361 | 76 362 | 00:05:40.310 --> 00:05:46.510 363 | relationships of two or more predictions 364 | with respect to the target variable. 365 | 366 | 77 367 | 00:05:46.510 --> 00:05:49.210 368 | But, this will be a topic for 369 | another discussion. 370 | 371 | 78 372 | 00:05:49.210 --> 00:05:53.340 373 | Here we discussed simple averaging methods, 374 | 375 | 79 376 | 00:05:53.340 --> 00:05:58.250 377 | hopefully you found it useful, and 378 | stay here for the next session to come. 379 | 380 | 80 381 | 00:05:58.250 --> 00:05:59.170 382 | Thank you very much. 
-------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_4/3_2_Bagging.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_4/3_2_Bagging.srt -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_4/3_2_Bagging.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_4/3_3_Boosting.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_4/3_3_Boosting.srt -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_4/3_3_Boosting.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | 
-------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_4/3_4_Stacking.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_4/3_4_Stacking.srt -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_4/3_4_Stacking.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_4/3_5_StackNet.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_4/3_5_StackNet.srt -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_4/3_5_StackNet.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | 
-------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_4/3_6_Ensembling Tips and Tricks.srt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_4/3_6_Ensembling Tips and Tricks.srt -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_4/3_6_Ensembling Tips and Tricks.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_5/1_Crowdflower Competition.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_5/1_Crowdflower\ Competition.srt: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/howtowin_kaggle/week_5/1_Crowdflower\ Competition.srt -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_5/2_Springleaf Marketing Response.srt: -------------------------------------------------------------------------------- 1 | 1 2 | 00:00:00.000 --> 00:00:03.802 3 | 音楽 4 | 5 | 2 6 | 00:00:03.802 --> 00:00:08.527 7 | こんにちは、コースを通して、我々は役に立 8 | つとしてスプリングリーフの競争を使用する 9 | 10 | 3 11 | 00:00:08.527 --> 00:00:13.840 12 | EDA の例では、最も近い近傍に基づいた 13 | 符号化と機能を意味します。 14 | 15 | 4 16 | 00:00:13.840 --> 00:00:20.090 17 | 当時、我々はと一緒にこの大会で3位を取っ 18 | た。 19 | 20 | 5 21 | 00:00:20.090 --> 00:00:24.710 22 | そして今、このビデオでは、私たちのソリュ 23 | ーションの最後の部分について説明します 24 | 25 | 6 26 | 00:00:24.710 --> 00:00:28.470 27 | これは、スタッキングとアンサンブルの使用 28 | 方法です。 29 | 30 | 7 31 | 00:00:28.470 --> 00:00:33.349 32 | この写真では、我々はレベルで生産最終的な 33 | スタッキングスキームを見ることができます 34 | 35 | 8 36 | 00:00:33.349 --> 00:00:38.070 37 | 最初のレベルで0の機能は、基本的なモデル 38 | による予測。 39 | 40 | 9 41 | 00:00:38.070 --> 00:00:40.110 42 | レベル1プラスの組み合わせで。 43 | 44 | 10 45 | 00:00:40.110 --> 00:00:44.270 46 | したがって、これらの予測といくつかの正確 47 | に選択した機能 48 | 49 | 11 50 | 00:00:44.270 --> 00:00:47.880 51 | 機能のこの新しいセットの2番目のレベルの 52 | モデルに。 53 | 54 | 12 55 | 00:00:47.880 --> 00:00:52.290 56 | そして最後に、3番目のレベルでは、その線 57 | 形の組み合わせ。 58 | 59 | 13 60 | 00:00:52.290 --> 00:00:52.910 61 | このビデオでは、 62 | 63 | 14 64 | 00:00:52.910 --> 00:00:58.301 65 | それは、この非自明な ensembled 66 | スキームにビルドとして我々は、各レベルを 67 | 通過します。 68 | 69 | 15 70 | 00:00:59.430 --> 00:01:03.590 71 | しかし、まず、すぐに問題について自分自身 72 | を思い出させる。 73 | 74 | 16 75 | 00:01:03.590 --> 00:01:07.428 76 | これは、曲線メトリックの下の領域を持つバ 77 | イナリ分類タスクでした。 78 | 79 | 17 80 | 00:01:07.428 --> 00:01:15.270 81 | 我々は、トレーニングデータと約2000匿 82 | 名の機能で145000サンプルを持ってい 83 | た。 84 | 85 | 18 86 | 00:01:15.270 --> 00:01:19.740 87 | これらは、EDA をしながら私たちによっ 88 | て導き出された有用な洞察でした。 89 | 90 | 19 91 | 00:01:19.740 --> 00:01:26.080 92 
| そして、あなたのメモリをリフレッシュする 93 | ために我々のコースで以前に行われた 94 | EDA 95 | をチェックアウトすることができます。 96 | 97 | 20 98 | 00:01:26.080 --> 00:01:29.150 99 | だから今の機能から始めましょう。 100 | 101 | 21 102 | 00:01:29.150 --> 00:01:33.040 103 | ここでは、機能の4つのパックがあります。 104 | 105 | 22 106 | 00:01:33.040 --> 00:01:37.760 107 | 最初の2つは、基本データセットと処理され 108 | たデータセットです。 109 | 110 | 23 111 | 00:01:37.760 --> 00:01:41.560 112 | それをシンプルに保つために、我々は単にか 113 | ら派生した洞察を使用 114 | 115 | 24 116 | 00:01:41.560 --> 00:01:45.690 117 | EDA は、データをきれいに 118 | [聞こえない] と新機能を生成します。 119 | 120 | 25 121 | 00:01:45.690 --> 00:01:49.280 122 | たとえば、重複した機能を削除し、 123 | 124 | 26 125 | 00:01:49.280 --> 00:01:53.729 126 | 散布図と相関関係に基づいていくつかのフィ 127 | ーチャインタラクションを編集します。 128 | 129 | 27 130 | 00:01:54.790 --> 00:01:59.815 131 | その後、我々は、成長関係のループを使用し 132 | てすべてのカテゴリ機能を意味エンコード 133 | 134 | 28 135 | 00:01:59.815 --> 00:02:02.050 136 | データとスムージングに署名します。 137 | 138 | 29 139 | 00:02:02.050 --> 00:02:06.704 140 | さらに、平均エンコードされたデータセット 141 | を使用して、最も近い 142 | 143 | 30 144 | 00:02:06.704 --> 00:02:07.620 145 | 隣人。 146 | 147 | 31 148 | 00:02:07.620 --> 00:02:12.390 149 | と同様に、クラスゼロの最も近いオブジェク 150 | トでは何ですか? 151 | 152 | 32 153 | 00:02:12.390 --> 00:02:18.280 154 | そして、どのように多くのオブジェクトのう 155 | ち、10の最寄りの隣人クラス1に属してい 156 | る? 
157 | 158 | 33 159 | 00:02:18.280 --> 00:02:21.119 160 | これがどのように行われるかを確認すること 161 | ができます 162 | 163 | 34 164 | 00:02:21.119 --> 00:02:24.677 165 | 関連トピックでは、デミトリ 166 | Altihof によって導入。 167 | 168 | 35 169 | 00:02:24.677 --> 00:02:31.469 170 | そこで最後に、これらの4つの機能のパック 171 | は、私たちのソリューションのレベル0でし 172 | た。 173 | 174 | 36 175 | 00:02:31.469 --> 00:02:35.952 176 | 2番目のレベルは、内のいくつかの異なるグ 177 | ラデーションで表された 178 | 179 | 37 180 | 00:02:35.952 --> 00:02:39.570 181 | デシジョンツリーモデルと1つのニューラル 182 | ネットワーク 183 | 184 | 38 185 | 00:02:39.570 --> 00:02:44.120 186 | ここでの主な考え方は、メタ機能は多様でな 187 | ければならないということです。 188 | 189 | 39 190 | 00:02:44.120 --> 00:02:48.340 191 | 各メタ機能は、ターゲットに関する新しい情 192 | 報をもたらす必要があります。 193 | 194 | 40 195 | 00:02:48.340 --> 00:02:55.714 196 | だから我々のモデルの両方の異なるパラメー 197 | タと機能のさまざまなセットを使用します。 198 | 199 | 41 200 | 00:02:55.714 --> 00:03:00.449 201 | ニューラルネットワークについては、我々は 202 | さらに事前に処理された機能 203 | 204 | 42 205 | 00:03:00.449 --> 00:03:04.600 206 | 共通のスカラー、ランクおよび力の変形。 207 | 208 | 43 209 | 00:03:04.600 --> 00:03:10.960 210 | 問題は、ネットワークのトレーニング結果を 211 | スキュー巨大な飛び地にあった。 212 | 213 | 44 214 | 00:03:10.960 --> 00:03:15.120 215 | 従ってランクおよび力の変形はこの問題の処 216 | 理を助けた。 217 | 218 | 45 219 | 00:03:16.566 --> 00:03:19.990 220 | それに決定を後押しすることで漸進的である 221 | メタ特徴を作り出した後 222 | 223 | 46 224 | 00:03:19.990 --> 00:03:21.230 225 | ニューラルネットワーク、 226 | 227 | 47 228 | 00:03:21.230 --> 00:03:26.280 229 | 我々は、次のレベルのモデルを支援するため 230 | にそれらの賃金上昇の違いを計算した。 231 | 232 | 48 233 | 00:03:26.280 --> 00:03:30.570 234 | これはまた、モデルを強制的に興味深いトリ 235 | ックであることに注意してください 236 | 237 | 49 238 | 00:03:30.570 --> 00:03:35.290 239 | 最初のレベルのモデルの予測の違いを活用す 240 | る。 241 | 242 | 50 243 | 00:03:35.290 --> 00:03:40.370 244 | ここでは、最も近い近傍に基づいてフィーチ 245 | ャの2つのデータセットを編集します。 246 | 247 | 51 248 | 00:03:40.370 --> 00:03:45.380 249 | 1つはレベル0から直接取得され、同じ機能 250 | が含まれています。 251 | 252 | 52 253 | 00:03:45.380 --> 00:03:51.040 254 | しかし、それは、平均符号化されたデータセ 255 | ットで半分の力に計算しました。 256 | 257 | 53 258 | 00:03:51.040 --> 00:03:55.704 259 | ここでのポイントは、これらの機能が完全に 260 | 
利用されていないということでした 261 | 262 | 54 263 | 00:03:55.704 --> 00:03:57.370 264 | 最初のレベルのモデル。 265 | 266 | 55 267 | 00:03:57.370 --> 00:04:02.690 268 | そして確かに、彼らはこのレベルに情報の新 269 | しい部分をもたらした。 270 | 271 | 56 272 | 00:04:03.740 --> 00:04:08.850 273 | 今、我々はすでに最初のレベルから自動フォ 274 | ールディング折り曲げる予測を持っていると 275 | 276 | 57 277 | 00:04:08.850 --> 00:04:11.110 278 | 我々はそれらのモデルを訓練します。 279 | 280 | 58 281 | 00:04:11.110 --> 00:04:16.230 282 | 他の民族のせいで標的が漏れる 283 | 284 | 59 285 | 00:04:16.230 --> 00:04:18.610 286 | また、機能が非常によくないため、 287 | 288 | 60 289 | 00:04:18.610 --> 00:04:23.600 290 | モデルが検出するデータには、ほとんどパタ 291 | ーンが残っていません。 292 | 293 | 61 294 | 00:04:23.600 --> 00:04:29.160 295 | 我々は、予測は多様であるべきであることを 296 | 念頭に置き、単純な分類器を選んだ。 297 | 298 | 62 299 | 00:04:29.160 --> 00:04:31.670 300 | 4種類のモデルを使用しました。 301 | 302 | 63 303 | 00:04:31.670 --> 00:04:34.780 304 | 勾配ブーストデシジョンツリー, 305 | ニューラルネットワーク, 306 | 307 | 64 308 | 00:04:34.780 --> 00:04:38.710 309 | ランダムフォレストとロジスティック回帰。 310 | 311 | 65 312 | 00:04:38.710 --> 00:04:42.289 313 | だから、このすべての2番目のレベルのモデ 314 | ルです。 315 | 316 | 66 317 | 00:04:43.340 --> 00:04:48.410 318 | そして最後に、我々は2番目のレベルのモデ 319 | ルのあなたの組み合わせでリニアを取った。 320 | 321 | 67 322 | 00:04:48.410 --> 00:04:54.770 323 | 線形モデルは、我々は推定係数に傾いていな 324 | いので、 325 | 326 | 68 327 | 00:04:54.770 --> 00:04:59.890 328 | 直接これらの4つの予測とデータを投げるた 329 | めの我々のターゲットを使用します。 330 | 331 | 69 332 | 00:04:59.890 --> 00:05:01.450 333 | だから、これは。 334 | 335 | 70 336 | 00:05:01.450 --> 00:05:06.244 337 | 我々は、この積み重ねスキームの各レベルを 338 | 経て、学生を行った。 339 | 340 | 71 341 | 00:05:06.244 --> 00:05:08.390 342 | なぜ我々はこのような複雑さが必要ですか? 
343 | 344 | 72 345 | 00:05:08.390 --> 00:05:13.795 346 | 別のモデルが異なるパターンを利用するので 347 | 、まあ、通常それは 348 | 349 | 73 350 | 00:05:13.795 --> 00:05:19.610 351 | データでは、我々は1つの強大なモデルでは 352 | 、このパターンのすべてを団結したい。 353 | 354 | 74 355 | 00:05:19.610 --> 00:05:22.930 356 | そして積み重ねは私達のためのそれを丁度す 357 | ることができる。 358 | 359 | 75 360 | 00:05:22.930 --> 00:05:24.970 361 | これはあまりにも複雑に見えるかもしれませ 362 | ん。 363 | 364 | 76 365 | 00:05:24.970 --> 00:05:29.480 366 | もちろん、それは競争の中でスキームのこの 367 | 種の上に移動するのに時間がかかります。 368 | 369 | 77 370 | 00:05:29.480 --> 00:05:32.630 371 | しかし、私たちのコースを完了した後、必ず 372 | 373 | 78 374 | 00:05:32.630 --> 00:05:37.310 375 | あなたはすでにこれを行う方法について十分 376 | な知識を持っている。 377 | 378 | 79 379 | 00:05:37.310 --> 00:05:41.580 380 | これらのスキームは、コンペティションの開 381 | 始時に最終的な形状には表示されません。 382 | 383 | 80 384 | 00:05:41.580 --> 00:05:44.800 385 | ほとんどの仕事は、通常、最初のレベルで行 386 | われます。 387 | 388 | 81 389 | 00:05:44.800 --> 00:05:51.770 390 | だから、多様なメタ機能を作成しようとする 391 | と、1つのシンプルモデルでそれらを団結。 392 | 393 | 82 394 | 00:05:51.770 --> 00:05:56.880 395 | 通常、あなたは、スタッキングの高品位第二 396 | レベルを作成し始める 397 | 398 | 83 399 | 00:05:56.880 --> 00:05:59.362 400 | 数日しか残っていないとき。 401 | 402 | 84 403 | 00:05:59.362 --> 00:06:04.480 404 | そして、その後、主にこのスキームの改善に 405 | 取り組んでいます。 406 | 407 | 85 408 | 00:06:04.480 --> 00:06:08.262 409 | そうは言っても、あなたはすでに必要な知識 410 | と 411 | 412 | 86 413 | 00:06:08.262 --> 00:06:11.570 414 | 今、あなたはちょうどそこにいくつかの練習 415 | を取得する必要があります。 416 | 417 | 87 418 | 00:06:11.570 --> 00:06:16.594 419 | 勤勉であり、疑いもなく、あなたは成功しま 420 | す。 421 | 422 | 88 423 | 00:06:16.594 --> 00:06:18.964 424 | 音 425 | 426 | 89 427 | 00:06:18.964 --> 00:06:28.964 428 | 音楽 429 | 430 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_5/2_Springleaf Marketing Response.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, 
Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_5/2_Springleaf Marketing Response.vtt: -------------------------------------------------------------------------------- 1 | WEBVTT 2 | 3 | 1 4 | 00:00:00.000 --> 00:00:03.802 5 | [MUSIC] 6 | 7 | 2 8 | 00:00:03.802 --> 00:00:08.527 9 | Hi, throughout the course, we use 10 | the Springleaf competition as a useful 11 | 12 | 3 13 | 00:00:08.527 --> 00:00:13.840 14 | example of EDA, mean encodings and 15 | features based on nearest neighbors. 16 | 17 | 4 18 | 00:00:13.840 --> 00:00:20.090 19 | Back then, we took the third place in 20 | this competition together with and. 21 | 22 | 5 23 | 00:00:20.090 --> 00:00:24.710 24 | And now in this video, I will describe 25 | the last part of our solution, 26 | 27 | 6 28 | 00:00:24.710 --> 00:00:28.470 29 | which is the usage of stacking and 30 | ensembles. 31 | 32 | 7 33 | 00:00:28.470 --> 00:00:33.349 34 | On this picture, you can see the final 35 | stacking scheme we produced on the level 36 | 37 | 8 38 | 00:00:33.349 --> 00:00:38.070 39 | 0 features, on the first level, 40 | predictions by basic models. 41 | 42 | 9 43 | 00:00:38.070 --> 00:00:40.110 44 | On the level one plus combination. 45 | 46 | 10 47 | 00:00:40.110 --> 00:00:44.270 48 | So these predictions and 49 | some accurately chosen features 50 | 51 | 11 52 | 00:00:44.270 --> 00:00:47.880 53 | on the second level models 54 | on this new set of features. 55 | 56 | 12 57 | 00:00:47.880 --> 00:00:52.290 58 | And finally, on the third level, 59 | their linear combination. 
60 | 61 | 13 62 | 00:00:52.290 --> 00:00:52.910 63 | In this video, 64 | 65 | 14 66 | 00:00:52.910 --> 00:00:58.301 67 | we will go through each level as it builds 68 | up to this non-trivial ensembled scheme. 69 | 70 | 15 71 | 00:00:59.430 --> 00:01:03.590 72 | But first, let's quickly remind 73 | ourselves about the problem. 74 | 75 | 16 76 | 00:01:03.590 --> 00:01:07.428 77 | This was a binary classification 78 | task with area under curve metric. 79 | 80 | 17 81 | 00:01:07.428 --> 00:01:15.270 82 | We had 145,000 samples in training data 83 | and about 2,000 anonymized features. 84 | 85 | 18 86 | 00:01:15.270 --> 00:01:19.740 87 | These were useful insights 88 | derived by us while doing EDA. 89 | 90 | 19 91 | 00:01:19.740 --> 00:01:26.080 92 | And you can check out EDA done by earlier 93 | in our course to refresh your memory. 94 | 95 | 20 96 | 00:01:26.080 --> 00:01:29.150 97 | So now let's start with features. 98 | 99 | 21 100 | 00:01:29.150 --> 00:01:33.040 101 | Here we have four packs of features. 102 | 103 | 22 104 | 00:01:33.040 --> 00:01:37.760 105 | First two are the basic dataset and 106 | the processed dataset. 107 | 108 | 23 109 | 00:01:37.760 --> 00:01:41.560 110 | To keep it simple, 111 | we just used insights derived from 112 | 113 | 24 114 | 00:01:41.560 --> 00:01:45.690 115 | EDA to clean data [INAUDIBLE] and 116 | to generate new features. 117 | 118 | 25 119 | 00:01:45.690 --> 00:01:49.280 120 | For example, 121 | we remove duplicated features and 122 | 123 | 26 124 | 00:01:49.280 --> 00:01:53.729 125 | edit some feature interaction based 126 | on scatter plots and correlations. 127 | 128 | 27 129 | 00:01:54.790 --> 00:01:59.815 130 | Then, we mean-encoded all categorical 131 | features using growth relation loop and 132 | 133 | 28 134 | 00:01:59.815 --> 00:02:02.050 135 | sign data and smoothing. 
136 | 137 | 29 138 | 00:02:02.050 --> 00:02:06.704 139 | We further used the mean-encoded dataset 140 | to calculate features based on nearest 141 | 142 | 30 143 | 00:02:06.704 --> 00:02:07.620 144 | neighbors. 145 | 146 | 31 147 | 00:02:07.620 --> 00:02:12.390 148 | Like, what is the least in 149 | closest object of the class zero? 150 | 151 | 32 152 | 00:02:12.390 --> 00:02:18.280 153 | And how many objects out of ten 154 | nearest neighbors belong to class one? 155 | 156 | 33 157 | 00:02:18.280 --> 00:02:21.119 158 | You can review how this could be done in 159 | 160 | 34 161 | 00:02:21.119 --> 00:02:24.677 162 | related topics introduced 163 | by Dmitri Altihof. 164 | 165 | 35 166 | 00:02:24.677 --> 00:02:31.469 167 | So finally, these four packs of 168 | feature were level 0 of our solution. 169 | 170 | 36 171 | 00:02:31.469 --> 00:02:35.952 172 | And the second level was represented 173 | by several different gradient within 174 | 175 | 37 176 | 00:02:35.952 --> 00:02:39.570 177 | decision tree models, 178 | and one neural network. 179 | 180 | 38 181 | 00:02:39.570 --> 00:02:44.120 182 | The main idea here is that meta 183 | features should be diverse. 184 | 185 | 39 186 | 00:02:44.120 --> 00:02:48.340 187 | Each meta feature should bring 188 | new information about the target. 189 | 190 | 40 191 | 00:02:48.340 --> 00:02:55.714 192 | So we use both distinct parameters and 193 | different sets of features for our models. 194 | 195 | 41 196 | 00:02:55.714 --> 00:03:00.449 197 | For the neural network, we additionally 198 | pre-processed features with 199 | 200 | 42 201 | 00:03:00.449 --> 00:03:04.600 202 | common scalars, ranks and 203 | power transformation. 204 | 205 | 43 206 | 00:03:04.600 --> 00:03:10.960 207 | The problem there was in huge outliers 208 | which skew network training results. 209 | 210 | 44 211 | 00:03:10.960 --> 00:03:15.120 212 | So ranks and power transformation 213 | helped to handle this problem. 
214 | 215 | 45 216 | 00:03:16.566 --> 00:03:19.990 217 | After producing meta features who is 218 | gradual in boosting decision to it and 219 | 220 | 46 221 | 00:03:19.990 --> 00:03:21.230 222 | neural networks, 223 | 224 | 47 225 | 00:03:21.230 --> 00:03:26.280 226 | we calculated pay rise differences 227 | on them to help next level models. 228 | 229 | 48 230 | 00:03:26.280 --> 00:03:30.570 231 | Note that this is also an interesting 232 | trick to force the model 233 | 234 | 49 235 | 00:03:30.570 --> 00:03:35.290 236 | to utilize the differences in 237 | the first level models predictions. 238 | 239 | 50 240 | 00:03:35.290 --> 00:03:40.370 241 | Here we edit two datasets of 242 | features based on nearest neighbors. 243 | 244 | 51 245 | 00:03:40.370 --> 00:03:45.380 246 | One was taken directly from level 0 and 247 | they contain the same features. 248 | 249 | 52 250 | 00:03:45.380 --> 00:03:51.040 251 | But it was calculated on the mean-encoded 252 | dataset to the power of one-half. 253 | 254 | 53 255 | 00:03:51.040 --> 00:03:55.704 256 | The point here was that these features 257 | were not completely utilized by 258 | 259 | 54 260 | 00:03:55.704 --> 00:03:57.370 261 | the first level models. 262 | 263 | 55 264 | 00:03:57.370 --> 00:04:02.690 265 | And indeed, they brought new pieces 266 | of information to this level. 267 | 268 | 56 269 | 00:04:03.740 --> 00:04:08.850 270 | Now we already have autofold 271 | predictions from the first level and 272 | 273 | 57 274 | 00:04:08.850 --> 00:04:11.110 275 | we will train with the models on them. 276 | 277 | 58 278 | 00:04:11.110 --> 00:04:16.230 279 | Because we could have target leakage 280 | here because of other folk, and 281 | 282 | 59 283 | 00:04:16.230 --> 00:04:18.610 284 | also because features not very good and 285 | 286 | 60 287 | 00:04:18.610 --> 00:04:23.600 288 | there are almost no patterns left 289 | in the data for models to discover. 
290 | 291 | 61 292 | 00:04:23.600 --> 00:04:29.160 293 | We chose simple classifiers, keeping in 294 | mind that predictions should be diverse. 295 | 296 | 62 297 | 00:04:29.160 --> 00:04:31.670 298 | We used four different models. 299 | 300 | 63 301 | 00:04:31.670 --> 00:04:34.780 302 | Gradient boosted decision tree, 303 | neural networks, 304 | 305 | 64 306 | 00:04:34.780 --> 00:04:38.710 307 | random forest and logistic regression. 308 | 309 | 65 310 | 00:04:38.710 --> 00:04:42.289 311 | So this is all with 312 | the second level models. 313 | 314 | 66 315 | 00:04:43.340 --> 00:04:48.410 316 | And finally, we took a linear in your 317 | combination of the second level models. 318 | 319 | 67 320 | 00:04:48.410 --> 00:04:54.770 321 | Because a linear model is not inclined 322 | to that we estimated coefficients 323 | 324 | 68 325 | 00:04:54.770 --> 00:04:59.890 326 | directly using these four predictions and 327 | our target for throwing in data. 328 | 329 | 69 330 | 00:04:59.890 --> 00:05:01.450 331 | So, this is it. 332 | 333 | 70 334 | 00:05:01.450 --> 00:05:06.244 335 | We just went through each level of this 336 | stacking scheme and then the student. 337 | 338 | 71 339 | 00:05:06.244 --> 00:05:08.390 340 | Why we need this kind of complexity? 341 | 342 | 72 343 | 00:05:08.390 --> 00:05:13.795 344 | Well, usually it's because different 345 | models utilize different patterns 346 | 347 | 73 348 | 00:05:13.795 --> 00:05:19.610 349 | in the data and we want to unite all 350 | of this patterns in one mighty model. 351 | 352 | 74 353 | 00:05:19.610 --> 00:05:22.930 354 | And stacking can do exactly that for us. 355 | 356 | 75 357 | 00:05:22.930 --> 00:05:24.970 358 | This may seem too complicated. 359 | 360 | 76 361 | 00:05:24.970 --> 00:05:29.480 362 | Of course, it takes time to move up to 363 | this kind of scheme in a competition. 
364 | 365 | 77 366 | 00:05:29.480 --> 00:05:32.630 367 | But be sure that after 368 | completion our course, 369 | 370 | 78 371 | 00:05:32.630 --> 00:05:37.310 372 | you already have enough 373 | knowledge about how to do this. 374 | 375 | 79 376 | 00:05:37.310 --> 00:05:41.580 377 | These schemes never appear in the final 378 | shape at the beginning of the competition. 379 | 380 | 80 381 | 00:05:41.580 --> 00:05:44.800 382 | Most work here usually is 383 | done on the first level. 384 | 385 | 81 386 | 00:05:44.800 --> 00:05:51.770 387 | So you try to create diverse meta features 388 | and unite them in one simple model. 389 | 390 | 82 391 | 00:05:51.770 --> 00:05:56.880 392 | Usually, you start to create the high 393 | grade second level of stacking, 394 | 395 | 83 396 | 00:05:56.880 --> 00:05:59.362 397 | when you have only a few days left. 398 | 399 | 84 400 | 00:05:59.362 --> 00:06:04.480 401 | And after that, you mostly work on 402 | the improvement of this scheme. 403 | 404 | 85 405 | 00:06:04.480 --> 00:06:08.262 406 | That said, you already have 407 | the required knowledge and 408 | 409 | 86 410 | 00:06:08.262 --> 00:06:11.570 411 | now you just need to get 412 | some practice out there. 413 | 414 | 87 415 | 00:06:11.570 --> 00:06:16.594 416 | Be diligent, and without a doubt, 417 | you will succeed. 
418 | 419 | 88 420 | 00:06:16.594 --> 00:06:18.964 421 | [SOUND] 422 | 423 | 89 424 | 00:06:18.964 --> 00:06:28.964 425 | [MUSIC] -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_5/3_Microsoft Malware Classification Challenge.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_5/4_Walmart Trip Type Classification.srt: -------------------------------------------------------------------------------- 1 | 1 2 | 00:00:03.010 --> 00:00:05.620 3 | こんにちは。このビデオでは、 4 | 5 | 2 6 | 00:00:05.620 --> 00:00:06.845 7 | 私は話をするつもりです 8 | 9 | 3 10 | 00:00:06.845 --> 00:00:12.380 11 | 年前の Kaggle カップルで開催され 12 | たウォルマートの旅行型分類の挑戦。 13 | 14 | 4 15 | 00:00:12.380 --> 00:00:14.795 16 | 私はその競技で1位を獲得した。 17 | 18 | 5 19 | 00:00:14.795 --> 00:00:17.990 20 | そして今、私はの最も興味深い部分について 21 | 教えてくれます 22 | 23 | 6 24 | 00:00:17.990 --> 00:00:21.704 25 | 問題と私の解決策について。 26 | 27 | 7 28 | 00:00:21.704 --> 00:00:25.814 29 | つまり、このプレゼンテーションは4つの部 30 | 分で構成されています。 31 | 32 | 8 33 | 00:00:25.814 --> 00:00:28.100 34 | まず、問題を述べます。 35 | 36 | 9 37 | 00:00:28.100 --> 00:00:31.699 38 | 第二に、我々はどのようなデータフォーマッ 39 | トとデータの再処理を理解する。 40 | 41 | 10 42 | 00:00:31.699 --> 00:00:35.570 43 | 第3に、モデルについてお話します。 44 | 45 | 11 46 | 00:00:35.570 --> 00:00:39.665 47 | その相対的な品質と一般のカシメスキームと 48 | の関係。 49 | 50 | 12 51 | 00:00:39.665 --> 00:00:44.285 52 | そして最後に、我々はここに新機能を生成す 53 | るいくつかの可能性を概説します。 54 | 55 | 13 56 | 00:00:44.285 --> 00:00:46.605 
57 | では、始めましょう。 58 | 59 | 14 60 | 00:00:46.605 --> 00:00:52.400 61 | 私たちのデータでは、我々はウォルマートで 62 | 2週間で自分の店を訪問した購入の人々があ 63 | った 64 | 65 | 15 66 | 00:00:52.400 --> 00:00:58.620 67 | そして、我々は38の訪問旅行の種類やクラ 68 | スにそれらを分類しなければならなかった。 69 | 70 | 16 71 | 00:00:58.620 --> 00:01:01.985 72 | のは、データ内の機能を簡単に見てみましょ 73 | う。 74 | 75 | 17 76 | 00:01:01.985 --> 00:01:04.947 77 | トリップタイプ列はターゲットを表し、 78 | 79 | 18 80 | 00:01:04.947 --> 00:01:08.110 81 | 訪問番号は、結合する ID を表します。 82 | 83 | 19 84 | 00:01:08.110 --> 00:01:12.075 85 | 1つのショッピング旅行で1つの顧客によっ 86 | て行われた購入。 87 | 88 | 20 89 | 00:01:12.075 --> 00:01:15.785 90 | たとえば、訪問番号7を作った顧客は、 91 | 92 | 21 93 | 00:01:15.785 --> 00:01:18.470 94 | に配置されている2つの項目を購入 95 | 96 | 22 97 | 00:01:18.470 --> 00:01:21.890 98 | このデータフレームの3番目の行にあります 99 | 。 100 | 101 | 23 102 | 00:01:21.890 --> 00:01:26.319 103 | 同じ訪問番号を持つすべての行に同じトリッ 104 | プタイプがあることに注意してください。 105 | 106 | 24 107 | 00:01:26.319 --> 00:01:32.383 108 | 重要な瞬間は、我々は訪問番号の旅行の種類 109 | を予測する必要があることです 110 | 111 | 25 112 | 00:01:32.383 --> 00:01:35.325 113 | そして列車データの各行のためではない。 114 | 115 | 26 116 | 00:01:35.325 --> 00:01:36.920 117 | 見ての通り 118 | 119 | 27 120 | 00:01:36.920 --> 00:01:42.850 121 | 電車の中で我々は約647000を持ってい 122 | る 123 | 124 | 28 125 | 00:01:42.850 --> 00:01:49.305 126 | 行とのみ95000の訪問。 127 | 128 | 29 129 | 00:01:49.305 --> 00:01:51.085 130 | 機能に戻る, 次の機能は、平日です 131 | 132 | 30 133 | 00:01:51.085 --> 00:01:54.030 134 | 明らかに訪問の平日を表しています。 135 | 136 | 31 137 | 00:01:54.030 --> 00:01:55.980 138 | 次は UPC です。 139 | 140 | 32 141 | 00:01:55.980 --> 00:01:59.635 142 | UPC は、購入したアイテムの正確な 143 | ID です。 144 | 145 | 33 146 | 00:01:59.635 --> 00:02:01.742 147 | 次に、スキャンカウント。 148 | 149 | 34 150 | 00:02:01.742 --> 00:02:05.408 151 | スキャン数は、購入したアイテムの正確な数 152 | です。 153 | 154 | 35 155 | 00:02:05.408 --> 00:02:10.420 156 | ここでマイナス1は購入ではなく返品を表し 157 | ていることに注意してください。 158 | 159 | 36 160 | 00:02:10.420 --> 00:02:13.285 161 | 次の機能、部門の説明、 162 | 163 | 37 164 | 00:02:13.285 --> 00:02:18.330 165 | と68ユニークな値は、項目のための広いカ 166 | テゴリです。 167 | 168 | 
38 169 | 00:02:18.330 --> 00:02:20.307 170 | そして最後に、fineline 番号、 171 | 172 | 39 173 | 00:02:20.307 --> 00:02:22.670 174 | 約5000のユニークな値で、 175 | 176 | 40 177 | 00:02:22.670 --> 00:02:25.950 178 | は、アイテムのより洗練されたカテゴリです 179 | 。 180 | 181 | 41 182 | 00:02:25.950 --> 00:02:29.095 183 | この機能が何を表しているかを理解した後、 184 | 185 | 42 186 | 00:02:29.095 --> 00:02:33.655 187 | 訪問番号ごとに1つの予測を行う必要がある 188 | ことを思い出してみましょう。 189 | 190 | 43 191 | 00:02:33.655 --> 00:02:37.255 192 | のは、訪問番号8のデータを見てみましょう 193 | 。 194 | 195 | 44 196 | 00:02:37.255 --> 00:02:39.565 197 | 我々はここを見ることができます 198 | 199 | 45 200 | 00:02:39.565 --> 00:02:44.315 201 | この特定の訪問は、カテゴリの塗料やアクセ 202 | サリーの購入がたくさんある 203 | 204 | 46 205 | 00:02:44.315 --> 00:02:47.920 206 | つまり、トリップタイプ番号26は、 207 | 208 | 47 209 | 00:02:47.920 --> 00:02:52.360 210 | そのカテゴリ内のほとんどの購入を訪問を表 211 | します。 212 | 213 | 48 214 | 00:02:52.360 --> 00:02:55.465 215 | さて、ここで鉄道模型にアプローチする方法 216 | 。 217 | 218 | 49 219 | 00:02:55.465 --> 00:02:59.525 220 | データをもう一度見て、可能性を評価してみ 221 | ましょう。 222 | 223 | 50 224 | 00:02:59.525 --> 00:03:05.027 225 | 我々は、リスト上の各項目のトリップタイプ 226 | を予測するか、我々は別の方法を選択する必 227 | 要がありますか? 
228 | 229 | 51 230 | 00:03:05.027 --> 00:03:07.637 231 | もちろん二人とも可能ですが、 232 | 233 | 52 234 | 00:03:07.637 --> 00:03:09.095 235 | でも最初のうちは 236 | 237 | 53 238 | 00:03:09.095 --> 00:03:13.428 239 | 各データセットを使用して各行のトリップタ 240 | イプを予測し、 241 | 242 | 54 243 | 00:03:13.428 --> 00:03:18.170 244 | 私達は同じ訪問に属する項目間の重要な相互 245 | 作用を逃す。 246 | 247 | 55 248 | 00:03:18.170 --> 00:03:22.223 249 | 例えば、トリップタイプは、26の数がある 250 | かもしれませんが、 251 | 252 | 56 253 | 00:03:22.223 --> 00:03:27.309 254 | その項目の半分以上の場合は、塗料やアクセ 255 | サリーからです。 256 | 257 | 57 258 | 00:03:27.309 --> 00:03:31.170 259 | しかし、我々はこれらの項目間の相互作用を 260 | 考慮しない場合は、 261 | 262 | 58 263 | 00:03:31.170 --> 00:03:33.580 264 | それはかなり予測するのは難しいことができ 265 | ます。 266 | 267 | 59 268 | 00:03:33.580 --> 00:03:38.155 269 | だから、訪問とメイキングですべての購入を 270 | 結合する2番目のオプション 271 | 272 | 60 273 | 00:03:38.155 --> 00:03:43.250 274 | 各行が完全な訪問を表すデータセットは、よ 275 | り合理的なようです。 276 | 277 | 61 278 | 00:03:43.250 --> 00:03:45.658 279 | と、期待できるように、 280 | 281 | 62 282 | 00:03:45.658 --> 00:03:51.375 283 | このアプローチは、競争の中でより重要な利 284 | 点につながります。 285 | 286 | 63 287 | 00:03:51.375 --> 00:03:56.330 288 | 私はあなたの目的の1つにデータ形式を変更 289 | する最も簡単な方法を示すつもりです。 290 | 291 | 64 292 | 00:03:56.330 --> 00:04:00.815 293 | 例を目的として、部署の説明機能を選択して 294 | みましょう。 295 | 296 | 65 297 | 00:04:00.815 --> 00:04:04.900 298 | まず、データフレームを訪問番号でグループ 299 | 化して、 300 | 301 | 66 302 | 00:04:04.900 --> 00:04:09.915 303 | 各部門の説明が訪問に何回存在するかを計算 304 | します。 305 | 306 | 67 307 | 00:04:09.915 --> 00:04:14.010 308 | では、最後のグループを unstack 309 | てみましょう。 310 | 311 | 68 312 | 00:04:14.010 --> 00:04:19.285 313 | 列ので、各部門の説明値の一意の列を取得し 314 | ます。 315 | 316 | 69 317 | 00:04:19.285 --> 00:04:22.210 318 | 今、これは我々が欲しかった形式です。 319 | 320 | 70 321 | 00:04:22.210 --> 00:04:27.645 322 | 各行は訪問を表し、各列はその訪問で説明さ 323 | れている機能です。 324 | 325 | 71 326 | 00:04:27.645 --> 00:04:32.710 327 | 我々は、部門の説明以外の他の機能のアプロ 328 | ーチでこのグループを使用することができま 329 | す。 330 | 331 | 72 332 | 00:04:32.710 --> 00:04:39.755 333 | また、訪問中の項目は、実際にはテキスト内 334 | の単語に非常に似ていることに注意してくだ 335 | さい。 336 
| 337 | 73 338 | 00:04:39.755 --> 00:04:44.680 339 | 我々の確認後、各機能は、ここでカウントを 340 | 表し、 341 | 342 | 74 343 | 00:04:44.680 --> 00:04:47.865 344 | だから、通常のテキストで動作するアイデア 345 | を適用することが、 346 | 347 | 75 348 | 00:04:47.865 --> 00:04:51.215 349 | たとえば、tf-idf 変換。 350 | 351 | 76 352 | 00:04:51.215 --> 00:04:55.565 353 | お察しのとおり、多くの可能性がここに出て 354 | くる。 355 | 356 | 77 357 | 00:04:55.565 --> 00:05:00.999 358 | すごい。これが行われ、我々は所望の形式で 359 | データを処理した後、 360 | 361 | 78 362 | 00:05:00.999 --> 00:05:03.100 363 | モデル選びに移りましょう。 364 | 365 | 79 366 | 00:05:03.100 --> 00:05:05.750 367 | すでに話し合ったことを踏まえて、 368 | 369 | 80 370 | 00:05:05.750 --> 00:05:08.620 371 | もし我々が大きな違いを期待する必要があり 372 | ます推測することができます 373 | 374 | 81 375 | 00:05:08.620 --> 00:05:12.485 376 | 線形モデルとツリーベースのモデルの間のス 377 | コアはここですか? 378 | 379 | 82 380 | 00:05:12.485 --> 00:05:15.715 381 | これについて少し考えてみてください。 382 | 383 | 83 384 | 00:05:15.715 --> 00:05:19.795 385 | たとえば、線形モデルが 386 | 387 | 84 388 | 00:05:19.795 --> 00:05:24.745 389 | ツリーベースのモデルと比較して実行します 390 | か?はい、あります。 391 | 392 | 85 393 | 00:05:24.745 --> 00:05:27.785 394 | 繰り返しますが、私はここでの相互作用につ 395 | いて話している。 396 | 397 | 86 398 | 00:05:27.785 --> 00:05:31.120 399 | 実際、ニューラルネットワークにおける木ベ 400 | ースのモデルは、 401 | 402 | 87 403 | 00:05:31.120 --> 00:05:36.315 404 | この非常に理由のためにこの競争の中で品質 405 | の重要な優位性。 406 | 407 | 88 408 | 00:05:36.315 --> 00:05:42.855 409 | しかし、それでも、1つは便利なメソッドの 410 | 機能をここに生成する線形モデルと TNN 411 | を使用することができます。 412 | 413 | 89 414 | 00:05:42.855 --> 00:05:46.430 415 | 彼らは相互作用を意味していないという事実 416 | にもかかわらず、 417 | 418 | 90 419 | 00:05:46.430 --> 00:05:50.230 420 | 彼らは私の一般的な賭けのスキームで貴重な 421 | 資産だった。 422 | 423 | 91 424 | 00:05:50.230 --> 00:05:54.295 425 | 私たちはここで賭けの詳細に行くことはあり 426 | ませんので、 427 | 428 | 92 429 | 00:05:54.295 --> 00:05:58.590 430 | 既に競争についての他のビデオのほとんどの 431 | 考えを覆った。 432 | 433 | 93 434 | 00:05:58.590 --> 00:06:03.165 435 | その代わりに、機能の生成について少し話し 436 | ます。 437 | 438 | 94 439 | 00:06:03.165 --> 00:06:07.798 440 | 1回の来店で購入した項目間のやりとり以外 441 | は、 442 | 443 | 95 444 | 00:06:07.798 --> 
00:06:10.975 445 | 1つは機能間の相互作用を利用することを試 446 | みることができる。 447 | 448 | 96 449 | 00:06:10.975 --> 00:06:15.290 450 | ここで面白いと予想外の結果は、 451 | 452 | 97 453 | 00:06:15.290 --> 00:06:19.993 454 | 1つの fineline 番号は複数の部 455 | 門の記述に属することができる 456 | 457 | 98 458 | 00:06:19.993 --> 00:06:23.345 459 | つまり、fineline 番号は 460 | 461 | 99 462 | 00:06:23.345 --> 00:06:28.195 463 | あなたが考えることができるように、より詳 464 | 細な部門の説明。 465 | 466 | 100 467 | 00:06:28.195 --> 00:06:33.200 468 | この相互作用を使用して、1つは彼のモデル 469 | を更に改善できる。 470 | 471 | 101 472 | 00:06:33.200 --> 00:06:35.536 473 | もう一つの興味深い特徴の生成の考え 474 | 475 | 102 476 | 00:06:35.536 --> 00:06:38.875 477 | データの時間構造に接続した。 478 | 479 | 103 480 | 00:06:38.875 --> 00:06:41.895 481 | このプロットを見てください, 482 | 483 | 104 484 | 00:06:41.895 --> 00:06:46.395 485 | これは、行番号に対する曜日機能の変更を表 486 | します。 487 | 488 | 105 489 | 00:06:46.395 --> 00:06:50.230 490 | これは、データが時間によってここに注文さ 491 | れているように見えます。 492 | 493 | 106 494 | 00:06:50.230 --> 00:06:54.210 495 | データは31日で構成されています。 496 | 497 | 107 498 | 00:06:54.210 --> 00:06:57.350 499 | しかし、列車のテストの分割時間ベースでは 500 | なかった。 501 | 502 | 108 503 | 00:06:57.350 --> 00:07:02.533 504 | そのため、データセットの日番号のような機 505 | 能を派生させることができ、 506 | 507 | 109 508 | 00:07:02.533 --> 00:07:04.940 509 | 1日の訪問数、 510 | 511 | 110 512 | 00:07:04.940 --> 00:07:08.645 513 | と一日の訪問の合計金額。 514 | 515 | 111 516 | 00:07:08.645 --> 00:07:10.970 517 | だから、これは。 518 | 519 | 112 520 | 00:07:10.970 --> 00:07:15.350 521 | 私達はちょうどこの競争の最も興味深い部分 522 | を論議した。 523 | 524 | 113 525 | 00:07:15.350 --> 00:07:18.425 526 | データ形式をより適切に変更すると、 527 | 528 | 114 529 | 00:07:18.425 --> 00:07:21.214 530 | 販売しながら機能を生成する、 531 | 532 | 115 533 | 00:07:21.214 --> 00:07:24.300 534 | 積み重ねをしながらモデルを操作する。 535 | 536 | 116 537 | 00:07:24.300 --> 00:07:28.365 538 | そして最後に、追加の機能エンジニアリング 539 | のいくつかを行う。 540 | 541 | 117 542 | 00:07:28.365 --> 00:07:32.395 543 | 挑戦自体は有用、興味深い証明した。 544 | 545 | 118 546 | 00:07:32.395 --> 00:07:38.770 547 | そして、私はそれをチェックアウトし、我々 548 | が話しているアプローチを試してみることを 549 | お勧めします。 550 | 551 | 
-------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_5/4_Walmart Trip Type Classification.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_5/5_Acquire Valued Shoppers Challenge part 1.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | -------------------------------------------------------------------------------- /cousera/howtowin_kaggle/week_5/6_Acquire Valued Shoppers Challenge part 2.srt.style: -------------------------------------------------------------------------------- 1 | ScriptType: v4.00+ 2 | 3 | [V4+ Styles] 4 | Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding 5 | Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1 6 | 
# -*- coding: utf-8 -*-
"""Translate the cue text of a WebVTT subtitle file into Japanese.

Reads a ``.vtt`` file, sends each cue's English text to the Microsoft
Translator v3 REST API, and writes a SubRip ``.srt`` file next to the
input, plus a companion ``.srt.style`` file with an ASS style block
used by the local playback setup.

Usage:
    python translate.py -i path/to/subtitles.vtt
"""

import http.client, urllib.parse, uuid, json
import argparse
import sys
import textwrap

# **********************************************
# *** Update or verify the following values. ***
# **********************************************
# Replace the subscriptionKey string value with your valid subscription key.
subscriptionKey = ''
host = 'api.cognitive.microsofttranslator.com'
path = '/translate?api-version=3.0'
# Translate to Japanese
params = "&to=ja"


def translate(content):
    """POST ``content`` (a JSON-encoded request body, as bytes) to the
    Translator API and return the parsed JSON response.

    Returns the decoded response: a list with one entry per input text,
    each entry carrying a ``translations`` list.

    Raises RuntimeError when the service answers with a non-200 status,
    so callers fail with a clear message instead of a KeyError later.
    """
    headers = {
        'Ocp-Apim-Subscription-Key': subscriptionKey,
        'Content-type': 'application/json',
        'X-ClientTraceId': str(uuid.uuid4()),
    }
    conn = http.client.HTTPSConnection(host)
    try:
        conn.request("POST", path + params, content, headers)
        response = conn.getresponse()
        body = response.read()  # read fully before closing the connection
        if response.status != 200:
            raise RuntimeError(
                'translation request failed: HTTP %d %s'
                % (response.status, response.reason))
        return json.loads(body)
    finally:
        # Always release the socket; the original leaked one per cue.
        conn.close()


def _convert_vtt_to_srt(in_file, out_file):
    """Convert ``in_file`` (.vtt) into ``out_file`` (.srt), translating
    every cue's text to Japanese via :func:`translate`.

    Cues are located by their sequential number (1, 2, 3, ...) on a line
    of their own; the WEBVTT header and any metadata lines are skipped
    naturally because they never match the expected cue number.
    """
    comment_no = 1
    with open(in_file, 'r') as f, open(out_file, 'w') as out_f:
        for row in f:
            if row.strip() != str(comment_no):
                continue
            out_f.write(str(comment_no) + "\n")
            # Timestamp line: VTT uses '.' before milliseconds, the SRT
            # format requires ','.
            out_f.write(next(f).replace('.', ','))

            # Collect the cue text, which may span several lines, until a
            # blank separator line or the end of the file.
            eng_text = next(f).strip()
            try:
                work = next(f).strip()
                while work != '':
                    eng_text += " " + str(work)
                    work = next(f).strip()
            except StopIteration:
                pass  # last cue: file ended without a trailing blank line

            requestBody = [{
                'Text': eng_text,
            }]
            content = json.dumps(requestBody, ensure_ascii=False).encode('utf-8')
            result = translate(content)
            jpn_text = result[0]['translations'][0]['text']
            # Hard-wrap at 20 characters so subtitles fit on screen.
            jpn_text = "\n".join(textwrap.wrap(jpn_text, width=20))
            out_f.write(jpn_text + "\n")
            out_f.write('\n')

            comment_no = comment_no + 1


def write_style_file(style_file):
    """Write the companion ASS style definition file used for playback."""
    with open(style_file, 'w') as f:
        f.write("ScriptType: v4.00+\n")
        f.write("\n")
        f.write("[V4+ Styles]\n")
        f.write("Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding\n")
        f.write("Style: Default,Arial,20,&H00ffffff,&H0000ffff,&H00000000,&H80000000,-1,0,0,0,100,100,0,0.00,1,2,2,1,95,20,20,1\n")


def main():
    """Parse the command line and run the VTT -> SRT translation."""
    parser = argparse.ArgumentParser(description='translate program')
    parser.add_argument('-i', '--in', action='store', dest='in_file',
                        required=True, help='translate input file')
    args = parser.parse_args()

    if not str(args.in_file).endswith('.vtt'):
        print('invalid in_file. not vtt file. file=' + str(args.in_file))
        sys.exit()

    # Swap only the trailing extension; the original str.replace() would
    # also corrupt any other occurrence of "vtt" in the path.
    out_file = str(args.in_file)[:-len('.vtt')] + '.srt'

    print("start srt file")
    _convert_vtt_to_srt(args.in_file, out_file)
    print("end srt file")

    print("start style file")
    write_style_file(out_file + ".style")
    print("end style file")


if __name__ == '__main__':
    main()
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/share/Week_1(T.Nakao).pptx -------------------------------------------------------------------------------- /cousera/share/Week_1(takagishi).pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/share/Week_1(takagishi).pdf -------------------------------------------------------------------------------- /cousera/share/Week_1(takagishi).pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/share/Week_1(takagishi).pptx -------------------------------------------------------------------------------- /cousera/share/week3_門脇_Concept of mean encoding.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/share/week3_門脇_Concept of mean encoding.pptx -------------------------------------------------------------------------------- /cousera/share/week3_門脇_Regularization.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/cousera/share/week3_門脇_Regularization.pptx -------------------------------------------------------------------------------- /wiki/cousera/3week_cls_image01.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/3week_cls_image01.jpg -------------------------------------------------------------------------------- 
/wiki/cousera/3week_cls_image02.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/3week_cls_image02.jpg -------------------------------------------------------------------------------- /wiki/cousera/3week_cls_image03.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/3week_cls_image03.jpg -------------------------------------------------------------------------------- /wiki/cousera/3week_cls_image04.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/3week_cls_image04.jpg -------------------------------------------------------------------------------- /wiki/cousera/3week_cls_image05.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/3week_cls_image05.jpg -------------------------------------------------------------------------------- /wiki/cousera/3week_cls_image06.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/3week_cls_image06.jpg -------------------------------------------------------------------------------- /wiki/cousera/3week_cls_image07.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/3week_cls_image07.jpg -------------------------------------------------------------------------------- /wiki/cousera/3week_cls_image08.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/3week_cls_image08.jpg -------------------------------------------------------------------------------- /wiki/cousera/3week_cls_image09.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/3week_cls_image09.jpg -------------------------------------------------------------------------------- /wiki/cousera/3week_cls_image10.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/3week_cls_image10.jpg -------------------------------------------------------------------------------- /wiki/cousera/3week_cls_image11.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/3week_cls_image11.jpg -------------------------------------------------------------------------------- /wiki/cousera/3week_cls_image12.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/3week_cls_image12.jpg -------------------------------------------------------------------------------- /wiki/cousera/3week_cls_image13.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/3week_cls_image13.jpg -------------------------------------------------------------------------------- /wiki/cousera/3week_image001.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/3week_image001.png -------------------------------------------------------------------------------- /wiki/cousera/3week_image004.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/3week_image004.png -------------------------------------------------------------------------------- /wiki/cousera/3week_image007.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/3week_image007.png -------------------------------------------------------------------------------- /wiki/cousera/3week_image011.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/3week_image011.png -------------------------------------------------------------------------------- /wiki/cousera/3week_image012.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/3week_image012.png -------------------------------------------------------------------------------- /wiki/cousera/3week_image022.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/3week_image022.png -------------------------------------------------------------------------------- /wiki/cousera/3week_image025.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/3week_image025.png -------------------------------------------------------------------------------- /wiki/cousera/3week_image026.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/3week_image026.png -------------------------------------------------------------------------------- /wiki/cousera/3week_image027.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/3week_image027.png -------------------------------------------------------------------------------- /wiki/cousera/3week_image028.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/3week_image028.png -------------------------------------------------------------------------------- /wiki/cousera/3week_image029.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/3week_image029.png -------------------------------------------------------------------------------- /wiki/cousera/3week_image030.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/3week_image030.png -------------------------------------------------------------------------------- /wiki/cousera/3week_mean-encoding001.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/3week_mean-encoding001.png -------------------------------------------------------------------------------- /wiki/cousera/3week_mean-encoding002.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/3week_mean-encoding002.png -------------------------------------------------------------------------------- /wiki/cousera/3week_mean-encoding003.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/3week_mean-encoding003.png -------------------------------------------------------------------------------- /wiki/cousera/3week_mean-encoding004.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/3week_mean-encoding004.png -------------------------------------------------------------------------------- /wiki/cousera/3week_mean-encoding005.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/3week_mean-encoding005.png -------------------------------------------------------------------------------- /wiki/cousera/3week_mean-encoding006.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/3week_mean-encoding006.png -------------------------------------------------------------------------------- /wiki/cousera/3week_mean-encoding007.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/3week_mean-encoding007.png -------------------------------------------------------------------------------- /wiki/cousera/3week_mean-encoding008.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/3week_mean-encoding008.png -------------------------------------------------------------------------------- /wiki/cousera/3week_mean-encoding009.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/3week_mean-encoding009.png -------------------------------------------------------------------------------- /wiki/cousera/3week_mean-encoding010.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/3week_mean-encoding010.png -------------------------------------------------------------------------------- /wiki/cousera/3week_mean-encoding011.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/3week_mean-encoding011.png -------------------------------------------------------------------------------- /wiki/cousera/3week_mean-encoding012.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/3week_mean-encoding012.png -------------------------------------------------------------------------------- 
/wiki/cousera/3week_mean-encoding013.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/3week_mean-encoding013.png -------------------------------------------------------------------------------- /wiki/cousera/3week_mean-encoding014.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/3week_mean-encoding014.png -------------------------------------------------------------------------------- /wiki/cousera/3week_mean-encoding015.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/3week_mean-encoding015.png -------------------------------------------------------------------------------- /wiki/cousera/3week_mean-encoding016.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/3week_mean-encoding016.png -------------------------------------------------------------------------------- /wiki/cousera/3week_mean-encoding017.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/3week_mean-encoding017.png -------------------------------------------------------------------------------- /wiki/cousera/3week_mean-encoding018.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/3week_mean-encoding018.png 
-------------------------------------------------------------------------------- /wiki/cousera/4week_AdvancedFeatures001.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/4week_AdvancedFeatures001.png -------------------------------------------------------------------------------- /wiki/cousera/4week_AdvancedFeatures002.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/4week_AdvancedFeatures002.png -------------------------------------------------------------------------------- /wiki/cousera/4week_AdvancedFeatures003.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/4week_AdvancedFeatures003.png -------------------------------------------------------------------------------- /wiki/cousera/4week_AdvancedFeatures004.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/4week_AdvancedFeatures004.png -------------------------------------------------------------------------------- /wiki/cousera/4week_AdvancedFeatures005.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/4week_AdvancedFeatures005.png -------------------------------------------------------------------------------- /wiki/cousera/4week_AdvancedFeatures006.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/4week_AdvancedFeatures006.png -------------------------------------------------------------------------------- /wiki/cousera/4week_AdvancedFeatures007.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/4week_AdvancedFeatures007.png -------------------------------------------------------------------------------- /wiki/cousera/4week_AdvancedFeatures008.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/4week_AdvancedFeatures008.png -------------------------------------------------------------------------------- /wiki/cousera/4week_AdvancedFeatures009.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/4week_AdvancedFeatures009.png -------------------------------------------------------------------------------- /wiki/cousera/4week_AdvancedFeatures010.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/4week_AdvancedFeatures010.png -------------------------------------------------------------------------------- /wiki/cousera/4week_Ensemble_Tips1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/4week_Ensemble_Tips1.png -------------------------------------------------------------------------------- /wiki/cousera/4week_Ensemble_Tips2.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/4week_Ensemble_Tips2.png -------------------------------------------------------------------------------- /wiki/cousera/4week_Ensemble_Tips3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/4week_Ensemble_Tips3.png -------------------------------------------------------------------------------- /wiki/cousera/4week_Ensemble_Tips4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/4week_Ensemble_Tips4.png -------------------------------------------------------------------------------- /wiki/cousera/4week_KNNfeatures/data/knn_feats_cosine_test.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/4week_KNNfeatures/data/knn_feats_cosine_test.npy -------------------------------------------------------------------------------- /wiki/cousera/4week_KNNfeatures/data/knn_feats_cosine_train.npy.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/4week_KNNfeatures/data/knn_feats_cosine_train.npy.zip -------------------------------------------------------------------------------- /wiki/cousera/4week_KNNfeatures/data/knn_feats_minkowski_test.npy: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/4week_KNNfeatures/data/knn_feats_minkowski_test.npy -------------------------------------------------------------------------------- /wiki/cousera/4week_KNNfeatures/data/knn_feats_minkowski_train.npy.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/4week_KNNfeatures/data/knn_feats_minkowski_train.npy.zip -------------------------------------------------------------------------------- /wiki/cousera/4week_NeuralNet001.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/4week_NeuralNet001.png -------------------------------------------------------------------------------- /wiki/cousera/4week_Practicalguide001.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/4week_Practicalguide001.png -------------------------------------------------------------------------------- /wiki/cousera/4week_Practicalguide002.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/4week_Practicalguide002.png -------------------------------------------------------------------------------- /wiki/cousera/4week_Practicalguide003.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/4week_Practicalguide003.png -------------------------------------------------------------------------------- 
/wiki/cousera/4week_Practicalguide004.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/4week_Practicalguide004.png -------------------------------------------------------------------------------- /wiki/cousera/4week_StackNet1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/4week_StackNet1.png -------------------------------------------------------------------------------- /wiki/cousera/4week_StackNet2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/4week_StackNet2.png -------------------------------------------------------------------------------- /wiki/cousera/4week_Stacking1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/4week_Stacking1.png -------------------------------------------------------------------------------- /wiki/cousera/4week_ensemble_bagging1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/4week_ensemble_bagging1.jpg -------------------------------------------------------------------------------- /wiki/cousera/4week_ensemble_bagging2.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/4week_ensemble_bagging2.PNG -------------------------------------------------------------------------------- 
/wiki/cousera/4week_ensemble_bagging3.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/4week_ensemble_bagging3.PNG -------------------------------------------------------------------------------- /wiki/cousera/4week_ensemble_bagging4.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/4week_ensemble_bagging4.PNG -------------------------------------------------------------------------------- /wiki/cousera/4week_ensemble_bagging5.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/4week_ensemble_bagging5.PNG -------------------------------------------------------------------------------- /wiki/cousera/4week_ensemble_boosting1.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/4week_ensemble_boosting1.PNG -------------------------------------------------------------------------------- /wiki/cousera/4week_ensemble_boosting2.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/4week_ensemble_boosting2.PNG -------------------------------------------------------------------------------- /wiki/cousera/4week_ensemble_boosting3.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/4week_ensemble_boosting3.PNG 
-------------------------------------------------------------------------------- /wiki/cousera/4week_ensemble_boosting4.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/4week_ensemble_boosting4.PNG -------------------------------------------------------------------------------- /wiki/cousera/4week_ensemble_boosting5.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/4week_ensemble_boosting5.PNG -------------------------------------------------------------------------------- /wiki/cousera/4week_ensemble_boosting6.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/4week_ensemble_boosting6.PNG -------------------------------------------------------------------------------- /wiki/cousera/4week_ensemble_boosting7.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/4week_ensemble_boosting7.PNG -------------------------------------------------------------------------------- /wiki/cousera/4week_ensemble_boosting8.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/4week_ensemble_boosting8.PNG -------------------------------------------------------------------------------- /wiki/cousera/4week_ensemble_intro1.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/4week_ensemble_intro1.png -------------------------------------------------------------------------------- /wiki/cousera/4week_ensemble_intro2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/4week_ensemble_intro2.png -------------------------------------------------------------------------------- /wiki/cousera/4week_ensemble_intro3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/4week_ensemble_intro3.png -------------------------------------------------------------------------------- /wiki/cousera/4week_ensemble_intro4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/4week_ensemble_intro4.png -------------------------------------------------------------------------------- /wiki/cousera/5week_image_tn_001.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/5week_image_tn_001.png -------------------------------------------------------------------------------- /wiki/cousera/5week_image_tn_002.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/5week_image_tn_002.png -------------------------------------------------------------------------------- /wiki/cousera/5week_image_tn_003.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/5week_image_tn_003.png -------------------------------------------------------------------------------- /wiki/cousera/5week_image_tn_004.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/5week_image_tn_004.png -------------------------------------------------------------------------------- /wiki/cousera/5week_image_tn_005.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/5week_image_tn_005.png -------------------------------------------------------------------------------- /wiki/cousera/5week_image_tn_006.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/5week_image_tn_006.png -------------------------------------------------------------------------------- /wiki/cousera/5week_image_tn_007.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/5week_image_tn_007.png -------------------------------------------------------------------------------- /wiki/cousera/5week_image_tn_008.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/5week_image_tn_008.png -------------------------------------------------------------------------------- /wiki/cousera/5week_image_tn_009.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/5week_image_tn_009.png -------------------------------------------------------------------------------- /wiki/cousera/5week_image_tn_101.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/5week_image_tn_101.png -------------------------------------------------------------------------------- /wiki/cousera/5week_image_tn_102.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/5week_image_tn_102.png -------------------------------------------------------------------------------- /wiki/cousera/5week_image_tn_103.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/5week_image_tn_103.png -------------------------------------------------------------------------------- /wiki/cousera/5week_image_tn_104.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/5week_image_tn_104.png -------------------------------------------------------------------------------- /wiki/cousera/5week_image_tn_105.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/5week_image_tn_105.png -------------------------------------------------------------------------------- /wiki/cousera/5week_image_tn_106.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/5week_image_tn_106.png -------------------------------------------------------------------------------- /wiki/cousera/5week_image_tn_107.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/5week_image_tn_107.png -------------------------------------------------------------------------------- /wiki/cousera/5week_image_tn_108.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/5week_image_tn_108.png -------------------------------------------------------------------------------- /wiki/cousera/5week_image_tn_109.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/5week_image_tn_109.png -------------------------------------------------------------------------------- /wiki/cousera/5week_image_tn_110.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/5week_image_tn_110.png -------------------------------------------------------------------------------- /wiki/cousera/5week_image_tn_111.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/5week_image_tn_111.png -------------------------------------------------------------------------------- /wiki/cousera/5week_image_tn_112.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/5week_image_tn_112.png -------------------------------------------------------------------------------- /wiki/cousera/5week_image_tn_113.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/5week_image_tn_113.png -------------------------------------------------------------------------------- /wiki/cousera/Programming assignment, week 4_ Ensembles/__pycache__/grader.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/Programming assignment, week 4_ Ensembles/__pycache__/grader.cpython-36.pyc -------------------------------------------------------------------------------- /wiki/cousera/Programming assignment, week 4_ Ensembles/grader.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | import numpy as np 4 | from collections import OrderedDict 5 | 6 | def array_to_hash(x): 7 | x_tupled = None 8 | if type(x) == list: 9 | x_tupled = tuple(x) 10 | elif type(x) == np.ndarray: 11 | x_tupled = tuple(list(x.flatten())) 12 | elif type(x) == tuple: 13 | x_tupled = x 14 | else: 15 | raise RuntimeError('unexpected type of input: {}'.format(type(x))) 16 | return hash(tuple(map(float, x_tupled))) 17 | 18 | def almostEqual(x, y): 19 | return abs(x - y) < 1e-5 20 | 21 | 22 | class Grader(object): 23 | def __init__(self): 24 | self.submission_page = 'https://www.coursera.org/api/onDemandProgrammingScriptSubmissions.v1' 25 | self.assignment_key = 'Lhay-55JEeet3xIBvGMumA' 26 | self.parts = OrderedDict([ 27 | ('EyiFH', 'best_alpha'), 28 | ('XH82R', 'r2_train_simple_mix'), 29 | ('BHeRs', 'r2_test_simple_mix'), 30 | ('MkwCS', 
'r2_train_stacking'), 31 | ('j4Adb', 'r2_test_stacking'), 32 | ]) 33 | self.answers = {key: None for key in self.parts} 34 | 35 | @staticmethod 36 | def ravel_output(output): 37 | ''' 38 | If student accedentally submitted np.array with one 39 | element instead of number, this function will submit 40 | this number instead 41 | ''' 42 | if isinstance(output, np.ndarray) and output.size == 1: 43 | output = output.item(0) 44 | return output 45 | 46 | def submit(self, email, token): 47 | submission = { 48 | "assignmentKey": self.assignment_key, 49 | "submitterEmail": email, 50 | "secret": token, 51 | "parts": {} 52 | } 53 | for part, output in self.answers.items(): 54 | if output is not None: 55 | submission["parts"][part] = {"output": output} 56 | else: 57 | submission["parts"][part] = dict() 58 | request = requests.post(self.submission_page, data=json.dumps(submission)) 59 | response = request.json() 60 | if request.status_code == 201: 61 | print('Submitted to Coursera platform. See results on assignment page!') 62 | elif u'details' in response and u'learnerMessage' in response[u'details']: 63 | print(response[u'details'][u'learnerMessage']) 64 | else: 65 | print("Unknown response from Coursera: {}".format(request.status_code)) 66 | print(response) 67 | 68 | def status(self): 69 | print("You want to submit these numbers:") 70 | for part_id, part_name in self.parts.items(): 71 | answer = self.answers[part_id] 72 | if answer is None: 73 | answer = '-'*10 74 | print("Task {}: {}".format(part_name, answer)) 75 | 76 | def submit_part(self, part, output): 77 | self.answers[part] = output 78 | print("Current answer for task {} is: {}".format(self.parts[part], output)) 79 | 80 | def submit_tag(self, tag, output): 81 | part_id = [k for k, v in self.parts.items() if v == tag] 82 | if len(part_id)!=1: 83 | raise RuntimeError('cannot match tag with part_id: found {} matches'.format(len(part_id))) 84 | part_id = part_id[0] 85 | self.submit_part(part_id, 
str(self.ravel_output(output))) -------------------------------------------------------------------------------- /wiki/cousera/clone1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/clone1.png -------------------------------------------------------------------------------- /wiki/cousera/translate_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/translate_1.png -------------------------------------------------------------------------------- /wiki/cousera/week1_program_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/week1_program_plot.png -------------------------------------------------------------------------------- /wiki/cousera/week5_cf1.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/week5_cf1.PNG -------------------------------------------------------------------------------- /wiki/cousera/week5_cf2.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/week5_cf2.PNG -------------------------------------------------------------------------------- /wiki/cousera/week5_cf3.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/week5_cf3.PNG -------------------------------------------------------------------------------- 
/wiki/cousera/week5_cf4.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/week5_cf4.PNG -------------------------------------------------------------------------------- /wiki/cousera/week5_cf5.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/week5_cf5.PNG -------------------------------------------------------------------------------- /wiki/cousera/week5_cf6.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/week5_cf6.PNG -------------------------------------------------------------------------------- /wiki/cousera/week5_cf7.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/week5_cf7.PNG -------------------------------------------------------------------------------- /wiki/cousera/week5_mm1.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/week5_mm1.PNG -------------------------------------------------------------------------------- /wiki/cousera/week5_mm2.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/week5_mm2.PNG -------------------------------------------------------------------------------- /wiki/cousera/week5_sl1.PNG: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/takagishi/spade/753c9a653219af21a60678692c78c698025d8343/wiki/cousera/week5_sl1.PNG --------------------------------------------------------------------------------