├── .gitignore
├── README.md
├── advertising-commerce
├── assets
│ ├── deepfm_001.png
│ ├── deepfm_002.png
│ ├── deepfm_003.png
│ └── deepfm_004.png
└── deepfm.md
├── cv
├── ArcFace.md
├── CRAFT.md
├── DeiT.md
├── EfficientDet.md
├── EfficientNet.md
├── LAMBERT.md
├── MAE.md
├── MLP-Mixer.md
├── MnasNet.md
├── Swin.md
├── ViT.md
├── assets
│ ├── ArcFace_01.jpeg
│ ├── ArcFace_02.jpeg
│ ├── ArcFace_03.jpeg
│ ├── ArcFace_04.jpeg
│ ├── ArcFace_05.jpeg
│ ├── ArcFace_06.jpeg
│ ├── ArcFace_07.jpeg
│ ├── CRAFT_01.png
│ ├── CRAFT_02.png
│ ├── CRAFT_03.png
│ ├── CRAFT_04.png
│ ├── CRAFT_05.png
│ ├── CRAFT_06.png
│ ├── DeiT_01.png
│ ├── DeiT_02.png
│ ├── DeiT_03.png
│ ├── DeiT_04.png
│ ├── DeiT_05.png
│ ├── EfficientDet_01.png
│ ├── EfficientDet_02.png
│ ├── EfficientDet_03.png
│ ├── EfficientDet_04.png
│ ├── EfficientDet_05.png
│ ├── EfficientDet_06.png
│ ├── EfficientDet_07.png
│ ├── EfficientDet_08.png
│ ├── EfficientDet_09.png
│ ├── EfficientDet_10.png
│ ├── EfficientNet_01.png
│ ├── EfficientNet_02.png
│ ├── EfficientNet_03.png
│ ├── EfficientNet_04.png
│ ├── EfficientNet_05.png
│ ├── EfficientNet_06.png
│ ├── EfficientNet_07.png
│ ├── EfficientNet_08.png
│ ├── EfficientNet_09.png
│ ├── EfficientNet_10.png
│ ├── LAMBERT_01.jpeg
│ ├── LAMBERT_02.jpeg
│ ├── MAE_01.jpeg
│ ├── MAE_02.jpeg
│ ├── MAE_03.jpeg
│ ├── MAE_04.jpeg
│ ├── MAE_05.jpeg
│ ├── MAE_06.jpeg
│ ├── MAE_07.jpeg
│ ├── MLP-Mixer_01.png
│ ├── MLP-Mixer_02.png
│ ├── MLP-Mixer_03.png
│ ├── MnasNet_01.png
│ ├── MnasNet_02.png
│ ├── MnasNet_03.png
│ ├── MnasNet_04.png
│ ├── MnasNet_05.png
│ ├── MnasNet_06.png
│ ├── MnasNet_07.png
│ ├── MnasNet_08.png
│ ├── Swin_01.png
│ ├── Swin_02.png
│ ├── Swin_03.png
│ ├── Swin_04.png
│ ├── Swin_05.png
│ ├── Swin_06.png
│ ├── Swin_07.png
│ ├── Swin_08.png
│ ├── Swin_09.png
│ ├── Swin_10.png
│ ├── Swin_11.png
│ ├── Swin_12.png
│ ├── ViT_01.png
│ ├── ViT_02.png
│ ├── ViT_03.png
│ ├── ViT_04.png
│ ├── ViT_05.png
│ ├── ViT_06.png
│ ├── crnn_01.png
│ ├── fast-rcnn_01.png
│ ├── fast-rcnn_02.png
│ ├── fast-rcnn_03.png
│ ├── faster-rcnn_01.png
│ ├── faster-rcnn_02.png
│ ├── faster-rcnn_03.png
│ ├── faster-rcnn_04.png
│ ├── faster-rcnn_05.png
│ ├── faster-rcnn_06.png
│ ├── gMLP_01.png
│ ├── gMLP_02.png
│ ├── gMLP_03.png
│ ├── iou_giou_diou_ciou_yc.jpeg
│ ├── mask-rcnn_01.png
│ ├── mask-rcnn_02.png
│ ├── mask-rcnn_03.png
│ ├── mask-rcnn_04.png
│ ├── rcnn_01.png
│ ├── rcnn_02.png
│ ├── rcnn_03.png
│ ├── rcnn_04.png
│ ├── rcnn_05.png
│ ├── rcnn_06.png
│ ├── rcnn_07.png
│ ├── rcnn_08.png
│ ├── what-is-wrong-with-scene-text-recognition-model-comparisons_01.png
│ ├── what-is-wrong-with-scene-text-recognition-model-comparisons_02.png
│ ├── what-is-wrong-with-scene-text-recognition-model-comparisons_03.png
│ ├── what-is-wrong-with-scene-text-recognition-model-comparisons_04.png
│ ├── what-is-wrong-with-scene-text-recognition-model-comparisons_05.png
│ ├── yolo-v1_01.png
│ ├── yolo-v1_02.png
│ ├── yolo-v1_03.png
│ ├── yolo-v1_04.png
│ ├── yolo-v2_01.png
│ ├── yolo-v2_02.png
│ ├── yolo-v2_03.png
│ ├── yolo-v2_04.png
│ ├── yolo-v2_05.png
│ ├── yolo-v2_06.png
│ ├── yolo-v2_07.png
│ ├── yolo-v3_01.png
│ ├── yolo-v3_02.png
│ ├── yolo-v3_03.png
│ ├── yolo-v4_01.png
│ ├── yolo-v4_02.png
│ ├── yolo-v4_03.png
│ ├── yolo-v4_04.png
│ ├── yolo-v4_05.png
│ ├── yolo-v4_06.png
│ ├── yolo-v4_07.png
│ ├── yolo-v4_08.png
│ ├── yolo-v4_09.png
│ ├── yolo-v4_10.png
│ ├── yolo-v4_11.png
│ └── yolo-v4_12.png
├── crnn.md
├── fast-rcnn.md
├── faster-rcnn.md
├── gMLP.md
├── mask-rcnn.md
├── rcnn.md
├── what-is-wrong-with-scene-text-recognition-model-comparisons.md
├── yolo-v1.md
├── yolo-v2.md
├── yolo-v3.md
└── yolo-v4.md
├── generative
├── assets
│ ├── density-estimation-using-real-nvp_01.png
│ ├── density-estimation-using-real-nvp_02.png
│ ├── density-estimation-using-real-nvp_03.png
│ ├── density-estimation-using-real-nvp_04.png
│ ├── density-estimation-using-real-nvp_05.png
│ ├── density-estimation-using-real-nvp_06.png
│ ├── density-estimation-using-real-nvp_07.png
│ ├── density-estimation-using-real-nvp_08.png
│ ├── glow_02.png
│ ├── glow_03.png
│ ├── glow_04.png
│ ├── glow_05.png
│ ├── glow_06.png
│ ├── glow_07.png
│ └── glow_08.png
├── density-estimation-using-real-nvp.md
└── glow.md
├── gnn
├── GRAPE.md
├── SpellGCN.md
├── assets
│ ├── GRAPE_01.png
│ ├── GRAPE_02.png
│ ├── GRAPE_03.png
│ ├── GRAPE_04.png
│ ├── GRAPE_05.png
│ ├── GRAPE_06.png
│ ├── SpellGCN_01.png
│ ├── SpellGCN_02.png
│ ├── SpellGCN_03.png
│ ├── deep-walk_01.png
│ ├── deep-walk_02.png
│ ├── deep-walk_03.png
│ ├── deep-walk_04.png
│ ├── deep-walk_05.png
│ ├── deep-walk_06.png
│ ├── gcn_01.png
│ ├── gin_01.png
│ ├── gin_02.png
│ ├── gin_03.png
│ ├── gin_04.png
│ ├── gin_05.png
│ ├── gin_06.png
│ ├── graph-neural-networks-a-review-of-methods-and-applications_01.png
│ ├── graph-neural-networks-a-review-of-methods-and-applications_02.png
│ ├── graph-neural-networks-a-review-of-methods-and-applications_03.png
│ ├── graph-neural-networks-a-review-of-methods-and-applications_04.png
│ ├── graph-neural-networks-a-review-of-methods-and-applications_05.png
│ ├── graph-sage_01.png
│ ├── graph-sage_02.png
│ └── graph-sage_03.png
├── deep-walk.md
├── gcn.md
├── gin.md
├── graph-neural-networks-a-review-of-methods-and-applications.md
└── graph-sage.md
├── index.html
├── nlp
├── ALBERT.md
├── BiDAF.md
├── CNN-for-sentence-classification.md
├── GPT.md
├── GPT2.md
├── GPT3.md
├── GloVe.md
├── QANet.md
├── RoBERTa.md
├── T5.md
├── Transformer-XL.md
├── XLNet.md
├── a-fast-and-accurate-dependency-parser-using-nural-networks.md
├── assets
│ ├── ALBERT_01.png
│ ├── ALBERT_02.png
│ ├── ALBERT_03.png
│ ├── ALBERT_04.png
│ ├── ALBERT_05.png
│ ├── ALBERT_06.png
│ ├── BiDAF_01.png
│ ├── CNN-for-sentence-classification_01.png
│ ├── CNN-for-sentence-classification_02.png
│ ├── CNN-for-sentence-classification_03.png
│ ├── GPT2_01.png
│ ├── GPT2_02.png
│ ├── GPT2_03.png
│ ├── GPT3_01.png
│ ├── GPT3_02.png
│ ├── GPT3_03.png
│ ├── GPT3_04.png
│ ├── GPT_01.png
│ ├── GPT_02.png
│ ├── GPT_03.png
│ ├── GPT_04.png
│ ├── QANet_01.png
│ ├── QANet_02.png
│ ├── QANet_03.png
│ ├── QANet_04.png
│ ├── RoBERTa_01.png
│ ├── RoBERTa_02.png
│ ├── T5_01.png
│ ├── T5_02.png
│ ├── T5_03.png
│ ├── T5_04.png
│ ├── T5_05.png
│ ├── T5_06.png
│ ├── T5_07.png
│ ├── T5_08.png
│ ├── T5_09.png
│ ├── T5_10.png
│ ├── T5_11.png
│ ├── T5_12.png
│ ├── T5_13.png
│ ├── T5_14.png
│ ├── T5_15.png
│ ├── Transformer-XL_01.png
│ ├── Transformer-XL_02.png
│ ├── Transformer-XL_03.png
│ ├── Transformer-XL_04.png
│ ├── XLNet_01.png
│ ├── XLNet_02.png
│ ├── XLNet_03.png
│ ├── XLNet_04.png
│ ├── XLNet_05.png
│ ├── XLNet_06.png
│ ├── XLNet_07.png
│ ├── XLNet_08.png
│ ├── XLNet_09.png
│ ├── a-fast-and-accurate-dependency-parser-using-nural-networks_01.png
│ ├── a-fast-and-accurate-dependency-parser-using-nural-networks_02.png
│ ├── a-fast-and-accurate-dependency-parser-using-nural-networks_03.png
│ ├── attention-based-recurrent-neural-network-models-for-joint-intent-detection-and-slot-filling_01.png
│ ├── attention-based-recurrent-neural-network-models-for-joint-intent-detection-and-slot-filling_02.png
│ ├── attention-based-recurrent-neural-network-models-for-joint-intent-detection-and-slot-filling_03.png
│ ├── attention-based-recurrent-neural-network-models-for-joint-intent-detection-and-slot-filling_04.png
│ ├── attention-based-recurrent-neural-network-models-for-joint-intent-detection-and-slot-filling_05.png
│ ├── attention-based-recurrent-neural-network-models-for-joint-intent-detection-and-slot-filling_06.png
│ ├── attention-based-recurrent-neural-network-models-for-joint-intent-detection-and-slot-filling_07.png
│ ├── attention-is-all-you-need_01.png
│ ├── attention-is-all-you-need_02.png
│ ├── attention-is-all-you-need_03.png
│ ├── attention-is-all-you-need_04.png
│ ├── attention-is-all-you-need_05.png
│ ├── bert_01.png
│ ├── bert_02.png
│ ├── bert_03.png
│ ├── bert_04.png
│ ├── bert_05.png
│ ├── bert_06.png
│ ├── bert_07.png
│ ├── bert_08.png
│ ├── bert_09.png
│ ├── bert_13.png
│ ├── distributed-representations-of-words-and-phrases-and-their-compositionality_01.png
│ ├── distributed-representations-of-words-and-phrases-and-their-compositionality_02.png
│ ├── doc2vec_01.png
│ ├── doc2vec_02.png
│ ├── efficient-estimation-of-word-representations-in-vector-space_01.png
│ ├── efficient-estimation-of-word-representations-in-vector-space_02.png
│ ├── efficient-estimation-of-word-representations-in-vector-space_03.png
│ ├── efficient-estimation-of-word-representations-in-vector-space_04.png
│ ├── efficient-estimation-of-word-representations-in-vector-space_05.png
│ ├── efficient-estimation-of-word-representations-in-vector-space_06.png
│ ├── efficient-transformers-a-survey_01.png
│ ├── efficient-transformers-a-survey_02.png
│ ├── efficient-transformers-a-survey_03.png
│ ├── efficient-transformers-a-survey_04.png
│ ├── efficient-transformers-a-survey_05.png
│ ├── efficient-transformers-a-survey_06.png
│ ├── efficient-transformers-a-survey_07.png
│ ├── efficient-transformers-a-survey_08.png
│ ├── efficient-transformers-a-survey_09.png
│ ├── efficient-transformers-a-survey_10.png
│ ├── efficient-transformers-a-survey_11.png
│ ├── efficient-transformers-a-survey_12.png
│ ├── efficient-transformers-a-survey_13.png
│ ├── efficient-transformers-a-survey_14.png
│ ├── efficient-transformers-a-survey_15.png
│ ├── efficient-transformers-a-survey_16.png
│ ├── efficient-transformers-a-survey_17.png
│ ├── elmo_01.png
│ ├── elmo_02.png
│ ├── elmo_03.png
│ ├── elmo_04.png
│ ├── glove_01.png
│ ├── glove_02.png
│ ├── glove_03.png
│ ├── glove_04.png
│ ├── neural-machine-translation-by-jointly-learning-to-align-and-translate_01.png
│ ├── neural-machine-translation-by-jointly-learning-to-align-and-translate_02.png
│ ├── neural-machine-translation-by-jointly-learning-to-align-and-translate_03.png
│ ├── sequence-to-sequence-learning-with-neural-networks_01.png
│ └── sequence-to-sequence-learning-with-neural-networks_02.png
├── attention-based-recurrent-neural-network-models-for-joint-intent-detection-and-slot-filling.md
├── attention-is-all-you-need.md
├── bert.md
├── distributed-representations-of-words-and-phrases-and-their-compositionality.md
├── doc2vec.md
├── dr-qa.md
├── efficient-estimation-of-word-representations-in-vector-space.md
├── efficient-transformers-a-survey.md
├── elmo.md
├── neural-machine-translation-by-jointly-learning-to-align-and-translate.md
└── sequence-to-sequence-learning-with-neural-networks.md
├── optimization-training-techniques
├── AMSGrad.md
├── AdamW.md
├── RAdam.md
├── an-overview-of-gradient-descent-optimization-algorithms.md
├── assets
│ ├── AMSGrad_01.png
│ ├── AMSGrad_02.png
│ ├── AMSGrad_03.png
│ ├── AMSGrad_04.png
│ ├── AMSGrad_05.png
│ ├── AdamW_01.png
│ ├── AdamW_02.png
│ ├── AdamW_03.png
│ ├── AdamW_04.png
│ ├── AdamW_05.png
│ ├── AdamW_06.png
│ ├── RAdam_01.png
│ ├── RAdam_02.png
│ ├── RAdam_03.png
│ ├── RAdam_04.png
│ ├── RAdam_05.png
│ ├── RAdam_06.png
│ ├── RAdam_07.png
│ ├── RAdam_08.png
│ ├── RAdam_09.png
│ ├── RAdam_10.png
│ ├── RAdam_11.png
│ ├── RAdam_12.png
│ ├── RAdam_13.png
│ ├── an-overview-of-gradient-descent-optimization-algorithms_01.png
│ ├── an-overview-of-gradient-descent-optimization-algorithms_02.png
│ ├── an-overview-of-gradient-descent-optimization-algorithms_03.png
│ ├── an-overview-of-gradient-descent-optimization-algorithms_04.png
│ ├── lars_01.png
│ ├── lars_02.png
│ ├── lars_03.png
│ ├── lars_04.png
│ ├── lars_05.png
│ ├── lookahead_01.png
│ ├── lookahead_02.png
│ ├── lookahead_03.png
│ ├── lookahead_04.png
│ ├── lookahead_05.png
│ ├── lookahead_06.png
│ └── lookahead_07.png
├── lars.md
└── lookahead.md
├── speech
├── assets
│ ├── attention_mechanism.jpg
│ ├── ctc_01.png
│ ├── ctc_02.png
│ ├── ctc_03.png
│ ├── ctc_04.png
│ ├── ctc_05.png
│ ├── ctc_06.png
│ ├── ctc_07.jpeg
│ ├── ctc_08.jpeg
│ ├── ctc_09.png
│ ├── ctc_10.png
│ ├── ctc_11.png
│ ├── ctc_12.png
│ ├── ctc_13.png
│ ├── ctc_14.png
│ ├── ctc_15.png
│ ├── ctc_16.png
│ ├── tacotron2_01.png
│ ├── tacotron2_02.png
│ ├── tacotron2_03.png
│ ├── tacotron2_04.png
│ ├── tacotron2_decoder.png
│ ├── tacotron_01.png
│ ├── tacotron_02.png
│ ├── tacotron_03.png
│ ├── tacotron_04.png
│ ├── waveglow_01.png
│ ├── waveglow_02.png
│ ├── waveglow_03.png
│ ├── wavenet_01.png
│ ├── wavenet_02.png
│ ├── wavenet_03.png
│ ├── wavenet_04.png
│ └── wavenet_05.png
├── ctc.md
├── tacotron.md
├── tacotron2.md
├── waveglow.md
└── wavenet.md
├── summary.md
├── time-series
├── NeuralODE.md
└── assets
│ ├── NeuralODE_01.png
│ ├── NeuralODE_02.png
│ ├── NeuralODE_03.png
│ ├── NeuralODE_04.png
│ ├── NeuralODE_05.png
│ └── NeuralODE_06.png
└── understanding-generalization-transfer
├── assets
├── deep-neural-networks-are-easily-fooled_001.png
├── deep-neural-networks-are-easily-fooled_002.png
├── deep-neural-networks-are-easily-fooled_003.png
├── deep-neural-networks-are-easily-fooled_004.png
├── deep-neural-networks-are-easily-fooled_005.png
├── deep-neural-networks-are-easily-fooled_006.png
├── distilling-the-knowledge-in-a-neural-network_001.png
├── how-transferable-are-features-in-deep-neural-networks_001.png
├── how-transferable-are-features-in-deep-neural-networks_002.png
├── how-transferable-are-features-in-deep-neural-networks_003.png
├── image.png
├── rethinking_pre-training_and_self-training_01.png
├── rethinking_pre-training_and_self-training_02.png
├── rethinking_pre-training_and_self-training_03.png
├── rethinking_pre-training_and_self-training_04.png
├── rethinking_pre-training_and_self-training_05.png
└── rethinking_pre-training_and_self-training_06.png
├── deep-neural-networks-are-easily-fooled-high-confidence-predictions-for-unrecognizable-images.md
├── distilling-the-knowledge-in-a-neural-network.md
├── how-transferable-are-features-in-deep-neural-networks.md
└── rethinking_pre-training_and_self-training.md
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | .hypothesis/
51 | .pytest_cache/
52 |
53 | # Translations
54 | *.mo
55 | *.pot
56 |
57 | # Django stuff:
58 | *.log
59 | local_settings.py
60 | db.sqlite3
61 |
62 | # Flask stuff:
63 | instance/
64 | .webassets-cache
65 |
66 | # Scrapy stuff:
67 | .scrapy
68 |
69 | # Sphinx documentation
70 | docs/_build/
71 |
72 | # PyBuilder
73 | target/
74 |
75 | # Jupyter Notebook
76 | .ipynb_checkpoints
77 |
78 | # IPython
79 | profile_default/
80 | ipython_config.py
81 |
82 | # pyenv
83 | .python-version
84 |
85 | # pipenv
86 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
87 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
88 | # having no cross-platform support, pipenv may install dependencies that don’t work, or not
89 | # install all needed dependencies.
90 | #Pipfile.lock
91 |
92 | # celery beat schedule file
93 | celerybeat-schedule
94 |
95 | # SageMath parsed files
96 | *.sage.py
97 |
98 | # Environments
99 | .env
100 | .venv
101 | env/
102 | venv/
103 | ENV/
104 | env.bak/
105 | venv.bak/
106 |
107 | # Spyder project settings
108 | .spyderproject
109 | .spyproject
110 |
111 | # Rope project settings
112 | .ropeproject
113 |
114 | # mkdocs documentation
115 | /site
116 |
117 | # mypy
118 | .mypy_cache/
119 | .dmypy.json
120 | dmypy.json
121 |
122 | # Pyre type checker
123 | .pyre/
124 |
125 | # mac
126 | .DS_Store
127 |
128 | # pelican files
129 | output
130 |
131 | raw_markdown/content/
132 |
--------------------------------------------------------------------------------
/advertising-commerce/assets/deepfm_001.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/advertising-commerce/assets/deepfm_001.png
--------------------------------------------------------------------------------
/advertising-commerce/assets/deepfm_002.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/advertising-commerce/assets/deepfm_002.png
--------------------------------------------------------------------------------
/advertising-commerce/assets/deepfm_003.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/advertising-commerce/assets/deepfm_003.png
--------------------------------------------------------------------------------
/advertising-commerce/assets/deepfm_004.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/advertising-commerce/assets/deepfm_004.png
--------------------------------------------------------------------------------
/advertising-commerce/deepfm.md:
--------------------------------------------------------------------------------
1 | # DeepFM: A Factorization-Machine based Neural Network for CTR Prediction (2017), H. Guo et al.
2 |
3 | ###### contributors: [@GitYCC](https://github.com/GitYCC)
4 |
5 | \[[paper](https://arxiv.org/pdf/1703.04247)\] \[[code](https://github.com/ChenglongChen/tensorflow-DeepFM)\]
6 |
7 | ---
8 |
9 | **TL;DR**
10 |
11 | - design advantages:
12 | - it does not need any pre-training
13 | - it learns both high- and low-order feature interactions
14 | - it introduces a sharing strategy of feature embedding to avoid feature engineering
15 | - performance:
16 | - DeepFM outperforms the state-of-the-art models in terms of AUC and Logloss on both datasets (Criteo Dataset and Company Dataset)
17 | - The efficiency of DeepFM is comparable to the most efficient deep model in the state-of-the-art
18 |
19 | **Introduction**
20 |
21 | - prediction target: click-through rate (CTR)
22 | - feature interactions
23 | - users often download food-delivery apps at meal-time, suggesting that the (order-2) interaction between app category and time-stamp is a useful signal
24 | - male teenagers like shooting games and RPG games, which means that the (order-3) interaction of app category, user gender, and age also matters
25 |
26 | **Approach**
27 |
28 | 
29 |
30 | (Weight-1 Connection: point-wise operation to keep embedding vector format and information)
31 |
32 | - denote
33 | - size of the features: $M$
34 | - size of the feature fields: $F$
35 | - size of the feature embedding: $K$
36 | - FM Component: first order
37 | - obtain $\{w_m x_m\}$ terms
38 | - FM Component: second order
39 | - obtain $\{x_ix_j(V_i\odot V_j)\}$ ($i\neq j$ ; $i,j=1,...,M$) terms
40 | - Deep Component
41 | - $a^{(l+1)}=\sigma(W^{(l)}a^{(l)}+b^{(l)})$
42 | - DeepFM: FM Component: first order + second order & Deep Component
43 | - output: $sigmoid(concat[\{w_m x_m\}, \{x_ix_j(V_i\odot V_j)\},a^{(L)}])$
44 |
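As a rough illustration of how the terms above are combined, here is a minimal NumPy sketch (not the paper's implementation): the dense input `x`, the caller-supplied `mlp_forward`, and the plain sum over the concatenated terms are simplifying assumptions; real implementations apply a learned output layer instead.

```python
import numpy as np

def deepfm_predict(x, w, V, mlp_forward):
    """Toy DeepFM forward pass.
    x: (M,) feature values, w: (M,) first-order weights,
    V: (M, K) shared feature embeddings, mlp_forward: callable returning a^(L)."""
    first_order = w * x                                        # {w_m x_m}
    xv = x[:, None] * V                                        # embeddings scaled by feature values
    second_order = 0.5 * (xv.sum(0) ** 2 - (xv ** 2).sum(0))   # FM identity: per-dim sum over i<j of x_i x_j (V_i * V_j)
    deep = mlp_forward(xv.reshape(-1))                         # deep component a^(L) on the same embeddings
    z = np.concatenate([first_order, second_order, deep])
    return 1.0 / (1.0 + np.exp(-z.sum()))                      # sigmoid over the combined terms
```

The key point is the shared embedding matrix $V$: both the FM second-order terms and the deep component read from it, which is the embedding-sharing strategy mentioned above.
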
45 | - benefits:
46 | - it learns both low- and high-order feature interactions from raw features
47 | - there is no need for expertise feature engineering of the input, as required in Wide & Deep
48 |
49 | **Relationship with the other Neural Networks**
50 |
51 | 
52 |
53 | 
54 |
55 | **Experiments**
56 |
57 | - datasets:
58 | - Criteo Dataset
59 | - Company Dataset
60 | - Model Comparison: We compare 9 models in our experiments: LR, FM, FNN, PNN (three variants), Wide & Deep, and DeepFM.
61 |
62 | 
--------------------------------------------------------------------------------
/cv/ArcFace.md:
--------------------------------------------------------------------------------
1 | # ArcFace: Additive Angular Margin Loss for Deep Face Recognition (2019), Jiankang Deng et al.
2 |
3 | ###### contributors: [@GitYCC](https://github.com/GitYCC)
4 |
5 | \[[paper](https://arxiv.org/pdf/1801.07698.pdf)\] \[[code](https://github.com/deepinsight/insightface)\]
6 |
7 | ---
8 |
9 |
10 |
11 | - 
12 |
13 | - Method:
14 | - ArcFace Loss:
15 | - 
16 | - 
17 |
18 | - Margin Boundary: Norm-Softmax, SphereFace, CosFace and ArcFace
19 | - loss:
20 | - 
21 | - Norm-Softmax: $m_1=1$, $m_2=0$, $m_3=0$
22 | - SphereFace: $m_1>1$, $m_2=0$, $m_3=0$
23 | - CosFace: $m_1=1$, $m_2=0$, $m_3>0$
24 | - ArcFace: $m_1=1$, $m_2>0$, $m_3=0$
25 |
26 | - equations of margin boundary
27 | - given two-class problem: $Class1$ and $Class2$
28 | - according to the loss function (4) above, the boundary is where the two scores from $Class1$ and $Class2$ are equal.
29 | $$
30 | \text{For } Class1,\ s\cdot(cos(m_1\theta_1+m_2)-m_3)=s\cdot cos(\theta_2)
31 | $$
32 | - Norm-Softmax: $cos\theta_1-cos\theta_2=0\Rightarrow \theta_1-\theta_2=0$
33 | - SphereFace: $cos(m_1\theta_1)-cos\theta_2=0\Rightarrow m_1\theta_1-\theta_2=0$
34 | - CosFace: $cos\theta_1-m_3-cos\theta_2=0\Rightarrow cos\theta_1-cos\theta_2-m_3=0$
35 | - ArcFace: $cos(\theta_1+m_2)-cos\theta_2=0\Rightarrow \theta_1-\theta_2+m_2=0$
36 |
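A minimal NumPy sketch of the unified margin logit $s\cdot(cos(m_1\theta+m_2)-m_3)$ above, applied only to the target class; the defaults $s=64$, $m_2=0.5$ are common ArcFace settings and are assumptions here rather than values stated in this note.

```python
import numpy as np

def combined_margin_logits(embeddings, weights, labels, s=64.0, m1=1.0, m2=0.5, m3=0.0):
    """Unified margin logits: s * (cos(m1*theta + m2) - m3) on the true class only.
    embeddings: (N, d), weights: (d, num_classes), labels: (N,) class indices."""
    x = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)  # L2-normalize features
    W = weights / np.linalg.norm(weights, axis=0, keepdims=True)        # L2-normalize class weights
    cos_theta = x @ W                                                   # (N, num_classes)
    theta = np.arccos(np.clip(cos_theta, -1.0, 1.0))
    logits = s * cos_theta
    n = np.arange(len(labels))
    logits[n, labels] = s * (np.cos(m1 * theta[n, labels] + m2) - m3)   # margin on the target class
    return logits  # feed into a standard softmax cross-entropy
```

Setting $(m_1, m_2, m_3)$ to the values listed above recovers Norm-Softmax, SphereFace, CosFace, or ArcFace.
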
37 | - figures of margin boundary
38 |
39 | - 
40 | - ArcFace enhances intra-class compactness and inter-class discrepancy.
41 | - Despite the numerical similarity between ArcFace and previous works, ArcFace has a better geometric attribute as the angular margin has the exact correspondence to the geodesic distance.
42 |
43 | - Results
44 |
45 | - 
46 | - 
47 |
--------------------------------------------------------------------------------
/cv/DeiT.md:
--------------------------------------------------------------------------------
1 | # Training data-efficient image transformers & distillation through attention (2020), Hugo Touvron et al.
2 |
3 | ###### contributors: [@GitYCC](https://github.com/GitYCC)
4 |
5 | \[[paper](https://arxiv.org/abs/2012.12877)\] \[[code](https://github.com/facebookresearch/deit)\]
6 |
7 | ---
8 |
9 | ### Introduction
10 |
11 | - Data-efficient image Transformers (DeiT)
12 | - we introduce a teacher-student strategy specific to transformers
13 | - In this paper, we train a vision transformer on a single 8-GPU node in two to three days (53 hours of pre-training, and optionally 20 hours of fine-tuning).
14 | - The paper of ViT concluded that transformers “do not generalize well when trained on insufficient amounts of data”, and the training of these models involved extensive computing resources.
15 |
16 |
17 |
18 | ### Distillation through attention
19 |
20 | - This section covers two axes of distillation: hard distillation versus soft distillation, and classical distillation versus the distillation token
21 | - Soft distillation
22 | - minimizes the Kullback-Leibler divergence between the softmax of the teacher and the softmax of the student model
23 | - 
24 | - $Z_t$ : the logits of the teacher model
25 | - $Z_s$ : the logits of the student model
26 | - $\tau$ : the temperature for the distillation
27 | - $\lambda$ : the coefficient balancing the Kullback-Leibler divergence loss and the cross-entropy
28 | - $y$ : ground-truth labels
29 | - $ψ$ : the softmax function
30 | - Hard-label distillation
31 | - We introduce a variant of distillation where we take the hard decision of the teacher as a true label.
32 | - 
33 | - Distillation token
34 | - 
35 |
36 |
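A small NumPy sketch of the two objectives above, assuming batched logits; the $\tau$ and $\lambda$ defaults are illustrative, not taken from this note.

```python
import numpy as np

def softmax(z, axis=-1):
    z = z - z.max(axis=axis, keepdims=True)
    e = np.exp(z)
    return e / e.sum(axis=axis, keepdims=True)

def cross_entropy(probs, labels):
    return -np.log(probs[np.arange(len(labels)), labels] + 1e-12).mean()

def soft_distillation_loss(z_s, z_t, y, tau=3.0, lam=0.1):
    """(1 - lam) * CE(student, y) + lam * tau^2 * KL between softened teacher and student."""
    p_s, p_t = softmax(z_s / tau), softmax(z_t / tau)
    kl = (p_t * (np.log(p_t + 1e-12) - np.log(p_s + 1e-12))).sum(-1).mean()
    return (1 - lam) * cross_entropy(softmax(z_s), y) + lam * tau ** 2 * kl

def hard_distillation_loss(z_s, z_t, y):
    """0.5 * CE(student, y) + 0.5 * CE(student, argmax of the teacher logits)."""
    y_t = z_t.argmax(-1)
    p_s = softmax(z_s)
    return 0.5 * cross_entropy(p_s, y) + 0.5 * cross_entropy(p_s, y_t)
```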
37 |
38 | ### Experiment
39 |
40 | - Distillation
41 | - default teacher is a RegNetY-16GF (84M parameters)
42 | - Hard distillation significantly outperforms soft distillation for transformers
43 | - 
44 | - Interestingly, the distilled model outperforms its teacher in terms of the trade-off between accuracy and throughput.
45 | - 
--------------------------------------------------------------------------------
/cv/EfficientDet.md:
--------------------------------------------------------------------------------
1 | # EfficientDet: Scalable and Efficient Object Detection (2020), Mingxing Tan et al.
2 |
3 | ###### contributors: [@GitYCC](https://github.com/GitYCC)
4 |
5 | \[[paper](https://arxiv.org/abs/1911.09070)\] \[[code](https://github.com/google/automl/tree/master/efficientdet)\]
6 |
7 | ---
8 |
9 | - related work
10 | - AmoebaNet-based NASFPN
11 | - but large model sizes and expensive computation costs
12 | - more efficient detector architectures: one-stage, anchor-free
13 | - but sacrifice accuracy
14 | - contributions
15 | - we propose a weighted bi-directional feature pyramid network (BiFPN), which allows easy and fast multiscale feature fusion
16 | - we propose a compound scaling method that uniformly scales the resolution, depth, and width for all backbone, feature network, and box/class prediction networks at the same time
17 | - challenges
18 | - Challenge 1: efficient multi-scale feature fusion
19 | - related: FPN, PANet, NAS-FPN (employs neural architecture search to search for better cross-scale feature network topology)
20 | - bi-directional feature pyramid network (BiFPN)
21 | - which introduces learnable weights to learn the importance of different input features, while repeatedly applying top-down and bottom-up multi-scale feature fusion
22 | - Challenge 2: model scaling
23 | - BiFPN
24 | - 
25 | - $P_i^{in}$ represents a feature level with resolution of $1/2^i$ of the input images
26 | - Cross-Scale Connections
27 | - First, we remove nodes that have only one input edge. The intuition is simple: if a node has only one input edge with no feature fusion, it contributes less to a feature network that aims at fusing different features
28 | - Second, we add an extra edge from the original input to the output node if they are at the same level, in order to fuse more features without adding much cost
29 | - Third, bidirectional (top-down & bottom-up) path
30 | - Weighted Feature Fusion
31 | - All previous methods treat all input features equally without distinction
32 | - Fast normalized fusion: $O=\sum_i\frac{w_i}{ε+\sum_jw_j}I_i$ (see the sketch at the end of this summary)
33 | - EfficientDet
34 | - 
35 | - ImageNet-pretrained EfficientNets as the backbone network
36 | - same width/depth scaling coefficients of EfficientNet-B0 to B6
37 | - Compound Scaling
38 | - jointly scaling up all dimensions of network width, depth, and input resolution
39 | - 
40 | - $W_{bifpn}$: # of channels in BiFPN layers
41 | - we perform a grid search on a list of values {1.2, 1.25, 1.3, 1.35, 1.4, 1.45}, and pick the best value 1.35 as the BiFPN width scaling factor
42 | - $D_{bifpn}$: BiFPN depth (# of layers)
43 | - 
44 | - 
45 | - 
46 | - 
47 | - Experiments
48 | - 
49 | - 
50 | - 
51 |
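As referenced in the BiFPN section above, here is a minimal NumPy sketch of fast normalized fusion; keeping the weights non-negative with a ReLU follows the paper, while the feature shapes and weight values are made up for illustration.

```python
import numpy as np

def fast_normalized_fusion(inputs, weights, eps=1e-4):
    """O = sum_i [ w_i / (eps + sum_j w_j) ] * I_i over same-shaped feature maps."""
    w = np.maximum(np.asarray(weights, dtype=np.float32), 0.0)  # ReLU keeps each w_i >= 0
    norm = w / (eps + w.sum())
    return sum(n * f for n, f in zip(norm, inputs))

# e.g. fusing a top-down path feature with the same-level input feature
p_in = np.random.rand(64, 64, 160).astype(np.float32)
p_td = np.random.rand(64, 64, 160).astype(np.float32)
fused = fast_normalized_fusion([p_in, p_td], weights=[1.0, 0.8])
```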
52 |
--------------------------------------------------------------------------------
/cv/EfficientNet.md:
--------------------------------------------------------------------------------
1 | # EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks (2020), Mingxing Tan et al.
2 |
3 | ###### contributors: [@GitYCC](https://github.com/GitYCC)
4 |
5 | \[[paper](https://arxiv.org/abs/1905.11946)\] \[[code](https://github.com/qubvel/efficientnet)\]
6 |
7 | ---
8 |
9 | - Our empirical study shows that it is critical to balance all dimensions of network width/depth/resolution, and surprisingly such balance can be achieved by simply scaling each of them with constant ratio.
10 | - Compound Model Scaling
11 | - Problem Formulation
12 | - A ConvNet Layer $i$ can be defined as a function: $Y_i = F_i(X_i)$, where $F_i$ is the operator, $Y_i$ is output tensor, $X_i$ is input tensor, with tensor shape $\langle H_i, W_i, C_i\rangle$, where $H_i$ and $W_i$ are spatial dimension and $C_i$ is the channel dimension.
13 | - Define a ConvNet as:
14 | 
15 | - where $F^{L_i}_i$ denotes layer $F_i$ is repeated $L_i$ times in stage $i$, $\langle H_i, W_i, C_i\rangle$ denotes the shape of input tensor $X$ of layer $i$
16 | - Unlike regular ConvNet designs that mostly focus on finding the best layer architecture $F_i$, model scaling tries to expand the network length ($L_i$), width ($C_i$), and/or resolution ($H_i$,$W_i$) without changing $F_i$ predefined in the baseline network.
17 | - In order to further reduce the design space, we restrict that all layers must be scaled uniformly with constant ratio.
18 | - So, the optimization problem:
19 | 
20 | - Scaling Dimensions
21 | - 
22 | - Observation 1 – Scaling up any dimension of network width, depth, or resolution improves accuracy, but the accuracy gain diminishes for bigger models.
23 | - Compound Scaling
24 | - 
25 | - Observation 2 – In order to pursue better accuracy and efficiency, it is critical to balance all dimensions of network width, depth, and resolution during ConvNet scaling.
26 | - 
27 | - In this paper, we propose a new **compound scaling method**, which use a compound coefficient $\phi$ to uniformly scales network width, depth, and resolution in a principled way:
28 | 
29 | - where $\alpha$, $\beta$, $\gamma$ are constants that can be determined by a small grid search
30 | - Notably, the FLOPS of a regular convolution op is proportional to $d$, $w^2$, $r^2$, i.e., doubling network depth will double FLOPS, but doubling network width or resolution will increase FLOPS by four times.
31 | - In this paper, we constrain $\alpha\cdot\beta^2\cdot\gamma^2=2$ such that for any new $\phi$, the total FLOPS will approximately increase by $2^\phi$
32 | - EfficientNet Architecture
33 | - Inspired by (Tan et al., 2019, [MnasNet](cv/MnasNet.md)), we develop our baseline network by leveraging a multi-objective neural architecture search that optimizes both accuracy and FLOPS.
34 | - Our search produces an efficient network, which we name EfficientNet-B0.
35 | - 
36 | - Starting from the baseline EfficientNet-B0, we apply our compound scaling method to scale it up with two steps:
37 | - STEP 1: we first fix $\phi=1$, assuming twice more resources available, and do a small grid search of $\alpha$, $\beta$, $\gamma$ based on Equation 2 and 3. In particular, we find the best values for EfficientNet-B0 are $\alpha=1.2$, $\beta=1.1$, $\gamma=1.15$, under constraint of $\alpha\cdot\beta^2\cdot\gamma^2=2$.
38 | - STEP 2: we then fix $\alpha$, $\beta$, $\gamma$ as constants and scale up baseline network with different $\phi$ using Equation 3, to obtain EfficientNet-B1 to B7 (Details in Table 2).
39 | - 
40 |
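A small Python sketch of the compound scaling rule from the two steps above; the base resolution of 224 and the loop over $\phi$ are illustrative only.

```python
def compound_scaling(phi, alpha=1.2, beta=1.1, gamma=1.15, base_resolution=224):
    """depth = alpha^phi, width = beta^phi, resolution = gamma^phi,
    with alpha * beta^2 * gamma^2 close to 2 so FLOPS grow roughly as 2^phi."""
    assert abs(alpha * beta ** 2 * gamma ** 2 - 2.0) < 0.1
    depth_mult = alpha ** phi
    width_mult = beta ** phi
    resolution = int(round(base_resolution * gamma ** phi))
    return depth_mult, width_mult, resolution

for phi in (1, 2, 3):
    d, w, r = compound_scaling(phi)
    print(f"phi={phi}: depth x{d:.2f}, width x{w:.2f}, resolution ~{r}")
```
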
41 | - Experiments
42 | - 
43 | - 
44 |
45 |
--------------------------------------------------------------------------------
/cv/MLP-Mixer.md:
--------------------------------------------------------------------------------
1 | # MLP-Mixer: An all-MLP Architecture for Vision (2021), Ilya Tolstikhin et al.
2 |
3 | ###### contributors: [@GitYCC](https://github.com/GitYCC)
4 |
5 | \[[paper](https://arxiv.org/abs/2105.01601)\] \[[code](https://github.com/google-research/vision_transformer)\]
6 |
7 | ---
8 |
9 | - In this paper we show that while convolutions and attention are both sufficient for good performance, neither of them are necessary. We present MLP-Mixer, an architecture based exclusively on multi-layer perceptrons (MLPs).
10 | - 
11 | - MLP-Mixer contains two types of layers:
12 | - one with MLPs applied across patches (i.e. “mixing” spatial information)
13 | - one with MLPs applied independently to image patches (i.e. “mixing” the per-location features)
14 | - Unlike ViTs, Mixer does not use position embeddings because the token-mixing MLPs are sensitive to the order of the input tokens.
15 | - Finally, Mixer uses a standard classification head with the global average pooling layer followed by a linear classifier.
16 | - In the extreme case, our architecture can be seen as a very special CNN, which uses 1×1 convolutions for channel mixing, and single-channel depth-wise convolutions of a full receptive field and parameter sharing for token mixing (a minimal sketch of a Mixer block appears at the end of this note).
17 | - Experiment Setup
18 | - dataset for pre-training
19 | - ILSVRC2012 ImageNet
20 | - ImageNet-21k
21 | - JFT-300M
22 | - 
23 | - When trained on large datasets, or with modern regularization schemes, MLP-Mixer attains competitive scores on image classification benchmarks, with pre-training and inference cost comparable to state-of-the-art models.
24 | - 
25 |
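As mentioned above, a minimal NumPy sketch of one Mixer block (token-mixing across patches, then channel-mixing per patch); the layer sizes, initialization, and the tanh GELU approximation are assumptions for illustration.

```python
import numpy as np

def gelu(x):
    return 0.5 * x * (1.0 + np.tanh(np.sqrt(2.0 / np.pi) * (x + 0.044715 * x ** 3)))

def layer_norm(x, eps=1e-6):
    return (x - x.mean(-1, keepdims=True)) / np.sqrt(x.var(-1, keepdims=True) + eps)

def mlp(x, w1, b1, w2, b2):
    return gelu(x @ w1 + b1) @ w2 + b2

def mixer_block(x, token_params, channel_params):
    """x: (patches, channels). Token-mixing acts across patches (on x.T);
    channel-mixing acts independently on each patch."""
    x = x + mlp(layer_norm(x).T, *token_params).T   # "mixing" spatial information
    x = x + mlp(layer_norm(x), *channel_params)     # "mixing" per-location features
    return x

# toy sizes: 196 patches, 512 channels, hidden widths 256 (token) and 2048 (channel)
rng = np.random.default_rng(0)
S, C, Ds, Dc = 196, 512, 256, 2048
token_params = (0.02 * rng.normal(size=(S, Ds)), np.zeros(Ds), 0.02 * rng.normal(size=(Ds, S)), np.zeros(S))
channel_params = (0.02 * rng.normal(size=(C, Dc)), np.zeros(Dc), 0.02 * rng.normal(size=(Dc, C)), np.zeros(C))
out = mixer_block(rng.normal(size=(S, C)), token_params, channel_params)  # -> (196, 512)
```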
26 |
--------------------------------------------------------------------------------
/cv/MnasNet.md:
--------------------------------------------------------------------------------
1 | # MnasNet: Platform-Aware Neural Architecture Search for Mobile (2019), Mingxing Tan et al.
2 |
3 | ###### contributors: [@GitYCC](https://github.com/GitYCC)
4 |
5 | \[[paper](https://arxiv.org/abs/1807.11626)\] \[[code](https://github.com/mingxingtan/mnasnet)\]
6 |
7 | ---
8 |
9 | - Introduction
10 | - In this paper, we propose an automated mobile neural architecture search (MNAS) approach, which explicitly incorporate model latency into the main objective so that the search can identify a model that achieves a good trade-off between accuracy and latency.
11 | - Unlike previous work, where latency is considered via another, often inaccurate proxy (e.g., FLOPS), our approach directly measures real-world inference latency by executing the model on mobile phones.
12 | - To further strike the right balance between flexibility and search space size, we propose a novel factorized hierarchical search space that encourages layer diversity throughout the network.
13 | - We introduce a multi-objective neural architecture search approach that optimizes both accuracy and real-world latency on mobile devices.
14 | - A common method: $argmax_m ACC(m)\ s.t.\ LAT(m)\leq T$
15 | - However, above approach only maximizes a single metric and does not provide multiple Pareto optimal solutions. Informally, a model is called Pareto optimal if either it has the highest accuracy without increasing latency or it has the lowest latency without decreasing accuracy.
16 | - Our method: $argmax_m ACC(m)*[LAT(m)/T]^w$
17 | - $w=\alpha\ \text{if}\ LAT(m)\leq T\ \text{else}\ \beta$ (see the sketch at the end of this note)
18 | - 
19 | - Assumption of soft constraint: Given two models: (1) M1 has latency $l$ and accuracy $a$; (2) M2 has latency $2l$ and $5\%$ higher accuracy $a · (1 + 5\%)$, they should have similar reward: $Reward(M2) = a · (1 + 5\%)·(2l/T)^\beta\sim Reward(M1)=a·(l/T)^\beta$. Solving this gives $\beta\sim -0.07$. Therefore, we use $\alpha=\beta=-0.07$ in our experiments unless explicitly stated.
20 | - We propose a novel factorized hierarchical search space to enable layer diversity yet still strike the right balance between flexibility and search space size, called "Factorized Hierarchical Search Space".
21 | - A sub search space for a block $i$ consists of the following choices:
22 | - Convolutional ops *ConvOp*:
23 | - regular conv (Conv)
24 | - depthwise conv (DWConv)
25 | - 
26 | - mobile inverted bottleneck conv (MBConv)
27 | - 
28 | - Convolutional kernel size *KernelSize*: 3x3, 5x5
29 | - Squeeze-and-excitation ratio *SERatio*: 0, 0.25
30 | - 
31 | - Skip ops *SkipOp*: pooling, identity residual, or no skip
32 | - Output filter size $F_i$
33 | - Number of layers per block $N_i$
34 | - 
35 | - Search Algorithm:
36 | - The search framework consists of three components: a recurrent neural network (RNN) based controller, a trainer to obtain the model accuracy, and a mobile phone based inference engine for measuring the latency.
37 | - the controller is trained with Proximal Policy Optimization (PPO)
38 | - We demonstrate new state-of-the-art accuracy on both ImageNet classification and COCO object detection under typical mobile latency constraints.
39 | - 
40 | - 
41 | - 
42 |
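As referenced above, a tiny Python sketch of the multi-objective reward; the accuracy numbers and the 75 ms latency target are made-up values used only to reproduce the soft-constraint reasoning in this note.

```python
def mnas_reward(acc, latency_ms, target_ms=75.0, alpha=-0.07, beta=-0.07):
    """ACC(m) * [LAT(m) / T]^w with w = alpha if LAT(m) <= T else beta (soft constraint)."""
    w = alpha if latency_ms <= target_ms else beta
    return acc * (latency_ms / target_ms) ** w

# M1 vs. M2 from the assumption above: doubling latency for 5% higher accuracy
# gives a nearly identical reward when beta is about -0.07.
print(mnas_reward(0.75, 75.0))          # M1: 0.750
print(mnas_reward(0.75 * 1.05, 150.0))  # M2: ~0.750
```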
43 |
--------------------------------------------------------------------------------
/cv/Swin.md:
--------------------------------------------------------------------------------
1 | # Swin Transformer: Hierarchical Vision Transformer using Shifted Windows (2021), Ze Liu et al.
2 |
3 | ###### contributors: [@GitYCC](https://github.com/GitYCC)
4 |
5 | \[[paper](https://arxiv.org/abs/2103.14030)\] \[[code](https://github.com/microsoft/Swin-Transformer)\]
6 |
7 | ---
8 |
9 | - Problem: large variations in the scale of visual entities and the high resolution of pixels in images
10 | - Need 1: flexibility to model at various scales
11 | - Need 2: linear computational complexity with respect to image size
12 |
13 | - Empirically, we find our Swin Transformer architecture to achieve the best speed-accuracy trade-off among these methods on image classification, even though our work focuses on general-purpose performance, e.g. object detection and segmentation, rather than specifically on classification.
14 |
15 |
16 |
17 | ### Model
18 |
19 | 
20 |
21 | - Shifted Window based Self-Attention
22 | - Self-attention in non-overlapped windows
23 | - For efficient modeling, we propose to compute self-attention within local windows. The windows are arranged to evenly partition the image in a non-overlapping manner.
24 | - Supposing each window contains $M × M$ patches, the computational complexity of a global MSA (Multi-head Self-Attention) module and a window based one on an image of $h × w$ patches are:
25 | 
26 | 
27 | - Shifted window partitioning in successive blocks
28 | - The window-based self-attention module lacks connections across windows, which limits its modeling power. To introduce cross-window connections while maintaining the efficient computation of non-overlapping windows, we propose a shifted window partitioning approach which alternates between two partitioning configurations in consecutive Swin Transformer blocks.
29 | - Shown in Figure 3(b)
30 | - 
31 | - Efficient batch computation for shifted configuration
32 | - we propose a more efficient batch computation approach by cyclic-shifting toward the top-left direction, as illustrated in Figure 4
33 | - After this shift, a batched window may be composed of several sub-windows that are not adjacent in the feature map, so a masking mechanism is employed to limit self-attention computation to within each sub-window (see the sketch below).
34 | - 
35 | - Relative position bias
36 | - 
37 | - Since the relative position along each axis lies in the range $[−M + 1, M − 1]$, we parameterize a smaller-sized bias matrix $\hat{B} ∈ R^{(2M−1)×(2M−1)}$, and values in $B$ are taken from $\hat{B}$.
38 | - 
39 | - Architecture Variants
40 | - 
41 |
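A minimal NumPy sketch of the window partition and the cyclic shift mentioned above (the attention and masking steps are omitted); the 8×8 map and window size 4 are toy values.

```python
import numpy as np

def window_partition(x, M):
    """Split an (H, W, C) feature map into non-overlapping M x M windows -> (num_windows, M, M, C)."""
    H, W, C = x.shape
    x = x.reshape(H // M, M, W // M, M, C)
    return x.transpose(0, 2, 1, 3, 4).reshape(-1, M, M, C)

def cyclic_shift(x, M):
    """Roll the map by (-M//2, -M//2) so the shifted-window partition can reuse
    the regular partition; attention inside mixed windows must then be masked."""
    return np.roll(x, shift=(-(M // 2), -(M // 2)), axis=(0, 1))

# toy example: an 8x8 map with window size 4 gives 4 windows in both configurations
x = np.arange(8 * 8, dtype=np.float32).reshape(8, 8, 1)
regular = window_partition(x, M=4)                    # W-MSA partition
shifted = window_partition(cyclic_shift(x, 4), M=4)   # SW-MSA partition via cyclic shift
print(regular.shape, shifted.shape)                   # (4, 4, 4, 1) (4, 4, 4, 1)
```
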
42 | - feature maps at different scales can be combined for down-stream tasks
43 | - 
44 |
45 |
46 |
47 | ### Experiments
48 |
49 | - image classification
50 | - 
51 | - object detection
52 | - 
53 | - semantic segmentation
54 | - 
55 |
--------------------------------------------------------------------------------
/cv/assets/ArcFace_01.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/ArcFace_01.jpeg
--------------------------------------------------------------------------------
/cv/assets/ArcFace_02.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/ArcFace_02.jpeg
--------------------------------------------------------------------------------
/cv/assets/ArcFace_03.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/ArcFace_03.jpeg
--------------------------------------------------------------------------------
/cv/assets/ArcFace_04.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/ArcFace_04.jpeg
--------------------------------------------------------------------------------
/cv/assets/ArcFace_05.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/ArcFace_05.jpeg
--------------------------------------------------------------------------------
/cv/assets/ArcFace_06.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/ArcFace_06.jpeg
--------------------------------------------------------------------------------
/cv/assets/ArcFace_07.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/ArcFace_07.jpeg
--------------------------------------------------------------------------------
/cv/assets/CRAFT_01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/CRAFT_01.png
--------------------------------------------------------------------------------
/cv/assets/CRAFT_02.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/CRAFT_02.png
--------------------------------------------------------------------------------
/cv/assets/CRAFT_03.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/CRAFT_03.png
--------------------------------------------------------------------------------
/cv/assets/CRAFT_04.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/CRAFT_04.png
--------------------------------------------------------------------------------
/cv/assets/CRAFT_05.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/CRAFT_05.png
--------------------------------------------------------------------------------
/cv/assets/CRAFT_06.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/CRAFT_06.png
--------------------------------------------------------------------------------
/cv/assets/DeiT_01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/DeiT_01.png
--------------------------------------------------------------------------------
/cv/assets/DeiT_02.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/DeiT_02.png
--------------------------------------------------------------------------------
/cv/assets/DeiT_03.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/DeiT_03.png
--------------------------------------------------------------------------------
/cv/assets/DeiT_04.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/DeiT_04.png
--------------------------------------------------------------------------------
/cv/assets/DeiT_05.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/DeiT_05.png
--------------------------------------------------------------------------------
/cv/assets/EfficientDet_01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/EfficientDet_01.png
--------------------------------------------------------------------------------
/cv/assets/EfficientDet_02.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/EfficientDet_02.png
--------------------------------------------------------------------------------
/cv/assets/EfficientDet_03.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/EfficientDet_03.png
--------------------------------------------------------------------------------
/cv/assets/EfficientDet_04.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/EfficientDet_04.png
--------------------------------------------------------------------------------
/cv/assets/EfficientDet_05.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/EfficientDet_05.png
--------------------------------------------------------------------------------
/cv/assets/EfficientDet_06.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/EfficientDet_06.png
--------------------------------------------------------------------------------
/cv/assets/EfficientDet_07.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/EfficientDet_07.png
--------------------------------------------------------------------------------
/cv/assets/EfficientDet_08.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/EfficientDet_08.png
--------------------------------------------------------------------------------
/cv/assets/EfficientDet_09.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/EfficientDet_09.png
--------------------------------------------------------------------------------
/cv/assets/EfficientDet_10.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/EfficientDet_10.png
--------------------------------------------------------------------------------
/cv/assets/EfficientNet_01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/EfficientNet_01.png
--------------------------------------------------------------------------------
/cv/assets/EfficientNet_02.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/EfficientNet_02.png
--------------------------------------------------------------------------------
/cv/assets/EfficientNet_03.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/EfficientNet_03.png
--------------------------------------------------------------------------------
/cv/assets/EfficientNet_04.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/EfficientNet_04.png
--------------------------------------------------------------------------------
/cv/assets/EfficientNet_05.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/EfficientNet_05.png
--------------------------------------------------------------------------------
/cv/assets/EfficientNet_06.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/EfficientNet_06.png
--------------------------------------------------------------------------------
/cv/assets/EfficientNet_07.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/EfficientNet_07.png
--------------------------------------------------------------------------------
/cv/assets/EfficientNet_08.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/EfficientNet_08.png
--------------------------------------------------------------------------------
/cv/assets/EfficientNet_09.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/EfficientNet_09.png
--------------------------------------------------------------------------------
/cv/assets/EfficientNet_10.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/EfficientNet_10.png
--------------------------------------------------------------------------------
/cv/assets/LAMBERT_01.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/LAMBERT_01.jpeg
--------------------------------------------------------------------------------
/cv/assets/LAMBERT_02.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/LAMBERT_02.jpeg
--------------------------------------------------------------------------------
/cv/assets/MAE_01.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/MAE_01.jpeg
--------------------------------------------------------------------------------
/cv/assets/MAE_02.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/MAE_02.jpeg
--------------------------------------------------------------------------------
/cv/assets/MAE_03.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/MAE_03.jpeg
--------------------------------------------------------------------------------
/cv/assets/MAE_04.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/MAE_04.jpeg
--------------------------------------------------------------------------------
/cv/assets/MAE_05.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/MAE_05.jpeg
--------------------------------------------------------------------------------
/cv/assets/MAE_06.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/MAE_06.jpeg
--------------------------------------------------------------------------------
/cv/assets/MAE_07.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/MAE_07.jpeg
--------------------------------------------------------------------------------
/cv/assets/MLP-Mixer_01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/MLP-Mixer_01.png
--------------------------------------------------------------------------------
/cv/assets/MLP-Mixer_02.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/MLP-Mixer_02.png
--------------------------------------------------------------------------------
/cv/assets/MLP-Mixer_03.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/MLP-Mixer_03.png
--------------------------------------------------------------------------------
/cv/assets/MnasNet_01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/MnasNet_01.png
--------------------------------------------------------------------------------
/cv/assets/MnasNet_02.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/MnasNet_02.png
--------------------------------------------------------------------------------
/cv/assets/MnasNet_03.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/MnasNet_03.png
--------------------------------------------------------------------------------
/cv/assets/MnasNet_04.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/MnasNet_04.png
--------------------------------------------------------------------------------
/cv/assets/MnasNet_05.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/MnasNet_05.png
--------------------------------------------------------------------------------
/cv/assets/MnasNet_06.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/MnasNet_06.png
--------------------------------------------------------------------------------
/cv/assets/MnasNet_07.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/MnasNet_07.png
--------------------------------------------------------------------------------
/cv/assets/MnasNet_08.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/MnasNet_08.png
--------------------------------------------------------------------------------
/cv/assets/Swin_01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/Swin_01.png
--------------------------------------------------------------------------------
/cv/assets/Swin_02.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/Swin_02.png
--------------------------------------------------------------------------------
/cv/assets/Swin_03.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/Swin_03.png
--------------------------------------------------------------------------------
/cv/assets/Swin_04.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/Swin_04.png
--------------------------------------------------------------------------------
/cv/assets/Swin_05.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/Swin_05.png
--------------------------------------------------------------------------------
/cv/assets/Swin_06.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/Swin_06.png
--------------------------------------------------------------------------------
/cv/assets/Swin_07.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/Swin_07.png
--------------------------------------------------------------------------------
/cv/assets/Swin_08.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/Swin_08.png
--------------------------------------------------------------------------------
/cv/assets/Swin_09.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/Swin_09.png
--------------------------------------------------------------------------------
/cv/assets/Swin_10.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/Swin_10.png
--------------------------------------------------------------------------------
/cv/assets/Swin_11.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/Swin_11.png
--------------------------------------------------------------------------------
/cv/assets/Swin_12.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/Swin_12.png
--------------------------------------------------------------------------------
/cv/assets/ViT_01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/ViT_01.png
--------------------------------------------------------------------------------
/cv/assets/ViT_02.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/ViT_02.png
--------------------------------------------------------------------------------
/cv/assets/ViT_03.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/ViT_03.png
--------------------------------------------------------------------------------
/cv/assets/ViT_04.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/ViT_04.png
--------------------------------------------------------------------------------
/cv/assets/ViT_05.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/ViT_05.png
--------------------------------------------------------------------------------
/cv/assets/ViT_06.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/ViT_06.png
--------------------------------------------------------------------------------
/cv/assets/crnn_01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/crnn_01.png
--------------------------------------------------------------------------------
/cv/assets/fast-rcnn_01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/fast-rcnn_01.png
--------------------------------------------------------------------------------
/cv/assets/fast-rcnn_02.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/fast-rcnn_02.png
--------------------------------------------------------------------------------
/cv/assets/fast-rcnn_03.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/fast-rcnn_03.png
--------------------------------------------------------------------------------
/cv/assets/faster-rcnn_01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/faster-rcnn_01.png
--------------------------------------------------------------------------------
/cv/assets/faster-rcnn_02.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/faster-rcnn_02.png
--------------------------------------------------------------------------------
/cv/assets/faster-rcnn_03.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/faster-rcnn_03.png
--------------------------------------------------------------------------------
/cv/assets/faster-rcnn_04.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/faster-rcnn_04.png
--------------------------------------------------------------------------------
/cv/assets/faster-rcnn_05.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/faster-rcnn_05.png
--------------------------------------------------------------------------------
/cv/assets/faster-rcnn_06.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/faster-rcnn_06.png
--------------------------------------------------------------------------------
/cv/assets/gMLP_01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/gMLP_01.png
--------------------------------------------------------------------------------
/cv/assets/gMLP_02.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/gMLP_02.png
--------------------------------------------------------------------------------
/cv/assets/gMLP_03.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/gMLP_03.png
--------------------------------------------------------------------------------
/cv/assets/iou_giou_diou_ciou_yc.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/iou_giou_diou_ciou_yc.jpeg
--------------------------------------------------------------------------------
/cv/assets/mask-rcnn_01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/mask-rcnn_01.png
--------------------------------------------------------------------------------
/cv/assets/mask-rcnn_02.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/mask-rcnn_02.png
--------------------------------------------------------------------------------
/cv/assets/mask-rcnn_03.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/mask-rcnn_03.png
--------------------------------------------------------------------------------
/cv/assets/mask-rcnn_04.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/mask-rcnn_04.png
--------------------------------------------------------------------------------
/cv/assets/rcnn_01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/rcnn_01.png
--------------------------------------------------------------------------------
/cv/assets/rcnn_02.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/rcnn_02.png
--------------------------------------------------------------------------------
/cv/assets/rcnn_03.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/rcnn_03.png
--------------------------------------------------------------------------------
/cv/assets/rcnn_04.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/rcnn_04.png
--------------------------------------------------------------------------------
/cv/assets/rcnn_05.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/rcnn_05.png
--------------------------------------------------------------------------------
/cv/assets/rcnn_06.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/rcnn_06.png
--------------------------------------------------------------------------------
/cv/assets/rcnn_07.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/rcnn_07.png
--------------------------------------------------------------------------------
/cv/assets/rcnn_08.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/rcnn_08.png
--------------------------------------------------------------------------------
/cv/assets/what-is-wrong-with-scene-text-recognition-model-comparisons_01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/what-is-wrong-with-scene-text-recognition-model-comparisons_01.png
--------------------------------------------------------------------------------
/cv/assets/what-is-wrong-with-scene-text-recognition-model-comparisons_02.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/what-is-wrong-with-scene-text-recognition-model-comparisons_02.png
--------------------------------------------------------------------------------
/cv/assets/what-is-wrong-with-scene-text-recognition-model-comparisons_03.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/what-is-wrong-with-scene-text-recognition-model-comparisons_03.png
--------------------------------------------------------------------------------
/cv/assets/what-is-wrong-with-scene-text-recognition-model-comparisons_04.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/what-is-wrong-with-scene-text-recognition-model-comparisons_04.png
--------------------------------------------------------------------------------
/cv/assets/what-is-wrong-with-scene-text-recognition-model-comparisons_05.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/what-is-wrong-with-scene-text-recognition-model-comparisons_05.png
--------------------------------------------------------------------------------
/cv/assets/yolo-v1_01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/yolo-v1_01.png
--------------------------------------------------------------------------------
/cv/assets/yolo-v1_02.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/yolo-v1_02.png
--------------------------------------------------------------------------------
/cv/assets/yolo-v1_03.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/yolo-v1_03.png
--------------------------------------------------------------------------------
/cv/assets/yolo-v1_04.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/yolo-v1_04.png
--------------------------------------------------------------------------------
/cv/assets/yolo-v2_01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/yolo-v2_01.png
--------------------------------------------------------------------------------
/cv/assets/yolo-v2_02.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/yolo-v2_02.png
--------------------------------------------------------------------------------
/cv/assets/yolo-v2_03.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/yolo-v2_03.png
--------------------------------------------------------------------------------
/cv/assets/yolo-v2_04.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/yolo-v2_04.png
--------------------------------------------------------------------------------
/cv/assets/yolo-v2_05.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/yolo-v2_05.png
--------------------------------------------------------------------------------
/cv/assets/yolo-v2_06.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/yolo-v2_06.png
--------------------------------------------------------------------------------
/cv/assets/yolo-v2_07.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/yolo-v2_07.png
--------------------------------------------------------------------------------
/cv/assets/yolo-v3_01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/yolo-v3_01.png
--------------------------------------------------------------------------------
/cv/assets/yolo-v3_02.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/yolo-v3_02.png
--------------------------------------------------------------------------------
/cv/assets/yolo-v3_03.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/yolo-v3_03.png
--------------------------------------------------------------------------------
/cv/assets/yolo-v4_01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/yolo-v4_01.png
--------------------------------------------------------------------------------
/cv/assets/yolo-v4_02.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/yolo-v4_02.png
--------------------------------------------------------------------------------
/cv/assets/yolo-v4_03.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/yolo-v4_03.png
--------------------------------------------------------------------------------
/cv/assets/yolo-v4_04.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/yolo-v4_04.png
--------------------------------------------------------------------------------
/cv/assets/yolo-v4_05.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/yolo-v4_05.png
--------------------------------------------------------------------------------
/cv/assets/yolo-v4_06.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/yolo-v4_06.png
--------------------------------------------------------------------------------
/cv/assets/yolo-v4_07.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/yolo-v4_07.png
--------------------------------------------------------------------------------
/cv/assets/yolo-v4_08.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/yolo-v4_08.png
--------------------------------------------------------------------------------
/cv/assets/yolo-v4_09.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/yolo-v4_09.png
--------------------------------------------------------------------------------
/cv/assets/yolo-v4_10.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/yolo-v4_10.png
--------------------------------------------------------------------------------
/cv/assets/yolo-v4_11.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/yolo-v4_11.png
--------------------------------------------------------------------------------
/cv/assets/yolo-v4_12.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/cv/assets/yolo-v4_12.png
--------------------------------------------------------------------------------
/cv/crnn.md:
--------------------------------------------------------------------------------
1 | # An End-to-End Trainable Neural Network for Image-based Sequence Recognition and Its Application to Scene Text Recognition (2015), Baoguang Shi et al.
2 |
3 | ###### contributors: [@GitYCC](https://github.com/GitYCC)
4 |
5 | \[[paper](https://arxiv.org/pdf/1507.05717)\] \[[code](https://github.com/GitYCC/crnn-pytorch)\]
6 |
7 | ---
8 |
9 | ### Prerequisite
10 |
11 | [Connectionist Temporal Classification: Labelling Unsegmented Sequence Data with Recurrent Neural Networks (2006), Alex Graves et al.](speech/ctc.md)
12 |
13 |
14 |
15 | ### Introduction
16 |
17 | - Compared with previous systems for scene text recognition, the proposed architecture possesses 4 distinctive properties:
18 | - end-to-end trainable
19 |   - handles sequences of arbitrary lengths
20 | - both lexicon-free and lexicon-based scene text recognition tasks
21 | - smaller model
22 | - The proposed neural network model is named Convolutional Recurrent Neural Network (CRNN), since it is a combination of a DCNN and an RNN.
23 |
24 |
25 |
26 | ### The Proposed Network Architecture
27 |
28 | - three components:
29 | - the convolutional layers
30 | - the recurrent layers
31 | - a transcription layer
32 |
33 | 
34 |
35 | #### Transcription
36 |
37 | - Probability of label sequence
38 |
39 | - We adopt the conditional probability defined in the Connectionist Temporal Classification (CTC) layer proposed by [Graves et al](https://www.cs.toronto.edu/~graves/icml_2006.pdf).
40 |   - $B$ maps $π$ onto $l$ by first removing the repeated labels and then removing the ’blank’s.
41 | - For example, $B$ maps `“--hh-e-l-ll-oo--”` (`-` represents blank) onto `“hello”`.
42 | - The conditional probability is defined as the sum of probabilities of all $π$ that are mapped by $B$ onto $l$:
43 | - $p(l|y)=\sum_{π:B(π)=l}p(π|y)$
44 | - $p(π|y)=\prod_{t=1}^{T}y^{t}_{π_t}$
45 | - $y^{t}_{π_t}$ is the probability of having label $π_t$ at time stamp $t$
46 |   - Directly computing this equation is infeasible because the number of summed paths grows exponentially with the sequence length $T$. However, it can be computed efficiently using the forward-backward algorithm described in [Graves et al](https://www.cs.toronto.edu/~graves/icml_2006.pdf). (A toy sketch of $B$ and the naive sum follows this list.)
47 |
48 | - Lexicon-free transcription
49 |
50 |   - $l^*\approx B(argmax_π p(π|y))$
51 |
52 | - Lexicon-based transcription
53 |
54 | - $l^*=argmax_{l\in D}p(l|y)$ (where: $D$ is a lexicon)
55 |
56 | - However, for large lexicons, it would be very time-consuming to perform an exhaustive search over the lexicon.
57 |
58 | - This indicates that we can limit our search to the nearest-neighbor candidates $N_δ(l′)$, where $δ$ is the maximal edit distance and $l′$ is the sequence transcribed from $y$ in lexicon-free mode:
59 | $$
60 | l^*=argmax_{l\in N_δ(l′)}p(l|y)
61 | $$
62 |     The candidates $N_δ(l′)$ can be found efficiently with the BK-tree data structure (a brute-force edit-distance filter is sketched below as a stand-in).
63 |
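To make the transcription step concrete, here is a toy Python sketch (ours, not code from the linked repo) of the mapping $B$ and the naive path sum; in practice the sum is computed with the CTC forward-backward algorithm rather than by enumeration.

```python
import itertools

import numpy as np


def B(path, blank=0):
    """Collapse repeated labels, then remove blanks: B("--hh-e-l-ll-oo--") -> "hello"."""
    collapsed = [label for label, _ in itertools.groupby(path)]   # remove repeats
    return tuple(label for label in collapsed if label != blank)  # remove blanks


def p_label_sequence(y, l, blank=0):
    """Naive p(l|y): sum of prod_t y[t, pi_t] over all paths pi with B(pi) = l.

    y: (T, num_classes) per-timestep label probabilities.
    Exponential in T -- for illustration only.
    """
    T, num_classes = y.shape
    total = 0.0
    for pi in itertools.product(range(num_classes), repeat=T):
        if B(pi, blank) == tuple(l):
            total += np.prod([y[t, pi[t]] for t in range(T)])
    return total


y = np.full((4, 3), 1.0 / 3)            # uniform toy distribution over {blank, 1, 2}
print(p_label_sequence(y, [1, 2]))      # probability of label sequence (1, 2)
```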
64 |
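For the lexicon-based mode, a rough sketch of restricting the search to $N_δ(l′)$; a brute-force edit-distance filter stands in for the BK-tree used in the paper, and the helper names are invented for this example.

```python
def edit_distance(a, b):
    """Levenshtein distance via dynamic programming (one rolling row)."""
    dp = list(range(len(b) + 1))
    for i, ca in enumerate(a, 1):
        prev, dp[0] = dp[0], i
        for j, cb in enumerate(b, 1):
            prev, dp[j] = dp[j], min(dp[j] + 1, dp[j - 1] + 1, prev + (ca != cb))
    return dp[-1]


def lexicon_based_transcription(lexicon, l_free, p_l_given_y, delta=3):
    """argmax of p(l|y) over the neighborhood N_delta(l') instead of the whole lexicon.

    lexicon: candidate words, l_free: lexicon-free transcription l',
    p_l_given_y: function returning p(l|y), e.g. via the CTC forward algorithm.
    """
    candidates = [w for w in lexicon if edit_distance(w, l_free) <= delta]
    return max(candidates, key=p_l_given_y) if candidates else l_free
```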
--------------------------------------------------------------------------------
/cv/faster-rcnn.md:
--------------------------------------------------------------------------------
1 | # Faster R-CNN: Towards Real-Time Object Detection with Region Proposal Networks (2016), Shaoqing Ren et al.
2 |
3 | ###### contributors: [@GitYCC](https://github.com/GitYCC)
4 |
5 | \[[paper](https://arxiv.org/abs/1506.01497)\] \[[code](https://github.com/rbgirshick/py-faster-rcnn)\]
6 |
7 | ---
8 |
9 | ### Prerequisite
10 |
11 | [Fast R-CNN (2015), Ross Girshick.](cv/fast-rcnn.md)
12 |
13 |
14 |
15 | ### Introduction
16 |
17 | - When ignoring the time spent on region proposals (e.g. Selective Search), Fast R-CNN achieves near real-time rates using very deep networks.
18 | - Region Proposal Networks (RPNs) are designed to efficiently predict region proposals with a wide range of scales and aspect ratios.
19 | - To unify RPNs with Fast R-CNN object detection networks, we propose a training scheme that alternates between fine-tuning for the region proposal task and then fine-tuning for object detection, while keeping the proposals fixed.
20 | - Briefly, Faster R-CNN replaces the "Selective Search" proposal stage of Fast R-CNN with an RPN.
21 |
22 |
23 |
24 | ### Faster R-CNN
25 |
26 | 
27 |
28 | **Conv Layers**
29 |
30 | - Zeiler and Fergus model (ZF)
31 | - 
32 | - last conv layer of ZF is `Conv5`, whose size is (6 x 6 x 256)
33 | - VGG-16
34 | - 
35 | - last conv layer of VGG16 is `Conv13`, whose size is (14 x 14 x 512)
36 |
37 |
38 |
39 | **Region Proposal Networks (RPNs)**
40 |
41 | 
42 |
43 | - To generate region proposals, we slide a small network over the convolutional feature map output by the last shared convolutional layer. This small network takes as input an $n × n$ ($n=3$) spatial window of the input convolutional feature map. Each sliding window is mapped to a lower-dimensional feature (256-d for ZF and 512-d for VGG, with ReLU following).
44 | - ZF: conv3-256 (padding size=1), generate 6 x 6 x 256 conv feature map
45 | - VGG: conv3-512 (padding size=1), generate 14 x 14 x 512 conv feature map
46 | - This feature is fed into two sibling fully-connected layers—a box-regression layer (*reg*) and a box-classification layer (*cls*). This architecture is naturally implemented with an $n×n$ convolutional layer followed by two sibling $1 × 1$ convolutional layers (for *reg* and *cls*, respectively).
47 | - Anchors: At each sliding-window location, we simultaneously predict multiple region proposals, where the number of maximum possible proposals for each location is denoted as $k$. So the *reg* layer has $4k$ outputs encoding the coordinates of $k$ boxes, and the *cls* layer outputs $2k$ scores that estimate probability of object or not object for each proposal.
48 |   - For anchors, we use 3 scales with box areas of $128^2$, $256^2$, and $512^2$ pixels, and 3 aspect ratios of 1:1, 1:2, and 2:1 ($k=9$)
49 | - *cls* layer: conv1-18 (no padding) -> softmax on pixel
50 | - *reg* layer: conv1-36 (no padding) -> regression on pixel
51 | - 
52 | - 
53 |
54 | - Some RPN proposals highly overlap with each other. To reduce redundancy, we adopt non-maximum suppression (NMS) on the proposal regions based on their *cls* scores. We fix the IoU threshold for NMS at 0.7, which leaves us about 2000 proposal regions per image.
55 |
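A rough PyTorch sketch of the RPN head described above (shapes follow the VGG-16 numbers; anchor generation, box decoding, and the losses are omitted, and this is an illustration rather than the authors' implementation):

```python
import torch
import torch.nn as nn
from torchvision.ops import nms


class RPNHead(nn.Module):
    """n x n conv (n=3) followed by two sibling 1x1 convs for cls and reg."""

    def __init__(self, in_channels=512, mid_channels=512, k=9):
        super().__init__()
        self.conv = nn.Conv2d(in_channels, mid_channels, kernel_size=3, padding=1)
        self.cls = nn.Conv2d(mid_channels, 2 * k, kernel_size=1)  # object / not-object scores
        self.reg = nn.Conv2d(mid_channels, 4 * k, kernel_size=1)  # box deltas per anchor

    def forward(self, feature_map):
        x = torch.relu(self.conv(feature_map))
        return self.cls(x), self.reg(x)


# e.g. VGG-16: a 14 x 14 x 512 feature map gives 14 * 14 * 9 anchors per image
head = RPNHead()
scores, deltas = head(torch.randn(1, 512, 14, 14))
print(scores.shape, deltas.shape)  # (1, 18, 14, 14), (1, 36, 14, 14)

# After decoding deltas into proposal boxes, redundancy is reduced with NMS, e.g.:
# keep = nms(boxes, objectness, iou_threshold=0.7)
```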
56 |
--------------------------------------------------------------------------------
/cv/gMLP.md:
--------------------------------------------------------------------------------
1 | # Pay Attention to MLPs (2021), Hanxiao Liu et al.
2 |
3 | ###### contributors: [@GitYCC](https://github.com/GitYCC)
4 |
5 | \[[paper](https://arxiv.org/abs/2105.08050)\] \[[code](https://github.com/rwightman/pytorch-image-models)\]
6 |
7 | ---
8 |
9 | - gMLP, based on MLPs with gating
10 | - 
11 | - In terms of computation cost, SGU has $n^2e/2$ multiply-adds which is comparable to the $2n^2d$ of dot-product self-attention.
12 | - Our comparisons show that self-attention is not critical for Vision Transformers, as gMLP can achieve the same accuracy.
13 | - 
14 | - survival prob (stochastic depth): for each mini-batch, randomly drop a subset of layers and bypass them with the identity function
15 | - 
16 | - For BERT, our model achieves parity with Transformers on pretraining perplexity and is better on some downstream NLP tasks. On finetuning tasks where gMLP performs worse, making the gMLP model substantially larger can close the gap with Transformers.
17 |
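A rough PyTorch sketch of the Spatial Gating Unit (SGU) at the core of the gMLP block, written from the paper's description rather than the linked implementation; `dim_ffn` and `seq_len` are illustrative names:

```python
import torch
import torch.nn as nn


class SpatialGatingUnit(nn.Module):
    """Split channels into (u, v); gate u with a spatial (token-mixing) projection of v."""

    def __init__(self, dim_ffn, seq_len):
        super().__init__()
        self.norm = nn.LayerNorm(dim_ffn // 2)
        self.spatial_proj = nn.Linear(seq_len, seq_len)
        # Initialize near an identity gate (weights ~ 0, bias = 1), as suggested in the paper.
        nn.init.zeros_(self.spatial_proj.weight)
        nn.init.ones_(self.spatial_proj.bias)

    def forward(self, x):                      # x: (batch, seq_len, dim_ffn)
        u, v = x.chunk(2, dim=-1)              # each: (batch, seq_len, dim_ffn // 2)
        v = self.norm(v)
        v = self.spatial_proj(v.transpose(1, 2)).transpose(1, 2)  # mix across tokens
        return u * v                           # elementwise gating


sgu = SpatialGatingUnit(dim_ffn=512, seq_len=196)
out = sgu(torch.randn(2, 196, 512))            # -> (2, 196, 256)
```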
18 |
--------------------------------------------------------------------------------
/cv/mask-rcnn.md:
--------------------------------------------------------------------------------
1 | # Mask R-CNN (2018), Kaiming He et al.
2 |
3 | ###### contributors: [@GitYCC](https://github.com/GitYCC)
4 |
5 | \[[paper](https://arxiv.org/abs/1703.06870)\] \[[code](https://github.com/facebookresearch/Detectron)\]
6 |
7 | ---
8 |
9 | ### Prerequisite
10 |
11 | [Faster R-CNN: Towards Real-Time Object Detection with Region Proposal Networks (2016), Shaoqing Ren et al.](cv/faster-rcnn.md)
12 |
13 |
14 |
15 | ### Introduction
16 |
17 | - We present a conceptually simple, flexible, and general framework for object instance segmentation.
18 | - The method, called Mask R-CNN, extends Faster R-CNN by adding a branch for predicting an object mask in parallel with the existing branch for bounding box recognition.
19 | - Change RoIPool to RoIAlign:
20 |   - Most importantly, Faster R-CNN was not designed for pixel-to-pixel alignment between network inputs and outputs.
21 | - To fix the misalignment, we propose a simple, quantization-free layer, called RoIAlign, that faithfully preserves exact spatial locations.
22 | - Despite being a seemingly minor change, RoIAlign has a large impact: it improves mask accuracy by relative 10% to 50%, showing bigger gains under stricter localization metrics.
23 | - We found it essential to decouple mask and class prediction: we predict a binary mask for each class independently, without competition among classes, and rely on the network’s RoI classification branch to predict the category.
24 |
25 | 
26 |
27 |
28 |
29 | ### Mask R-CNN
30 |
31 | 
32 |
33 | - Mask R-CNN is conceptually simple: Faster R-CNN has two outputs for each candidate object, a class label and a bounding-box offset; to this we add a third branch that outputs the object mask.
34 | - Formally, during training, we define a multi-task loss on each sampled RoI as $L = L_{cls} + L_{box} + L_{mask}$
35 | - To this we apply a per-pixel sigmoid, and define $L_{mask}$ as the average binary cross-entropy loss. For an RoI associated with ground-truth class $k$, $L_{mask}$ is only defined on the $k$-th mask (other mask outputs do not contribute to the loss).
36 | - Our definition of $L_{mask}$ allows the network to generate masks for every class without competition among classes; we rely on the dedicated classification branch to predict the class label used to select the output mask.
37 |
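A minimal sketch of the per-RoI mask loss described above (the indexing and names are ours, not Detectron's):

```python
import torch
import torch.nn.functional as F


def mask_loss(mask_logits, gt_masks, gt_classes):
    """L_mask: average binary cross-entropy on the k-th mask only.

    mask_logits: (num_rois, num_classes, m, m) raw outputs of the mask branch
    gt_masks:    (num_rois, m, m) binary ground-truth masks
    gt_classes:  (num_rois,) ground-truth class index k for each RoI
    """
    idx = torch.arange(mask_logits.size(0), device=mask_logits.device)
    selected = mask_logits[idx, gt_classes]      # only the k-th mask contributes
    return F.binary_cross_entropy_with_logits(selected, gt_masks.float())


# total multi-task loss per sampled RoI: L = L_cls + L_box + L_mask
```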
38 |
39 |
40 | **RoIAlign**
41 |
42 | - This pixel-to-pixel behavior requires our RoI features, which themselves are small feature maps, to be well aligned to faithfully preserve the explicit per-pixel spatial correspondence. This motivated us to develop the following RoIAlign layer that plays a key role in mask prediction.
43 | - These quantizations introduce misalignments between the RoI and the extracted features. While this may not impact classification, which is robust to small translations, it has a large negative effect on predicting pixel-accurate masks.
44 | - Our proposed change is simple: we avoid any quantization of the RoI boundaries or bins (i.e., we use $x/16$ instead of $[x/16]$). We use bi-linear interpolation to compute the exact values of the input features at four regularly sampled locations in each RoI bin, and aggregate the result (using max or average)
45 | - 
46 |
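For reference, torchvision ships an RoIAlign op; a small usage sketch (the feature map, boxes, and scale below are made-up values):

```python
import torch
from torchvision.ops import roi_align, roi_pool

features = torch.randn(1, 256, 50, 50)                  # backbone feature map at stride 16
boxes = torch.tensor([[0, 32.7, 16.3, 180.2, 144.9]])   # (batch_idx, x1, y1, x2, y2) in image coords

# RoIAlign: no quantization, bilinear sampling at regularly spaced points in each bin
aligned = roi_align(features, boxes, output_size=(7, 7),
                    spatial_scale=1 / 16, sampling_ratio=2, aligned=True)
# RoIPool: the quantized variant the paper replaces
pooled = roi_pool(features, boxes, output_size=(7, 7), spatial_scale=1 / 16)
print(aligned.shape, pooled.shape)                      # both (1, 256, 7, 7)
```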
47 |
48 |
49 | **Network Architecture**
50 |
51 | - Convolutional Backbone Architecture + Head Architecture
52 | - Backbone Architectures: ResNet-50, ResNet-101, ResNeXt-50, ResNeXt-101
53 | - Head Architecture: C4 and FPN
54 | - C4: extracted features from the final convolutional layer of the 4-th stage
55 | - FPN: Feature Pyramid Network, uses a top-down architecture with lateral connections to build an in-network feature pyramid from a single-scale input
56 | - 
57 | - All Network Architectures:
58 | - ResNet-50-C4, ResNet-101-C4, ResNeXt-50-C4, ResNeXt-101-C4
59 | - ResNet-50-FPN, ResNet-101-FPN, ResNeXt-50-FPN, ResNeXt-101-FPN
60 |
61 |
--------------------------------------------------------------------------------
/cv/yolo-v3.md:
--------------------------------------------------------------------------------
1 | # YOLOv3: An Incremental Improvement (2018), Joseph Redmon et al.
2 |
3 | ###### contributors: [@GitYCC](https://github.com/GitYCC)
4 |
5 | \[[paper](https://arxiv.org/abs/1804.02767)\] \[[code](https://pjreddie.com/darknet/yolo/)\]
6 |
7 | ---
8 |
9 | ### The Deal
10 |
11 | #### Bounding Box Prediction
12 |
13 | - Following YOLO9000 our system predicts bounding boxes using dimension clusters as anchor boxes.
14 | - the predictions correspond to:
15 | - $b_x =σ(t_x)+c_x$
16 | - $b_y =σ(t_y)+c_y$
17 | - $b_w =p_we^{t_w}$
18 | - $b_h =p_he^{t_h}$
19 | - During training we use sum of squared error loss.
20 |
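A small sketch of how the predicted offsets decode into a box, following the equations above (plain NumPy, illustrative values):

```python
import numpy as np


def decode_box(t, cell_xy, prior_wh):
    """Decode raw predictions (t_x, t_y, t_w, t_h) into (b_x, b_y, b_w, b_h).

    cell_xy:  (c_x, c_y) offset of the grid cell from the image's top-left corner
    prior_wh: (p_w, p_h) width/height of the anchor (dimension-cluster prior)
    """
    sigmoid = lambda z: 1.0 / (1.0 + np.exp(-z))
    t_x, t_y, t_w, t_h = t
    c_x, c_y = cell_xy
    p_w, p_h = prior_wh
    b_x = sigmoid(t_x) + c_x
    b_y = sigmoid(t_y) + c_y
    b_w = p_w * np.exp(t_w)
    b_h = p_h * np.exp(t_h)
    return b_x, b_y, b_w, b_h


print(decode_box((0.2, -0.5, 0.1, 0.3), cell_xy=(6, 4), prior_wh=(116, 90)))
```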
21 |
22 |
23 | #### Class Prediction
24 |
25 | - Each box predicts the classes the bounding box may contain using multilabel classification.
26 | - We simply use independent logistic classifiers. During training we use binary cross-entropy loss for the class predictions.
27 |
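The multilabel class head can be trained with independent logistic outputs and binary cross-entropy, e.g. (a minimal PyTorch illustration, not the Darknet code):

```python
import torch
import torch.nn as nn

num_classes = 80
class_logits = torch.randn(4, num_classes)     # raw class outputs for 4 predicted boxes
targets = torch.zeros(4, num_classes)
targets[0, [0, 56]] = 1.0                      # a box may carry several overlapping labels

loss = nn.BCEWithLogitsLoss()(class_logits, targets)  # independent sigmoid per class
```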
28 |
29 |
30 | #### Predictions Across Scales
31 |
32 | - YOLOv3 predicts boxes at 3 different scales. Our system extracts features from those scales using a similar concept to feature pyramid networks (FPN).
33 | - 
34 | - (from: [Feature Pyramid Networks for Object Detection (2017)](https://arxiv.org/pdf/1612.03144.pdf))
35 |
36 | - Non-max suppression is used for choosing boxes from 3 different scales.
37 |
38 |
39 |
40 | #### Feature Extractor
41 |
42 | Our new network Darknet-53 is a hybrid approach between the network used in YOLOv2, Darknet-19, and that newfangled residual network stuff. Our network uses successive 3 × 3 and 1 × 1 convolutional layers but now has some shortcut connections as well and is significantly larger.
43 |
44 | 
45 |
46 |
47 |
48 | ### Evaluation
49 |
50 | 
51 |
52 |
--------------------------------------------------------------------------------
/generative/assets/density-estimation-using-real-nvp_01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/generative/assets/density-estimation-using-real-nvp_01.png
--------------------------------------------------------------------------------
/generative/assets/density-estimation-using-real-nvp_02.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/generative/assets/density-estimation-using-real-nvp_02.png
--------------------------------------------------------------------------------
/generative/assets/density-estimation-using-real-nvp_03.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/generative/assets/density-estimation-using-real-nvp_03.png
--------------------------------------------------------------------------------
/generative/assets/density-estimation-using-real-nvp_04.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/generative/assets/density-estimation-using-real-nvp_04.png
--------------------------------------------------------------------------------
/generative/assets/density-estimation-using-real-nvp_05.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/generative/assets/density-estimation-using-real-nvp_05.png
--------------------------------------------------------------------------------
/generative/assets/density-estimation-using-real-nvp_06.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/generative/assets/density-estimation-using-real-nvp_06.png
--------------------------------------------------------------------------------
/generative/assets/density-estimation-using-real-nvp_07.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/generative/assets/density-estimation-using-real-nvp_07.png
--------------------------------------------------------------------------------
/generative/assets/density-estimation-using-real-nvp_08.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/generative/assets/density-estimation-using-real-nvp_08.png
--------------------------------------------------------------------------------
/generative/assets/glow_02.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/generative/assets/glow_02.png
--------------------------------------------------------------------------------
/generative/assets/glow_03.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/generative/assets/glow_03.png
--------------------------------------------------------------------------------
/generative/assets/glow_04.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/generative/assets/glow_04.png
--------------------------------------------------------------------------------
/generative/assets/glow_05.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/generative/assets/glow_05.png
--------------------------------------------------------------------------------
/generative/assets/glow_06.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/generative/assets/glow_06.png
--------------------------------------------------------------------------------
/generative/assets/glow_07.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/generative/assets/glow_07.png
--------------------------------------------------------------------------------
/generative/assets/glow_08.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/generative/assets/glow_08.png
--------------------------------------------------------------------------------
/gnn/assets/GRAPE_01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/gnn/assets/GRAPE_01.png
--------------------------------------------------------------------------------
/gnn/assets/GRAPE_02.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/gnn/assets/GRAPE_02.png
--------------------------------------------------------------------------------
/gnn/assets/GRAPE_03.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/gnn/assets/GRAPE_03.png
--------------------------------------------------------------------------------
/gnn/assets/GRAPE_04.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/gnn/assets/GRAPE_04.png
--------------------------------------------------------------------------------
/gnn/assets/GRAPE_05.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/gnn/assets/GRAPE_05.png
--------------------------------------------------------------------------------
/gnn/assets/GRAPE_06.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/gnn/assets/GRAPE_06.png
--------------------------------------------------------------------------------
/gnn/assets/SpellGCN_01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/gnn/assets/SpellGCN_01.png
--------------------------------------------------------------------------------
/gnn/assets/SpellGCN_02.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/gnn/assets/SpellGCN_02.png
--------------------------------------------------------------------------------
/gnn/assets/SpellGCN_03.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/gnn/assets/SpellGCN_03.png
--------------------------------------------------------------------------------
/gnn/assets/deep-walk_01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/gnn/assets/deep-walk_01.png
--------------------------------------------------------------------------------
/gnn/assets/deep-walk_02.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/gnn/assets/deep-walk_02.png
--------------------------------------------------------------------------------
/gnn/assets/deep-walk_03.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/gnn/assets/deep-walk_03.png
--------------------------------------------------------------------------------
/gnn/assets/deep-walk_04.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/gnn/assets/deep-walk_04.png
--------------------------------------------------------------------------------
/gnn/assets/deep-walk_05.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/gnn/assets/deep-walk_05.png
--------------------------------------------------------------------------------
/gnn/assets/deep-walk_06.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/gnn/assets/deep-walk_06.png
--------------------------------------------------------------------------------
/gnn/assets/gcn_01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/gnn/assets/gcn_01.png
--------------------------------------------------------------------------------
/gnn/assets/gin_01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/gnn/assets/gin_01.png
--------------------------------------------------------------------------------
/gnn/assets/gin_02.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/gnn/assets/gin_02.png
--------------------------------------------------------------------------------
/gnn/assets/gin_03.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/gnn/assets/gin_03.png
--------------------------------------------------------------------------------
/gnn/assets/gin_04.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/gnn/assets/gin_04.png
--------------------------------------------------------------------------------
/gnn/assets/gin_05.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/gnn/assets/gin_05.png
--------------------------------------------------------------------------------
/gnn/assets/gin_06.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/gnn/assets/gin_06.png
--------------------------------------------------------------------------------
/gnn/assets/graph-neural-networks-a-review-of-methods-and-applications_01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/gnn/assets/graph-neural-networks-a-review-of-methods-and-applications_01.png
--------------------------------------------------------------------------------
/gnn/assets/graph-neural-networks-a-review-of-methods-and-applications_02.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/gnn/assets/graph-neural-networks-a-review-of-methods-and-applications_02.png
--------------------------------------------------------------------------------
/gnn/assets/graph-neural-networks-a-review-of-methods-and-applications_03.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/gnn/assets/graph-neural-networks-a-review-of-methods-and-applications_03.png
--------------------------------------------------------------------------------
/gnn/assets/graph-neural-networks-a-review-of-methods-and-applications_04.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/gnn/assets/graph-neural-networks-a-review-of-methods-and-applications_04.png
--------------------------------------------------------------------------------
/gnn/assets/graph-neural-networks-a-review-of-methods-and-applications_05.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/gnn/assets/graph-neural-networks-a-review-of-methods-and-applications_05.png
--------------------------------------------------------------------------------
/gnn/assets/graph-sage_01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/gnn/assets/graph-sage_01.png
--------------------------------------------------------------------------------
/gnn/assets/graph-sage_02.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/gnn/assets/graph-sage_02.png
--------------------------------------------------------------------------------
/gnn/assets/graph-sage_03.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/gnn/assets/graph-sage_03.png
--------------------------------------------------------------------------------
/gnn/deep-walk.md:
--------------------------------------------------------------------------------
1 | # DeepWalk: Online Learning of Social Representations (2014), B. Perozzi et al.
2 |
3 | ###### contributors: [@GitYCC](https://github.com/GitYCC)
4 |
5 | \[[paper](https://dl.acm.org/doi/10.1145/2623330.2623732)\] \[[code](https://github.com/phanein/deepwalk)\]
6 |
7 | ---
8 |
9 | **objective**
10 |
11 | Given a graph, learn embeddings of its nodes to assist label classification.
12 |
13 | - Instead of mixing the label space as part of the feature space, we propose an unsupervised method which learns features that capture the graph structure independent of the labels’ distribution.
14 |
15 | 
16 |
17 | **design**
18 |
19 | The algorithm consists of two main components: a random walk generator and an update procedure.
20 |
21 | 
22 |
23 | - benefit of random walk
24 | 1. local exploration is easy to parallelize. Several random walkers (in different threads, processes, or machines) can simultaneously explore different parts of the same graph.
25 | 2. relying on information obtained from short random walks makes it possible to accommodate small changes in the graph structure without the need for global recomputation. We can iteratively update the learned model with new random walks from the changed region in time sub-linear in the size of the entire graph.
26 |
27 | 
28 |
29 | - Hierarchical Softmax
30 | - line 2: Build a binary Tree T from V
31 | - detail in [Efficient estimation of word representations in vector space (2013), T. Mikolov et al.](../nlp/efficient-estimation-of-word-representations-in-vector-space.md)
32 |
33 | 
34 |
35 | - SkipGram:
36 | - 
37 |
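A minimal end-to-end sketch of the two components (truncated random walks feeding a SkipGram model with hierarchical softmax), assuming `networkx` for the graph and `gensim` >= 4.0 for SkipGram; this is illustrative, not the authors' implementation:

```python
# DeepWalk sketch: random walks + SkipGram (sg=1) with hierarchical softmax (hs=1).
import random
import networkx as nx
from gensim.models import Word2Vec

def random_walk(graph, start, walk_length):
    """Generate one truncated random walk starting from `start`."""
    walk = [start]
    for _ in range(walk_length - 1):
        neighbors = list(graph.neighbors(walk[-1]))
        if not neighbors:
            break
        walk.append(random.choice(neighbors))
    return [str(v) for v in walk]  # gensim expects string tokens

def deepwalk_embeddings(graph, walks_per_node=10, walk_length=40,
                        window=5, dim=128):
    walks = []
    nodes = list(graph.nodes())
    for _ in range(walks_per_node):        # several passes over the graph
        random.shuffle(nodes)              # shuffle nodes before each pass
        walks.extend(random_walk(graph, v, walk_length) for v in nodes)
    model = Word2Vec(sentences=walks, vector_size=dim, window=window,
                     min_count=0, sg=1, hs=1, workers=4)
    return {v: model.wv[str(v)] for v in graph.nodes()}

if __name__ == "__main__":
    emb = deepwalk_embeddings(nx.karate_club_graph())
    print(len(emb), emb[0].shape)
```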
38 | **result**
39 |
40 | 
--------------------------------------------------------------------------------
/gnn/gin.md:
--------------------------------------------------------------------------------
1 | # How Powerful Are Graph Neural Networks? (2018), K. Xu et al.
2 |
3 | ###### contributors: [@GitYCC](https://github.com/GitYCC)
4 |
5 | \[[paper](https://arxiv.org/abs/1810.00826)\] \[[pytorch](https://pytorch-geometric.readthedocs.io/en/latest/modules/nn.html#torch_geometric.nn.conv.GINConv)\]
6 |
7 | ---
8 |
9 | ### Abstract
10 |
11 | - We then develop a simple architecture that is provably the most expressive among the class of GNNs and is as powerful as the Weisfeiler-Lehman graph isomorphism test. We empirically validate our theoretical findings on a number of graph classification benchmarks, and demonstrate that our model achieves state-of-the-art performance.
12 |
13 |
14 |
15 | ### Introduction
16 |
17 | - Many GNN variants with different neighborhood aggregation and graph-level pooling schemes have been proposed. However, the design of new GNNs is mostly based on empirical intuition, heuristics, and experimental trial-and-error. There is little theoretical understanding of the properties and limitations of GNNs, and formal analysis of GNNs’ representational capacity is limited.
18 | - Our framework is inspired by the close connection between GNNs and the Weisfeiler-Lehman (WL) graph isomorphism test, a powerful test known to distinguish a broad class of graphs.
19 | - We identify graph structures that cannot be distinguished by popular GNN variants, such as GCN and GraphSAGE, and we precisely characterize the kinds of graph structures such GNN-based models can capture.
20 | - We develop a simple neural architecture, Graph Isomorphism Network (GIN), and show that its discriminative/representational power is equal to the power of the WL test.
21 |
22 |
23 |
24 | ### Preliminaries
25 |
26 | - Weisfeiler-Lehman test
27 | - The graph isomorphism problem asks whether two graphs are topologically identical. This is a challenging problem: no polynomial-time algorithm is known for it yet.
28 | - The WL test iteratively (1) aggregates the labels of nodes and their neighborhoods, and (2) hashes the aggregated labels into unique new labels. The algorithm decides that two graphs are non-isomorphic if at some iteration the labels of the nodes between the two graphs differ.
29 | - 
30 |
31 |
32 |
33 | ### Building Powerful Graph Neural Networks
34 |
35 | - 
36 | - If the neighbor aggregation and graph-level readout functions are **injective**, then the resulting GNN is as powerful as the WL test.
37 | - 
38 | - GNN satisfying the criteria in Theorem 3 generalizes the WL test by learning to embed the subtrees to low-dimensional space. This enables GNNs to not only discriminate different structures, but also to learn to map similar graph structures to similar embeddings and capture dependencies between graph structures. Capturing structural similarity of the node labels is shown to be helpful for generalization particularly when the co-occurrence of subtrees is sparse across different graphs or there are noisy edges and node features.
39 | - Graph Isomorphism Network (GIN) (a code sketch of its update rule appears at the end of this summary)
40 | - 
41 | - 
42 | - the neighbor aggregation and graph-level readout functions of GIN are **injective**
43 | - 1-Layer Perceptrons are not Sufficient
44 | - GCN and GraphSAGE do not satisfy the conditions in Theorem 3
45 | - 
46 | - The mean aggregator may perform well if, for the task, the statistical and distributional information in the graph is more important than the exact structure. Moreover, when node features are diverse and rarely repeat, the mean aggregator is as powerful as the sum aggregator. This may explain why, despite the limitations identified in Section 5.2, GNNs with mean aggregators are effective for node classification tasks, such as classifying article subjects and community detection, where node features are rich and the distribution of the neighborhood features provides a strong signal for the task.
47 |
48 |
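A minimal PyTorch sketch of the GIN update $h_v^{(k)}=\text{MLP}\left((1+\epsilon)\,h_v^{(k-1)}+\sum_{u\in\mathcal{N}(v)}h_u^{(k-1)}\right)$ with a sum readout; it uses a dense adjacency matrix for clarity and is not the `torch_geometric` `GINConv` implementation:

```python
import torch
import torch.nn as nn

class GINLayer(nn.Module):
    def __init__(self, in_dim, out_dim, eps=0.0):
        super().__init__()
        self.eps = nn.Parameter(torch.tensor(eps))   # learnable epsilon
        self.mlp = nn.Sequential(                    # MLP (>= 2 layers) for injectivity
            nn.Linear(in_dim, out_dim), nn.ReLU(), nn.Linear(out_dim, out_dim)
        )

    def forward(self, adj, h):
        # adj: (N, N) dense 0/1 adjacency without self-loops, h: (N, in_dim)
        neighbor_sum = adj @ h                       # sum aggregation over neighbors
        return self.mlp((1.0 + self.eps) * h + neighbor_sum)

def graph_readout(h_per_layer):
    # Graph-level readout: sum over nodes at each layer, then concatenate layers.
    return torch.cat([h.sum(dim=0) for h in h_per_layer], dim=-1)

if __name__ == "__main__":
    adj = torch.tensor([[0., 1., 1.], [1., 0., 0.], [1., 0., 0.]])
    h = torch.randn(3, 8)
    h1 = GINLayer(8, 16)(adj, h)
    print(graph_readout([h, h1]).shape)   # torch.Size([24])
```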
--------------------------------------------------------------------------------
/index.html:
--------------------------------------------------------------------------------
[single-page HTML scaffold omitted; recoverable content: page title "Machine Learning Papers Summary"]
--------------------------------------------------------------------------------
/nlp/BiDAF.md:
--------------------------------------------------------------------------------
1 | # Bi-Directional Attention Flow for Machine Comprehension (2017), Minjoon Seo et al.
2 |
3 | ###### contributors: [@GitYCC](https://github.com/GitYCC)
4 |
5 | \[[paper](https://arxiv.org/abs/1611.01603)\] \[[tensorflow](https://github.com/allenai/bi-att-flow)\]
6 |
7 | ---
8 |
9 | ### Introduction
10 |
11 | - Our attention mechanism offers the following improvements over previously popular attention paradigms
12 | - First, our attention layer is not used to summarize the context paragraph into a fixed-size vector. Instead, the attention is computed for every time step, and the attended vector at each time step, along with the representations from previous layers, is allowed to flow through to the subsequent modeling layer. This reduces the information loss caused by early summarization.
13 | - Second, we use a memory-less attention mechanism.
14 | - Third, we use attention mechanisms in both directions, query-to-context and context-to-query, which provide complementary information to each other.
15 |
16 | ### Model
17 |
18 | 
19 |
20 | - (applied to both the query and context) Character Embedding Layer
21 | - maps each word to a vector space using character-level CNNs
22 | - (applied to both the query and context) Word Embedding Layer
23 | - maps each word to a vector space using a pre-trained word embedding model (GloVe)
24 | - The concatenation of the character and word embedding vectors is passed to a two-layer Highway Network
25 | - (applied to both the query and context) Contextual Embedding Layer
26 | - utilizes contextual cues from surrounding words to refine the embedding of the words
27 | - We place an LSTM in both directions, and concatenate the outputs of the two LSTMs.
28 | - Attention Flow Layer
29 | - couples the query and context vectors and produces a set of query-aware feature vectors for each word in the context
30 | - Similarity Matrix $S_{tj}=\alpha(H_{:t},U_{:j})\in \real$
31 | - context part: $H_{:t}$ is t-th column vector of $H$
32 | - query part: $U_{:j}$ is j-th column vector of $U$
33 | - $\alpha$ is a trainable scalar function that encodes the similarity between its two input vectors, we choose: $\alpha(h,u)=w_{(S)}^T[h;u;h\circ u]$
34 | - $w_{(S)}\in \real^{6d}$ is a trainable weight vector
35 | - $\circ$ is elementwise multiplication
36 | - $[;]$ is vector concatenation across row
37 | - Context-to-query (C2Q) Attention
38 | - signifies which query words are most relevant to each context word
39 | - $\tilde{U}_{:t}=\sum_j a_{tj}U_{:j}\in \real^{2d}$ where: $\tilde{U}\in \real^{2d\times T}$
40 | - $a_{t}=softmax(S_{t:})\in \real^J$
41 | - Query-to-context (Q2C) Attention
42 | - signifies which context words have the closest similarity to one of the query words and are hence critical for answering the query
43 | - $\tilde{H}=\begin{bmatrix} \tilde{h}, \tilde{h}, \cdots,\tilde{h} \end{bmatrix}\in \real^{2d\times T}$
44 | - $\tilde{h}=\sum_t b_{t}H_{:t}\in \real^{2d}$
45 | - $b=softmax(max_{col}(S))\in \real^{T}$
46 | - $\tilde{h}$ indicates the weighted sum of the most important words in the context with respect to the query.
47 | - Combined together to yield $G$
48 | - $G_{:t}=\beta(H_{:t},\tilde{U}_{:t},\tilde{H}_{:t})\in \real^{d_{G}}$
49 | - $\beta$ function can be an arbitrary trainable neural network, but simple concatenation as following still shows good performance in our experiments: $\beta(h,\tilde{u},\tilde{h})=[h;\tilde{u};h\circ \tilde{u};h\circ \tilde{h}]\in \real^{8d\times T}$
50 | - Modeling Layer $M\in \real^{2d\times T}$
51 | - employs a Recurrent Neural Network to scan the context
52 | - Output Layer
53 | - provides an answer to the query
54 | - $p^{start}=softmax(w^T_{(p^{start})}[G;M^{start}])$ and $p^{end}=softmax(w^T_{(p^{end})}[G;M^{end}])$
55 | - training: $L(\theta)=-\frac{1}{N}\sum_i log(p^{start}_{y_i^{start}})+log(p^{end}_{y_i^{end}})$
56 | - testing: The answer span $(k,l)$ where $k≤l$ with the maximum value of $p^{start}_kp^{end}_l$ is chosen, which can be computed in linear time with dynamic programming.
57 |
58 |
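A minimal NumPy sketch of the attention flow layer above (similarity matrix, C2Q, Q2C, and the combined $G$); shapes follow the summary, and the weights are random placeholders rather than trained parameters:

```python
import numpy as np

def softmax(x, axis=-1):
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

def attention_flow(H, U, w_s):
    # H: (2d, T) context, U: (2d, J) query, w_s: (6d,) trainable weight
    d2, T = H.shape
    _, J = U.shape
    # Similarity matrix S[t, j] = w_s . [h; u; h*u]
    S = np.empty((T, J))
    for t in range(T):
        for j in range(J):
            h, u = H[:, t], U[:, j]
            S[t, j] = w_s @ np.concatenate([h, u, h * u])
    # Context-to-query: for each context word, attend over query words
    A = softmax(S, axis=1)                 # (T, J)
    U_tilde = U @ A.T                      # (2d, T)
    # Query-to-context: attend over context words once, then tile across T
    b = softmax(S.max(axis=1))             # (T,)
    h_tilde = H @ b                        # (2d,)
    H_tilde = np.tile(h_tilde[:, None], (1, T))
    # G[:, t] = [h; u~; h*u~; h*h~]  ->  (8d, T)
    return np.concatenate([H, U_tilde, H * U_tilde, H * H_tilde], axis=0)

if __name__ == "__main__":
    d, T, J = 4, 6, 3
    G = attention_flow(np.random.randn(2*d, T), np.random.randn(2*d, J),
                       np.random.randn(6*d))
    print(G.shape)  # (32, 6) == (8d, T)
```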
--------------------------------------------------------------------------------
/nlp/CNN-for-sentence-classification.md:
--------------------------------------------------------------------------------
1 | # Convolutional Neural Networks for Sentence Classification (2014), Yoon Kim.
2 |
3 | ###### contributors: [@GitYCC](https://github.com/GitYCC)
4 |
5 | \[[paper](https://arxiv.org/abs/1408.5882)\]
6 |
7 | ---
8 |
9 | ### Model
10 |
11 | 
12 |
13 | - step by step
14 | - word vectors: use `word2vec`
15 | - we train a simple CNN with one layer of convolution on top of word vectors
16 | - apply a max-over-time pooling operation over the feature map and take the maximum value
17 | - fully connected layer with dropout and softmax output
18 | - constraint on l2-norms of the weight vectors: rescaling $w$ to have $||w||_2 = s$ whenever $||w||_2 > s$ after a gradient descent step
19 | - Model Variations
20 | - CNN-rand: Our baseline model where all words are randomly initialized and then modified during training.
21 | - CNN-static: pre-trained vectors from `word2vec` and fixed the word vectors
22 | - CNN-non-static: Same as above but the pre-trained vectors are fine-tuned for each task.
23 | - CNN-multichannel: A model with two sets of word vectors. One channel is for static word vectors, another channel is for non-static word vectors. Hence the model is able to fine-tune one set of vectors while keeping the other static. Both channels are initialized with `word2vec`.
24 |
25 |
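A minimal PyTorch sketch of the model: one convolution layer over word vectors with several filter widths, max-over-time pooling, and a dropout + softmax classifier. This is the CNN-rand variant with a randomly initialized embedding; hyperparameters are illustrative:

```python
import torch
import torch.nn as nn

class TextCNN(nn.Module):
    def __init__(self, vocab_size, embed_dim=300, num_classes=2,
                 filter_sizes=(3, 4, 5), num_filters=100, dropout=0.5):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_dim)     # CNN-rand variant
        self.convs = nn.ModuleList(
            [nn.Conv1d(embed_dim, num_filters, k) for k in filter_sizes])
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(num_filters * len(filter_sizes), num_classes)

    def forward(self, tokens):                  # tokens: (batch, seq_len)
        x = self.embed(tokens).transpose(1, 2)  # (batch, embed_dim, seq_len)
        # max-over-time pooling of each feature map
        pooled = [torch.relu(conv(x)).max(dim=2).values for conv in self.convs]
        return self.fc(self.dropout(torch.cat(pooled, dim=1)))

if __name__ == "__main__":
    model = TextCNN(vocab_size=10000)
    logits = model(torch.randint(0, 10000, (8, 20)))
    print(logits.shape)  # torch.Size([8, 2])
```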
26 |
27 | ### Result
28 |
29 | 
30 |
31 | - These results suggest that the pre-trained vectors are good, ‘universal’ feature extractors and can be utilized across datasets. Fine-tuning the pre-trained vectors for each task gives still further improvements (CNN-non-static).
32 | - Multichannel vs. Single Channel Models:
33 | - The multichannel model works better than the single-channel model
34 | - prevent overfitting: by ensuring that the learned vectors do not deviate too far from the original values
35 |
36 | 
37 |
38 | - problem: `word2vec` treats words that are (almost) syntactically equivalent as similar, even when they express opposite sentiment (e.g., *good* vs. *bad*)
39 | - fine-tuned vectors in the non-static channel mitigate this problem, as shown in the picture above
--------------------------------------------------------------------------------
/nlp/GPT.md:
--------------------------------------------------------------------------------
1 | # Improving Language Understanding by Generative Pre-Training (2018), Alec Radford et al.
2 |
3 | ###### contributors: [@GitYCC](https://github.com/GitYCC)
4 |
5 | \[[paper](https://s3-us-west-2.amazonaws.com/openai-assets/research-covers/language-unsupervised/language_understanding_paper.pdf)\] \[[pytorch](https://github.com/huggingface/transformers/blob/master/src/transformers/models/openai/modeling_openai.py)\]
6 |
7 | ---
8 |
9 | 
10 |
11 | (from: **BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding** (2019), Jacob Devlin et al. \[➤ [summary](nlp/bert.md)\])
12 |
13 | - Architecture: Language Model + Transformer Layer
14 |
15 | - Two Stage:
16 |
17 | - Unsupervised pre-training
18 |
19 | - Given an unsupervised corpus of tokens $\mathcal{U} = \{u_1, \dots, u_n\}$, we use a standard language modeling objective to maximize the following likelihood:
20 | $$
21 | L_1(\mathcal{U})=\sum_i \log P(u_i|u_{i-k},\dots,u_{i-1};\Theta)
22 | $$
23 | where $k$ is the size of the context window, and the conditional probability $P$ is modeled using a neural network with parameters $\Theta$. These parameters are trained using stochastic gradient descent.
24 |
25 | - We use a multi-layer Transformer decoder. This model applies a multi-headed self-attention operation over the input context tokens followed by position-wise feedforward layers to produce an output distribution over target tokens:
26 | $$
27 | h_0=UW_e+W_p \\
28 | h_l=\text{transformer-block}(h_{l-1}) \\
29 | P(u)=\text{softmax}(h_nW_e^T)
30 | $$
31 | where $U = (u_{−k},..., u_{−1})$ is the context vector of tokens, $n$ is the number of layers, $W_e$ is the token embedding matrix, and $W_p$ is the position embedding matrix.
32 |
33 | - Supervised fine-tuning
34 |
35 | - The inputs are passed through our pre-trained model to obtain the final transformer block’s activation $h^m_l$ , which is then fed into an added linear output layer with parameters $W_y$ to predict $y$:
36 | $$
37 | P(y|x^1,...,x^m)=\text{softmax}(h_l^mW_y)
38 | $$
39 | This gives us the following objective to maximize:
40 | $$
41 | L_2(\mathcal{C})=\sum_{(x,y)}logP(y|x^1,...,x^m)
42 | $$
43 |
44 | - We additionally found that including language modeling as an auxiliary objective to the fine-tuning helped learning by (a) improving generalization of the supervised model, and (b) accelerating convergence.
45 | $$
46 | L_3(\mathcal{C})=L_2(\mathcal{C})+\lambda L_1(\mathcal{C})
47 | $$
48 |
49 | - Task-specific input transformations:
50 |
51 | - 
52 | - special tokens: a start token, an end token, and a delimiter token `<$>`
53 | - (note): For similarity tasks, there is no inherent ordering of the two sentences being compared. To reflect this, we modify the input sequence to contain both possible sentence orderings.
54 |
55 | - Experiments
56 |
57 | - 
58 | - 
59 | - 
60 |
61 |
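A minimal sketch of the fine-tuning objective $L_3=L_2+\lambda L_1$ in PyTorch; the transformer body is abstracted away as a hypothetical `hidden` tensor, so this only illustrates how the auxiliary language-modeling loss is combined with the classification loss:

```python
import torch
import torch.nn.functional as F

def gpt_finetune_loss(hidden, W_e, W_y, tokens, labels, lam=0.5):
    # hidden: (batch, seq, d) final transformer activations (hypothetical input)
    # Auxiliary LM loss L1: predict token t+1 from the hidden state at position t
    lm_logits = hidden @ W_e.T                       # (batch, seq, vocab)
    l1 = F.cross_entropy(lm_logits[:, :-1].reshape(-1, W_e.shape[0]),
                         tokens[:, 1:].reshape(-1))
    # Supervised loss L2: classify from the final position's hidden state
    clf_logits = hidden[:, -1] @ W_y                 # (batch, num_classes)
    l2 = F.cross_entropy(clf_logits, labels)
    return l2 + lam * l1                             # L3 = L2 + lambda * L1

if __name__ == "__main__":
    B, S, D, V, C = 4, 16, 32, 100, 3
    loss = gpt_finetune_loss(torch.randn(B, S, D), torch.randn(V, D),
                             torch.randn(D, C), torch.randint(0, V, (B, S)),
                             torch.randint(0, C, (B,)))
    print(loss.item())
```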
--------------------------------------------------------------------------------
/nlp/GPT3.md:
--------------------------------------------------------------------------------
1 | # Language Models are Few-Shot Learners (2020), Tom B. Brown et al.
2 |
3 | ###### contributors: [@GitYCC](https://github.com/GitYCC)
4 |
5 | \[[paper](https://arxiv.org/pdf/2005.14165.pdf)\]
6 |
7 | ---
8 |
9 | - Prerequisite: **Language Models are Unsupervised Multitask Learners** (2019), Alec Radford et al. \[➤ [summary](nlp/GPT2.md)\]
10 | - TL;DR: We train GPT-3, an autoregressive language model with 175 billion parameters, 10x more than any previous non-sparse language model, and test its performance in the few-shot setting. For all tasks, GPT-3 is applied without any gradient updates or fine-tuning, with tasks and few-shot demonstrations specified purely via text interaction with the model. GPT-3 achieves strong performance on many NLP datasets, including translation, question-answering, and cloze tasks, as well as several tasks that require on-the-fly reasoning or domain adaptation, such as unscrambling words, using a novel word in a sentence, or performing 3-digit arithmetic.
11 | - We evaluate GPT-3 under 3 conditions:
12 | - (a) “few-shot learning”, or in-context learning where we allow as many demonstrations as will fit into the model’s context window (typically 10 to 100)
13 | - (b) “one-shot learning”, where we allow only one demonstration
14 | - (c) “zero-shot” learning, where no demonstrations are allowed and only an instruction in natural language is given to the model. GPT-3 could also in principle be evaluated in the traditional fine-tuning setting, but we leave this to future work.
15 | - In this work we do not fine-tune GPT-3 because our focus is on task-agnostic performance, but GPT-3 can be fine-tuned in principle and this is a promising direction for future work.
16 | - 
17 | - The large amount of internet data creates potential contamination of downstream tasks, since their test or development sets may have been inadvertently seen during pre-training. To reduce such contamination, we searched for and attempted to remove any overlaps with the development and test sets of all benchmarks studied in this paper.
18 |
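A minimal, hypothetical sketch of how a few-shot prompt is assembled for in-context learning (task description, $K$ demonstrations, then the query to complete), modeled on the paper's English-to-French example; no gradient updates are involved:

```python
def build_prompt(task_description, demonstrations, query):
    lines = [task_description, ""]
    for source, target in demonstrations:     # K in-context demonstrations
        lines.append(f"{source} => {target}")
    lines.append(f"{query} =>")               # the model completes this line
    return "\n".join(lines)

if __name__ == "__main__":
    demos = [("sea otter", "loutre de mer"), ("cheese", "fromage")]
    print(build_prompt("Translate English to French:", demos, "plush giraffe"))
```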
19 | ---
20 |
21 | 
22 |
23 | Comparison:
24 |
25 | | | Data Size | Layers | Parameters |
26 | | ---- | --------- | ------ | ---------- |
27 | | GPT | 5GB | 12 | 0.117B |
28 | | GPT2 | 40GB | 48 | 1.542B |
29 | | GPT3 | 540GB | 96 | 175B |
30 |
31 |
32 |
33 | 
34 |
35 | - Prompt: with a natural language task description
36 |
37 |
38 |
39 | 
40 |
41 | ---
42 |
43 | **Limitations of GPT3**
44 |
45 | - GPT-3 samples still sometimes repeat themselves semantically at the document level, start to lose coherence over sufficiently long passages, contradict themselves, and occasionally contain non-sequitur sentences or paragraphs.
46 | - We have noticed informally that GPT-3 seems to have special difficulty with “common sense physics”. Specifically GPT-3 has difficulty with questions of the type “If I put cheese into the fridge, will it melt?”.
47 | - Making a bidirectional model at the scale of GPT-3, and/or trying to make bidirectional models work with few- or zero-shot learning, is a promising direction for future research, and could help achieve the “best of both worlds”.
48 | - GPT-3 still sees much more text during pre-training than a human sees in their lifetime. Improving pre-training sample efficiency is an important direction for future work, and might come from grounding in the physical world to provide additional information, or from algorithmic improvements.
49 |
50 |
--------------------------------------------------------------------------------
/nlp/RoBERTa.md:
--------------------------------------------------------------------------------
1 | # RoBERTa: A Robustly Optimized BERT Pretraining Approach (2019), Yinhan Liu et al.
2 |
3 | ###### contributors: [@GitYCC](https://github.com/GitYCC)
4 |
5 | \[[paper](https://arxiv.org/pdf/1907.11692.pdf)\] \[[pytorch](https://github.com/huggingface/transformers/blob/master/src/transformers/models/roberta/modeling_roberta.py)\]
6 |
7 | ---
8 |
9 | - Our modifications are simple, they include: (1) training the model longer, with bigger batches, over more data; (2) removing the next sentence prediction objective; (3) training on longer sequences; and (4) dynamically changing the masking pattern applied to the training data.
10 | - Analysis: Model Input Format and Next Sentence Prediction
11 | - SEGMENT-PAIR + NSP: This follows the original input format used in BERT (Devlin et al., 2019), with the NSP loss. Each input has a pair of segments, which can each contain multiple natural sentences.
12 | - SENTENCE-PAIR + NSP:
13 | - Each input contains a pair of natural sentences.
14 | - Since these inputs are significantly shorter than 512 tokens, we increase the batch size so that the total number of tokens remains similar to SEGMENT-PAIR+NSP.
15 | - FULL-SENTENCES:
16 | - NO NSP
17 | - Inputs may cross document boundaries. When we reach the end of one document, we begin sampling sentences from the next document and add an extra separator token `[SEP]` between documents.
18 | - DOC-SENTENCES:
19 | - NO NSP
20 | - not cross document boundaries
21 | - dynamically increase the batch size in these cases to achieve a similar number of total tokens as FULL-SENTENCES
22 | - 
23 | - Comparing SEGMENT-PAIR + NSP and SENTENCE-PAIR + NSP, we find that using individual sentences hurts performance on downstream tasks.
24 | - Removing the NSP loss matches or slightly improves downstream task performance
25 | - Analysis: Training with large batches
26 | - Past work in Neural Machine Translation has shown that training with very large mini-batches can both improve optimization speed and end-task performance when the learning rate is increased appropriately (Ott et al., 2018). Recent work has shown that BERT is also amenable to large batch training (You et al., 2019).
27 | - 
28 | - Text Encoding: Byte-Pair Encoding (BPE) (Sennrich et al., 2016) is a hybrid between character- and word-level representations that allows handling the large vocabularies common in natural language corpora.
29 | - We now aggregate these improvements and evaluate their combined impact. We call this configuration RoBERTa for **R**obustly **o**ptimized **BERT** **a**pproach. Specifically, RoBERTa is trained with
30 | - dynamic masking
31 | - FULL-SENTENCES without NSP loss
32 | - large mini-batches
33 | - a larger byte-level BPE
34 |
35 |
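A minimal sketch contrasting static masking (mask sampled once during preprocessing) with dynamic masking (mask re-sampled every time a sequence is fed to the model); the toy whole-word masking below only illustrates the sampling difference, not RoBERTa's BPE-level masking with random/keep replacements:

```python
import random

MASK = "[MASK]"

def mask_tokens(tokens, prob=0.15, seed=None):
    rng = random.Random(seed)
    return [MASK if rng.random() < prob else t for t in tokens]

tokens = "the quick brown fox jumps over the lazy dog".split()

# Static masking: the mask pattern is fixed at preprocessing time,
# so every epoch sees the same masked sequence.
static = mask_tokens(tokens, seed=0)
for epoch in range(3):
    print("static :", static)

# Dynamic masking: a fresh mask pattern is sampled on every pass,
# so the model sees different masked versions across epochs.
for epoch in range(3):
    print("dynamic:", mask_tokens(tokens))
```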
--------------------------------------------------------------------------------
/nlp/a-fast-and-accurate-dependency-parser-using-nural-networks.md:
--------------------------------------------------------------------------------
1 | # A Fast and Accurate Dependency Parser using Neural Networks (2014), Chen and Manning.
2 |
3 | ###### contributors: [@GitYCC](https://github.com/GitYCC)
4 |
5 | \[[paper](https://nlp.stanford.edu/pubs/emnlp2014-depparser.pdf)\]
6 |
7 | ---
8 |
9 | **TL;DR**
10 |
11 | In this paper, we train a neural network classifier to make parsing decisions within a transition-based dependency parser. The neural network learns compact dense vector representations of words, part-of-speech (POS) tags, and dependency labels.
12 |
13 | **Transition-based Dependency Parsing**
14 |
15 | 
16 |
17 | - In the arc-standard system, a configuration $c = (s,b,A)$ consists of a stack $s$, a buffer $b$, and a set of dependency arcs $A$.
18 | - Actions are determined from a configuration $c = (s,b,A)$:
19 | - LEFT-ARC($l$): adds an arc $s_1 → s_2$ with label $l$ and removes $s_2$ from the stack. Precondition: $|s| ≥ 2$.
20 | - RIGHT-ARC($l$): adds an arc $s_2 → s_1$ with label $l$ and removes $s_1$ from the stack. Precondition: $|s| ≥ 2$.
21 | - SHIFT: moves $b_1$ from the buffer to the stack. Precondition: $|b| ≥ 1$.
22 |
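A minimal sketch of the arc-standard transition system acting on a configuration $(s, b, A)$; the sentence and the gold action sequence below are hard-coded for illustration (the paper's neural classifier would predict each action instead):

```python
def left_arc(stack, buffer, arcs, label):
    s1, s2 = stack[-1], stack[-2]
    arcs.append((s1, label, s2))        # add arc s1 -> s2
    del stack[-2]                       # remove s2 from the stack

def right_arc(stack, buffer, arcs, label):
    s1, s2 = stack[-1], stack[-2]
    arcs.append((s2, label, s1))        # add arc s2 -> s1
    stack.pop()                         # remove s1 from the stack

def shift(stack, buffer, arcs):
    stack.append(buffer.pop(0))         # move b1 from the buffer to the stack

stack, buffer, arcs = ["ROOT"], ["He", "has", "good", "control", "."], []
for action, label in [("SHIFT", None), ("SHIFT", None), ("LEFT", "nsubj"),
                      ("SHIFT", None), ("SHIFT", None), ("LEFT", "amod"),
                      ("RIGHT", "dobj"), ("SHIFT", None), ("RIGHT", "punct"),
                      ("RIGHT", "root")]:
    {"SHIFT": shift,
     "LEFT": lambda s, b, a: left_arc(s, b, a, label),
     "RIGHT": lambda s, b, a: right_arc(s, b, a, label)}[action](stack, buffer, arcs)
print(arcs)  # [('has', 'nsubj', 'He'), ('control', 'amod', 'good'), ('has', 'dobj', 'control'), ...]
```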
23 | - The sparse indicator features of conventional transition-based parsers suffer from the following problems:
24 | - Sparsity
25 | - Incompleteness: Because even with expertise and manual handling involved, they still do not include the conjunction of every useful word combination.
26 | - Expensive feature computation
27 |
28 | **Neural Network Based Parser**
29 |
30 | 
31 |
32 | - We choose a set of elements based on the stack / buffer positions for each type of information (word, POS or label), which might be useful for our predictions.
33 | - POS and label embeddings
34 | - Cube activation function: Using $g(x) = x^3$ can model the product terms of $x_i$, $x_j$, $x_k$ for any three different elements at the input layer directly. In our case, $x_i$, $x_j$, $x_k$ could come from different dimensions of three embeddings.
35 | - The choice of $S_w$ (words), $S_t$ (POS tags), $S_l$ (arc labels)
36 | - Following (Zhang and Nivre, 2011), we pick a rich set of elements for our final parser. In detail, $S_w$ contains $n_w = 18$ elements: (1) The top 3 words on the stack and buffer: $s_1, s_2, s_3, b_1, b_2, b_3$; (2) The first and second leftmost / rightmost children of the top two words on the stack: $lc_1(s_i), rc_1(s_i), lc_2(s_i), rc_2(s_i)$, $i = 1, 2$. (3) The leftmost of leftmost / rightmost of rightmost children of the top two words on the stack: $lc_1(lc_1(s_i)), rc_1(rc_1(s_i))$, $i = 1, 2$.
37 | - We use the corresponding POS tags for $S_t$ ($n_t = 18$), and the corresponding arc labels of words excluding those 6 words on the stack/buffer for $S_l$ ($n_l = 12$). A good advantage of our parser is that we can add a rich set of elements cheaply, instead of hand-crafting many more indicator features.
38 | - prediction target: dependents and their labels
39 |
40 | **results**
41 |
42 | 
43 |
44 | - UAS (unlabeled attachment score): # of correct dependents / # of dependents
45 | - LAS (labeled attachment score): # of correct dependents with labels / # of dependents with labels
46 |
47 |
--------------------------------------------------------------------------------
/nlp/assets/ALBERT_01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/ALBERT_01.png
--------------------------------------------------------------------------------
/nlp/assets/ALBERT_02.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/ALBERT_02.png
--------------------------------------------------------------------------------
/nlp/assets/ALBERT_03.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/ALBERT_03.png
--------------------------------------------------------------------------------
/nlp/assets/ALBERT_04.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/ALBERT_04.png
--------------------------------------------------------------------------------
/nlp/assets/ALBERT_05.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/ALBERT_05.png
--------------------------------------------------------------------------------
/nlp/assets/ALBERT_06.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/ALBERT_06.png
--------------------------------------------------------------------------------
/nlp/assets/BiDAF_01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/BiDAF_01.png
--------------------------------------------------------------------------------
/nlp/assets/CNN-for-sentence-classification_01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/CNN-for-sentence-classification_01.png
--------------------------------------------------------------------------------
/nlp/assets/CNN-for-sentence-classification_02.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/CNN-for-sentence-classification_02.png
--------------------------------------------------------------------------------
/nlp/assets/CNN-for-sentence-classification_03.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/CNN-for-sentence-classification_03.png
--------------------------------------------------------------------------------
/nlp/assets/GPT2_01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/GPT2_01.png
--------------------------------------------------------------------------------
/nlp/assets/GPT2_02.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/GPT2_02.png
--------------------------------------------------------------------------------
/nlp/assets/GPT2_03.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/GPT2_03.png
--------------------------------------------------------------------------------
/nlp/assets/GPT3_01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/GPT3_01.png
--------------------------------------------------------------------------------
/nlp/assets/GPT3_02.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/GPT3_02.png
--------------------------------------------------------------------------------
/nlp/assets/GPT3_03.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/GPT3_03.png
--------------------------------------------------------------------------------
/nlp/assets/GPT3_04.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/GPT3_04.png
--------------------------------------------------------------------------------
/nlp/assets/GPT_01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/GPT_01.png
--------------------------------------------------------------------------------
/nlp/assets/GPT_02.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/GPT_02.png
--------------------------------------------------------------------------------
/nlp/assets/GPT_03.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/GPT_03.png
--------------------------------------------------------------------------------
/nlp/assets/GPT_04.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/GPT_04.png
--------------------------------------------------------------------------------
/nlp/assets/QANet_01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/QANet_01.png
--------------------------------------------------------------------------------
/nlp/assets/QANet_02.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/QANet_02.png
--------------------------------------------------------------------------------
/nlp/assets/QANet_03.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/QANet_03.png
--------------------------------------------------------------------------------
/nlp/assets/QANet_04.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/QANet_04.png
--------------------------------------------------------------------------------
/nlp/assets/RoBERTa_01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/RoBERTa_01.png
--------------------------------------------------------------------------------
/nlp/assets/RoBERTa_02.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/RoBERTa_02.png
--------------------------------------------------------------------------------
/nlp/assets/T5_01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/T5_01.png
--------------------------------------------------------------------------------
/nlp/assets/T5_02.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/T5_02.png
--------------------------------------------------------------------------------
/nlp/assets/T5_03.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/T5_03.png
--------------------------------------------------------------------------------
/nlp/assets/T5_04.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/T5_04.png
--------------------------------------------------------------------------------
/nlp/assets/T5_05.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/T5_05.png
--------------------------------------------------------------------------------
/nlp/assets/T5_06.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/T5_06.png
--------------------------------------------------------------------------------
/nlp/assets/T5_07.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/T5_07.png
--------------------------------------------------------------------------------
/nlp/assets/T5_08.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/T5_08.png
--------------------------------------------------------------------------------
/nlp/assets/T5_09.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/T5_09.png
--------------------------------------------------------------------------------
/nlp/assets/T5_10.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/T5_10.png
--------------------------------------------------------------------------------
/nlp/assets/T5_11.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/T5_11.png
--------------------------------------------------------------------------------
/nlp/assets/T5_12.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/T5_12.png
--------------------------------------------------------------------------------
/nlp/assets/T5_13.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/T5_13.png
--------------------------------------------------------------------------------
/nlp/assets/T5_14.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/T5_14.png
--------------------------------------------------------------------------------
/nlp/assets/T5_15.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/T5_15.png
--------------------------------------------------------------------------------
/nlp/assets/Transformer-XL_01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/Transformer-XL_01.png
--------------------------------------------------------------------------------
/nlp/assets/Transformer-XL_02.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/Transformer-XL_02.png
--------------------------------------------------------------------------------
/nlp/assets/Transformer-XL_03.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/Transformer-XL_03.png
--------------------------------------------------------------------------------
/nlp/assets/Transformer-XL_04.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/Transformer-XL_04.png
--------------------------------------------------------------------------------
/nlp/assets/XLNet_01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/XLNet_01.png
--------------------------------------------------------------------------------
/nlp/assets/XLNet_02.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/XLNet_02.png
--------------------------------------------------------------------------------
/nlp/assets/XLNet_03.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/XLNet_03.png
--------------------------------------------------------------------------------
/nlp/assets/XLNet_04.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/XLNet_04.png
--------------------------------------------------------------------------------
/nlp/assets/XLNet_05.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/XLNet_05.png
--------------------------------------------------------------------------------
/nlp/assets/XLNet_06.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/XLNet_06.png
--------------------------------------------------------------------------------
/nlp/assets/XLNet_07.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/XLNet_07.png
--------------------------------------------------------------------------------
/nlp/assets/XLNet_08.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/XLNet_08.png
--------------------------------------------------------------------------------
/nlp/assets/XLNet_09.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/XLNet_09.png
--------------------------------------------------------------------------------
/nlp/assets/a-fast-and-accurate-dependency-parser-using-nural-networks_01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/a-fast-and-accurate-dependency-parser-using-nural-networks_01.png
--------------------------------------------------------------------------------
/nlp/assets/a-fast-and-accurate-dependency-parser-using-nural-networks_02.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/a-fast-and-accurate-dependency-parser-using-nural-networks_02.png
--------------------------------------------------------------------------------
/nlp/assets/a-fast-and-accurate-dependency-parser-using-nural-networks_03.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/a-fast-and-accurate-dependency-parser-using-nural-networks_03.png
--------------------------------------------------------------------------------
/nlp/assets/attention-based-recurrent-neural-network-models-for-joint-intent-detection-and-slot-filling_01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/attention-based-recurrent-neural-network-models-for-joint-intent-detection-and-slot-filling_01.png
--------------------------------------------------------------------------------
/nlp/assets/attention-based-recurrent-neural-network-models-for-joint-intent-detection-and-slot-filling_02.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/attention-based-recurrent-neural-network-models-for-joint-intent-detection-and-slot-filling_02.png
--------------------------------------------------------------------------------
/nlp/assets/attention-based-recurrent-neural-network-models-for-joint-intent-detection-and-slot-filling_03.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/attention-based-recurrent-neural-network-models-for-joint-intent-detection-and-slot-filling_03.png
--------------------------------------------------------------------------------
/nlp/assets/attention-based-recurrent-neural-network-models-for-joint-intent-detection-and-slot-filling_04.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/attention-based-recurrent-neural-network-models-for-joint-intent-detection-and-slot-filling_04.png
--------------------------------------------------------------------------------
/nlp/assets/attention-based-recurrent-neural-network-models-for-joint-intent-detection-and-slot-filling_05.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/attention-based-recurrent-neural-network-models-for-joint-intent-detection-and-slot-filling_05.png
--------------------------------------------------------------------------------
/nlp/assets/attention-based-recurrent-neural-network-models-for-joint-intent-detection-and-slot-filling_06.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/attention-based-recurrent-neural-network-models-for-joint-intent-detection-and-slot-filling_06.png
--------------------------------------------------------------------------------
/nlp/assets/attention-based-recurrent-neural-network-models-for-joint-intent-detection-and-slot-filling_07.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/attention-based-recurrent-neural-network-models-for-joint-intent-detection-and-slot-filling_07.png
--------------------------------------------------------------------------------
/nlp/assets/attention-is-all-you-need_01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/attention-is-all-you-need_01.png
--------------------------------------------------------------------------------
/nlp/assets/attention-is-all-you-need_02.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/attention-is-all-you-need_02.png
--------------------------------------------------------------------------------
/nlp/assets/attention-is-all-you-need_03.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/attention-is-all-you-need_03.png
--------------------------------------------------------------------------------
/nlp/assets/attention-is-all-you-need_04.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/attention-is-all-you-need_04.png
--------------------------------------------------------------------------------
/nlp/assets/attention-is-all-you-need_05.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/attention-is-all-you-need_05.png
--------------------------------------------------------------------------------
/nlp/assets/bert_01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/bert_01.png
--------------------------------------------------------------------------------
/nlp/assets/bert_02.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/bert_02.png
--------------------------------------------------------------------------------
/nlp/assets/bert_03.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/bert_03.png
--------------------------------------------------------------------------------
/nlp/assets/bert_04.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/bert_04.png
--------------------------------------------------------------------------------
/nlp/assets/bert_05.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/bert_05.png
--------------------------------------------------------------------------------
/nlp/assets/bert_06.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/bert_06.png
--------------------------------------------------------------------------------
/nlp/assets/bert_07.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/bert_07.png
--------------------------------------------------------------------------------
/nlp/assets/bert_08.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/bert_08.png
--------------------------------------------------------------------------------
/nlp/assets/bert_09.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/bert_09.png
--------------------------------------------------------------------------------
/nlp/assets/bert_13.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/bert_13.png
--------------------------------------------------------------------------------
/nlp/assets/distributed-representations-of-words-and-phrases-and-their-compositionality_01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/distributed-representations-of-words-and-phrases-and-their-compositionality_01.png
--------------------------------------------------------------------------------
/nlp/assets/distributed-representations-of-words-and-phrases-and-their-compositionality_02.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/distributed-representations-of-words-and-phrases-and-their-compositionality_02.png
--------------------------------------------------------------------------------
/nlp/assets/doc2vec_01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/doc2vec_01.png
--------------------------------------------------------------------------------
/nlp/assets/doc2vec_02.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/doc2vec_02.png
--------------------------------------------------------------------------------
/nlp/assets/efficient-estimation-of-word-representations-in-vector-space_01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/efficient-estimation-of-word-representations-in-vector-space_01.png
--------------------------------------------------------------------------------
/nlp/assets/efficient-estimation-of-word-representations-in-vector-space_02.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/efficient-estimation-of-word-representations-in-vector-space_02.png
--------------------------------------------------------------------------------
/nlp/assets/efficient-estimation-of-word-representations-in-vector-space_03.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/efficient-estimation-of-word-representations-in-vector-space_03.png
--------------------------------------------------------------------------------
/nlp/assets/efficient-estimation-of-word-representations-in-vector-space_04.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/efficient-estimation-of-word-representations-in-vector-space_04.png
--------------------------------------------------------------------------------
/nlp/assets/efficient-estimation-of-word-representations-in-vector-space_05.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/efficient-estimation-of-word-representations-in-vector-space_05.png
--------------------------------------------------------------------------------
/nlp/assets/efficient-estimation-of-word-representations-in-vector-space_06.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/efficient-estimation-of-word-representations-in-vector-space_06.png
--------------------------------------------------------------------------------
/nlp/assets/efficient-transformers-a-survey_01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/efficient-transformers-a-survey_01.png
--------------------------------------------------------------------------------
/nlp/assets/efficient-transformers-a-survey_02.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/efficient-transformers-a-survey_02.png
--------------------------------------------------------------------------------
/nlp/assets/efficient-transformers-a-survey_03.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/efficient-transformers-a-survey_03.png
--------------------------------------------------------------------------------
/nlp/assets/efficient-transformers-a-survey_04.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/efficient-transformers-a-survey_04.png
--------------------------------------------------------------------------------
/nlp/assets/efficient-transformers-a-survey_05.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/efficient-transformers-a-survey_05.png
--------------------------------------------------------------------------------
/nlp/assets/efficient-transformers-a-survey_06.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/efficient-transformers-a-survey_06.png
--------------------------------------------------------------------------------
/nlp/assets/efficient-transformers-a-survey_07.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/efficient-transformers-a-survey_07.png
--------------------------------------------------------------------------------
/nlp/assets/efficient-transformers-a-survey_08.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/efficient-transformers-a-survey_08.png
--------------------------------------------------------------------------------
/nlp/assets/efficient-transformers-a-survey_09.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/efficient-transformers-a-survey_09.png
--------------------------------------------------------------------------------
/nlp/assets/efficient-transformers-a-survey_10.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/efficient-transformers-a-survey_10.png
--------------------------------------------------------------------------------
/nlp/assets/efficient-transformers-a-survey_11.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/efficient-transformers-a-survey_11.png
--------------------------------------------------------------------------------
/nlp/assets/efficient-transformers-a-survey_12.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/efficient-transformers-a-survey_12.png
--------------------------------------------------------------------------------
/nlp/assets/efficient-transformers-a-survey_13.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/efficient-transformers-a-survey_13.png
--------------------------------------------------------------------------------
/nlp/assets/efficient-transformers-a-survey_14.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/efficient-transformers-a-survey_14.png
--------------------------------------------------------------------------------
/nlp/assets/efficient-transformers-a-survey_15.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/efficient-transformers-a-survey_15.png
--------------------------------------------------------------------------------
/nlp/assets/efficient-transformers-a-survey_16.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/efficient-transformers-a-survey_16.png
--------------------------------------------------------------------------------
/nlp/assets/efficient-transformers-a-survey_17.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/efficient-transformers-a-survey_17.png
--------------------------------------------------------------------------------
/nlp/assets/elmo_01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/elmo_01.png
--------------------------------------------------------------------------------
/nlp/assets/elmo_02.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/elmo_02.png
--------------------------------------------------------------------------------
/nlp/assets/elmo_03.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/elmo_03.png
--------------------------------------------------------------------------------
/nlp/assets/elmo_04.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/elmo_04.png
--------------------------------------------------------------------------------
/nlp/assets/glove_01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/glove_01.png
--------------------------------------------------------------------------------
/nlp/assets/glove_02.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/glove_02.png
--------------------------------------------------------------------------------
/nlp/assets/glove_03.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/glove_03.png
--------------------------------------------------------------------------------
/nlp/assets/glove_04.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/glove_04.png
--------------------------------------------------------------------------------
/nlp/assets/neural-machine-translation-by-jointly-learning-to-align-and-translate_01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/neural-machine-translation-by-jointly-learning-to-align-and-translate_01.png
--------------------------------------------------------------------------------
/nlp/assets/neural-machine-translation-by-jointly-learning-to-align-and-translate_02.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/neural-machine-translation-by-jointly-learning-to-align-and-translate_02.png
--------------------------------------------------------------------------------
/nlp/assets/neural-machine-translation-by-jointly-learning-to-align-and-translate_03.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/neural-machine-translation-by-jointly-learning-to-align-and-translate_03.png
--------------------------------------------------------------------------------
/nlp/assets/sequence-to-sequence-learning-with-neural-networks_01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/sequence-to-sequence-learning-with-neural-networks_01.png
--------------------------------------------------------------------------------
/nlp/assets/sequence-to-sequence-learning-with-neural-networks_02.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/nlp/assets/sequence-to-sequence-learning-with-neural-networks_02.png
--------------------------------------------------------------------------------
/nlp/doc2vec.md:
--------------------------------------------------------------------------------
1 | # Distributed Representations of Sentences and Documents (2014), Q. Le and T. Mikolov
2 |
3 | ###### contributors: [@GitYCC](https://github.com/GitYCC)
4 |
5 | \[[paper](https://arxiv.org/abs/1405.4053)\]
6 |
7 | ---
8 |
9 | ### Introduction
10 |
11 | - known as **doc2vec**
12 | - bag-of-words features have two major weaknesses: they lose the ordering of the words and they also ignore semantics of the words
13 | - Empirical results show that Paragraph Vectors outperform bag-of-words models as well as other techniques for text representations.
14 | - Unlike some of the previous approaches, it is general and applicable to texts of any length: sentences, paragraphs, and documents. It does not require task-specific tuning of the word weighting function nor does it rely on the parse trees.
15 |
16 |
17 |
18 | ### Distributed Memory Model of Paragraph Vectors (PV-DM)
19 |
20 | 
21 |
22 | - Training stage: learn word vectors $W$, softmax weights $U$, $b$ and paragraph vectors $D$ on already-seen paragraphs, $y=b+Uh(x_{t-k},...,x_{t+k};para\_id;W;D)$
23 | - Inference stage: obtain paragraph vectors $D$ for new paragraphs (never seen before) by adding more columns to $D$ and running gradient descent on $D$ while holding $W$, $U$, $b$ fixed. We then use $D$ to predict particular labels with a standard classifier, e.g., logistic regression.
24 |
25 |
26 |
27 | ### Distributed Bag of Words version of Paragraph Vector (PV-DBOW)
28 |
29 | 
30 |
31 | - Training stage: learn softmax weights $U$, $b$ and paragraph vectors $D$ on already-seen paragraphs, $y=b+Uh(para\_id;D)$
32 | - Inference stage: obtain paragraph vectors $D$ for new paragraphs (never seen before) by adding more columns to $D$ and running gradient descent on $D$ while holding $U$, $b$ fixed. A minimal gensim-based sketch of both variants follows below.
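33 | 
34 | Below is a minimal sketch of both variants using gensim's `Doc2Vec` (a third-party implementation, not the authors' code); the toy corpus and hyperparameters are purely illustrative.
35 | 
36 | ```python
37 | from gensim.models.doc2vec import Doc2Vec, TaggedDocument
38 | 
39 | corpus = [
40 |     TaggedDocument(words=["the", "movie", "was", "great"], tags=[0]),
41 |     TaggedDocument(words=["the", "plot", "was", "boring"], tags=[1]),
42 | ]
43 | 
44 | # dm=1 -> PV-DM (predict a word from context words + paragraph vector)
45 | # dm=0 -> PV-DBOW (predict words of the paragraph from its vector alone)
46 | model = Doc2Vec(corpus, vector_size=50, window=2, min_count=1, epochs=40, dm=1)
47 | 
48 | # Training stage: W, U, b and the paragraph vectors D of the seen paragraphs
49 | # are learned jointly; model.dv stores D.
50 | seen_vec = model.dv[0]
51 | 
52 | # Inference stage: a new column of D is optimized by gradient descent while
53 | # W, U, b stay fixed.
54 | new_vec = model.infer_vector(["an", "unseen", "paragraph"], epochs=40)
55 | print(seen_vec.shape, new_vec.shape)
56 | ```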
--------------------------------------------------------------------------------
/nlp/dr-qa.md:
--------------------------------------------------------------------------------
1 | # Reading Wikipedia to Answer Open-Domain Questions (2017), Danqi Chen et al.
2 |
3 | ###### contributors: [@GitYCC](https://github.com/GitYCC)
4 |
5 | \[[paper](https://arxiv.org/abs/1704.00051)\]
6 |
7 | ---
8 |
9 | ### Introduction
10 |
11 | - Problem: answer to any factoid question from a text span in a Wikipedia article
12 | - Our approach combines a search component based on bigram hashing and TF-IDF matching with a multi-layer recurrent neural network model trained to detect answers in Wikipedia paragraphs.
13 | - In order to answer any question, one must first retrieve the few relevant articles among more than 5 million items, and then scan them carefully to identify the answer.
14 | - We develop DrQA, a strong system for question answering from Wikipedia composed of:
15 | - (1) Document Retriever, a module using bigram hashing and TF-IDF matching designed to, given a question, efficiently return a subset of relevant articles
16 | - Our experiments show that Document Retriever outperforms the built-in Wikipedia search engine and that Document Reader reaches state-of-the-art results on the very competitive SQuAD benchmark
17 | - (2) Document Reader, a multi-layer recurrent neural network machine comprehension model trained to detect answer spans in those few returned documents.
18 |
19 |
20 |
21 | ### Our System: DrQA
22 |
23 | - Document Retriever
24 | - Articles and questions are compared as TF-IDF weighted bag-of-words vectors.
25 | - We further improve our system by taking local word order into account with n-gram features.
26 | - Our best performing system uses bigram counts while preserving speed and memory efficiency by using the hashing of (Weinberger et al., 2009) to map the bigrams to $2^{24}$ bins with an unsigned `murmur3` hash.
27 | - Document Reader
28 | - Given a question $q$ consisting of $l$ tokens $\{q_1,...,q_l\}$ and a document or a small set of documents of $n$ paragraphs where a single paragraph $p$ consists of $m$ tokens $\{p_1, \dots , p_m\}$, we develop an RNN model that we apply to each paragraph in turn and then finally aggregate the predicted answers.
29 | - Question encoding:
30 | - apply BiLSTM to $\{q_1,...,q_l\}$ 's word embeddings
31 | - 300-dimensional GloVe
32 | - We keep most of the pre-trained word embeddings fixed and only fine-tune the 1000 most frequent question words because the representations of some key words such as what, how, which, many could be crucial for QA systems.
33 | - and combine the resulting hidden units into one single vector: $q^*=\sum_jb_j\hat{q}_j$ (where $b_j=\exp(w\cdot \hat{q}_j)/\sum_k\exp(w\cdot \hat{q}_k)$ and $w$ is a weight vector to learn)
34 | - Paragraph encoding
35 | - apply multi-layer BiLSTM to the feature vector $\tilde{p}_i$. The feature vector $\tilde{p}_i$ is comprised of the following parts:
36 | - Word Embeddings
37 | - 300-dimensional GloVe
38 | - We keep most of the pre-trained word embeddings fixed and only fine-tune the 1000 most frequent question words because the representations of some key words such as what, how, which, many could be crucial for QA systems.
39 | - Exact Match: indicating whether $p_i$ can be exactly matched to one question word in $q$, either in its original, lowercase or lemma form
40 | - Token Features: include its part-of-speech (POS) and named entity recognition (NER) tags and its (normalized) term frequency (TF)
41 | - Aligned Question Embedding: soft alignments between similar but non-identical words
42 | - $f_{align}(p_i)=\sum_ja_{i,j}E(q_j)$
43 | - $E(q_j)$: Word Embeddings
44 | - $a_{i,j}=exp(\alpha(E(p_i))\cdot \alpha(E(q_j)))/\sum_k exp(\alpha(E(p_i))\cdot \alpha(E(q_k)))$
45 | - where: $\alpha(.)$ is a single dense layer with ReLU nonlinearity
46 | - Prediction
47 | - $P_{start}(i)\propto exp(p^*_iW_sq^*)$ and $P_{end}(i)\propto exp(p^*_iW_eq^*)$
48 | - During prediction, we choose the best span from token $i$ to token $i'$ such that $i\le i'\le i + 15$ and $P_{start}(i)\times P_{end}(i')$ is maximized (a sketch of this rule is given at the end of this summary)
49 |
50 |
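51 | A minimal sketch of the span-selection rule only (not the full DrQA model); the function name and toy scores are invented for illustration:
52 | 
53 | ```python
54 | import numpy as np
55 | 
56 | def best_span(p_start, p_end, max_len=15):
57 |     """Pick (i, i') with i <= i' <= i + max_len maximizing P_start(i) * P_end(i')."""
58 |     best_score, best = -1.0, (0, 0)
59 |     for i in range(len(p_start)):
60 |         for j in range(i, min(i + max_len, len(p_end) - 1) + 1):
61 |             score = p_start[i] * p_end[j]
62 |             if score > best_score:
63 |                 best_score, best = score, (i, j)
64 |     return best, best_score
65 | 
66 | # toy start/end scores for a 6-token paragraph
67 | p_s = np.array([0.1, 0.6, 0.1, 0.1, 0.05, 0.05])
68 | p_e = np.array([0.05, 0.1, 0.6, 0.1, 0.1, 0.05])
69 | print(best_span(p_s, p_e))  # best span covers tokens 1..2
70 | ```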
--------------------------------------------------------------------------------
/nlp/neural-machine-translation-by-jointly-learning-to-align-and-translate.md:
--------------------------------------------------------------------------------
1 | # Neural Machine Translation by Jointly Learning to Align and Translate (2016), D. Bahdanau et al.
2 |
3 | ###### contributors: [@GitYCC](https://github.com/GitYCC)
4 |
5 | \[[paper](https://arxiv.org/abs/1409.0473)\] \[[pytorch](https://github.com/bentrevett/pytorch-seq2seq/blob/master/3%20-%20Neural%20Machine%20Translation%20by%20Jointly%20Learning%20to%20Align%20and%20Translate.ipynb)\]
6 |
7 | ---
8 |
9 | ### Prerequisite
10 |
11 | **Sequence to Sequence Learning with Neural Networks** (2014), I. Sutskever et al. \[➤ [summary](nlp/sequence-to-sequence-learning-with-neural-networks.md)\]
12 |
13 |
14 |
15 | ### Model
16 |
17 | - In this paper, we conjecture that the use of a fixed-length vector is a bottleneck in improving the performance of this basic encoder–decoder architecture, and propose to extend this by allowing a model to automatically (soft-)search for parts of a source sentence that are relevant to predicting a target word, without having to form these parts as a hard segment explicitly.
18 | - A potential issue with previous encoder–decoder approach is that a neural network needs to be able to compress all the necessary information of a source sentence into a fixed-length vector.
19 | - In order to address this issue, we introduce an extension to the encoder–decoder model which learns to align and translate jointly. Each time the proposed model generates a word in a translation, it (soft-)searches for a set of positions in a source sentence where the most relevant information is concentrated. The model then predicts a target word based on the context vectors associated with these source positions and all the previous generated target words.
20 | - Seq2Seq Encoder-decoder Framework
21 | - target: translate from $\bold{x}$ to $\bold{y}$
22 | - encoder:
23 | - encoder RNN with hidden state: $h_t=f(x_t,h_{t-1})$
24 | - decoder:
25 | - $c=q(\{h_1,h_2,\dots,h_T\})$
26 | - the joint probability: $p(\bold{y})=\prod_{t}p(y_t\mid \{y_1,\dots,y_{t-1}\},c)$
27 | - with RNN: $p(y_t\mid \{y_1,\dots,y_{t-1}\},c)=g(y_{t-1},s_t,c)$
28 | - approach from "Sequence to Sequence Learning with Neural Networks"
29 | - $f$ and $g$ are LSTMs
30 | - $q(\{h_1,h_2,\dots,h_T\})=h_T$
31 |
32 | - Our approach
33 | - 
34 | - $f$ is a bi-directional RNN (the paper uses GRU-like gated units)
35 | - $p(y_i\mid \{y_1,\dots,y_{i-1}\},\bold{x})=g(y_{i-1},s_i,c_i)$
36 | - $g$ is an RNN with the same gated units
37 | - $c_i=\sum_j\alpha_{ij}h_j$
38 | - attention weight: $\alpha_{ij}=\frac{\exp(e_{ij})}{\sum_k \exp(e_{ik})}$
39 | - where: $e_{ij}=a(s_{i-1},h_j)$ : We parametrize the alignment model $a$ as a feedforward neural network which is jointly trained with all the other components of the proposed system.
40 | - Intuitively, this implements a mechanism of attention in the decoder: the decoder decides which parts of the source sentence to pay attention to (a minimal sketch of this attention step is given at the end of this summary).
41 |
42 |
43 |
44 | ### Result
45 |
46 | 
47 |
48 | - We train two types of models. The first one is an RNN Encoder–Decoder (RNNencdec, Cho et al., 2014a), and the other is the proposed model, to which we refer as RNNsearch. We train each model twice: first with the sentences of length up to 30 words (RNNencdec-30, RNNsearch-30) and then with the sentences of length up to 50 words (RNNencdec-50, RNNsearch-50).
49 |
50 |
51 |
52 | 
--------------------------------------------------------------------------------
/nlp/sequence-to-sequence-learning-with-neural-networks.md:
--------------------------------------------------------------------------------
1 | # Sequence to Sequence Learning with Neural Networks (2014), I. Sutskever et al.
2 |
3 | ###### contributors: [@GitYCC](https://github.com/GitYCC)
4 |
5 | \[[paper](https://arxiv.org/abs/1409.3215)\] \[[keras](https://github.com/keras-team/keras/blob/master/examples/lstm_seq2seq.py)\] \[[pytorch](https://github.com/bentrevett/pytorch-seq2seq/blob/master/1%20-%20Sequence%20to%20Sequence%20Learning%20with%20Neural%20Networks.ipynb)\]
6 |
7 | ---
8 |
9 | - First seq2seq on machine translation
10 | - Sequences pose a challenge for DNNs because they require that the dimensionality of the inputs and outputs is known and fixed. In this paper, we show that a straightforward application of the Long Short-Term Memory (LSTM) architecture can solve general sequence to sequence problems.
11 | - The idea is to use one LSTM to read the input sequence, one timestep at a time, to obtain a large fixed-dimensional vector representation, and then to use another LSTM to extract the output sequence from that vector (fig. 1). The second LSTM is essentially a recurrent neural network language model except that it is conditioned on the input sequence.
12 | - 
13 | - Our actual models differ from the above description in three important ways:
14 | 1. we used two different LSTMs: one for the input sequence and another for the output sequence, because doing so increases the number of model parameters at negligible computational cost and makes it natural to train the LSTM on multiple language pairs simultaneously.
15 | 2. we found that deep LSTMs significantly outperformed shallow LSTMs, so we chose an LSTM with 4 layers
16 | 3. we found it extremely valuable to reverse the order of the words of the input sentence
17 | - By reversing the words in the source sentence, the average distance between corresponding words in the source and target language is unchanged. However, the first few words in the source language are now very close to the first few words in the target language, so the problem’s minimal time lag is greatly reduced.
18 | - Initially, we believed that reversing the input sentences would only lead to more confident predictions in the early parts of the target sentence and to less confident predictions in the later parts. However, LSTMs trained on reversed source sentences did much better on long sentences than LSTMs trained on the raw source sentences, which suggests that reversing the input sentences results in LSTMs with better memory utilization.
19 | - It would be difficult to train the simple RNNs due to the resulting long term dependencies (figure 1). However, the Long Short-Term Memory (LSTM) is known to learn problems with long range temporal dependencies, so an LSTM may succeed in this setting.
20 | - The goal of the LSTM is to estimate the conditional probability $p(y_1, \dots , y_{T'} |x_1, \dots , x_T )$
21 | - training objective: $\frac{1}{\mid\mathcal{D}_{train}\mid}\sum_{(T,S)\in\mathcal{D}_{train}}\log p(T\mid S)$
22 | - inference: $T^*=argmax_T\ p(T\mid S)$ (with a simple left-to-right beam-search; a minimal encoder-decoder sketch is given at the end of this summary)
23 | - On the WMT’14 English to French translation task, we obtained a BLEU score of 34.81 by directly extracting translations from an ensemble of 5 deep LSTMs (with 384M parameters and 8,000 dimensional state each) using a simple left-to-right beam-search decoder. This is by far the best result achieved by direct translation with large neural networks. For comparison, the BLEU score of an SMT baseline on this dataset is 33.30.
24 | - But in our case we used a relatively unoptimized small vocabulary (only 80k words). Finally, we used the LSTM to rescore the publicly available 1000-best lists of the SMT baseline on the same task. By doing so, we obtained a BLEU score of 36.5, which improves the baseline by 3.2 BLEU points and is close to the previous best published result on this task (which is 37.0).
25 | - details of rescoring: we computed the log probability of every hypothesis with our LSTM and took an even average of the SMT baseline's score and the LSTM's score
26 | - 
--------------------------------------------------------------------------------
/optimization-training-techniques/AdamW.md:
--------------------------------------------------------------------------------
1 | # Decoupled Weight Decay Regularization (2019), Ilya Loshchilov et al.
2 |
3 | ###### contributors: [@GitYCC](https://github.com/GitYCC)
4 |
5 | \[[paper](https://arxiv.org/pdf/1711.05101.pdf)\]
6 |
7 | ---
8 |
9 | ### Abstract / Introduction
10 |
11 | - L2 regularization and weight decay regularization are equivalent for standard stochastic gradient descent, but as we demonstrate this is not the case for adaptive gradient algorithms, such as Adam.
12 | - We propose a simple modification to recover the original formulation of weight decay regularization by decoupling the weight decay from the optimization steps taken.
13 | - Our analysis of Adam leads to the following observations:
14 | - L2 regularization and weight decay are not identical
15 | - L2 regularization is not effective in Adam
16 | - Weight decay is equally effective in both SGD and Adam
17 | - The main contribution of this paper is to **improve regularization in Adam by decoupling the weight decay from the gradient-based update**
18 |
19 |
20 |
21 | ### Decoupling the Weight Decay from the Gradient-based Update
22 |
23 | - L2 regularization and weight decay regularization are equivalent for standard stochastic gradient descent.
24 | - L2 regularization: $f^{reg}(\theta)=f(\theta)+\frac{\lambda/\alpha}{2}||\theta||^2$
26 | - L2 regularization + gradient descent: $\theta_{t+1}=\theta_t-\alpha \nabla f^{reg}(\theta_t)=(1-\lambda)\theta_t-\alpha \nabla f(\theta_t)$
26 | - weight decay: $\theta_{t+1}=(1-\lambda)\theta_t-\alpha \nabla f(\theta_t)$
27 | - But this is not the case for adaptive gradient algorithms, such as Adam.
28 | - L2 regularization + Adam: $\theta_{t+1}=\theta_t-\alpha\frac{\beta_1m_{t-1}+(1-\beta_1)g_t}{\sqrt{\beta_2v_{t-1}+(1-\beta_2)g_t^2}+ε}$
29 | - where: $g_t=\nabla f_t(\theta_{t-1})+\lambda\theta_{t-1}$
30 |
31 | 
32 |
33 | 
34 |
35 | - The equivalence of L2 regularization and weight decay for standard SGD remains very helpful for intuition: both mechanisms push weights toward zero at the same rate.
36 | - However, for adaptive gradient algorithms they differ:
37 | - With L2 regularization, the sums of the gradient of the loss function and the gradient of the regularizer are adapted, whereas with decoupled weight decay, only the gradients of the loss function are adapted.
38 | - With L2 regularization both types of gradients are normalized by their typical (summed) magnitudes, and therefore weights $x$ with large typical gradient magnitude $s$ are regularized by a smaller relative amount than other weights.
39 | - In contrast, decoupled weight decay regularizes all weights with the same rate $λ$, effectively regularizing weights $x$ with large $s$ more than standard L2 regularization does (a one-step sketch of the two updates is given at the end of this summary).
40 |
41 |
42 |
43 | ### Experimental Validation
44 |
45 | - Following suggestions that adaptive gradient methods such as Adam might lead to worse generalization than SGD with momentum (Wilson et al., 2017), we identified and exposed the inequivalence of L2 regularization and weight decay for Adam. We empirically showed that our version of Adam with decoupled weight decay yields substantially better generalization performance than the common implementation of Adam with L2 regularization.
46 | - We also proposed to use warm restarts for Adam to improve its anytime performance.
47 |
48 | 
49 |
50 | 
51 |
52 | 
53 |
54 | 
55 |
56 |
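57 | A one-step sketch contrasting L2-regularized Adam with decoupled weight decay, following the simplified (bias-correction-free) formula above; all numbers are toy values:
58 | 
59 | ```python
60 | import numpy as np
61 | 
62 | def adam_step(theta, grad, m, v, alpha=1e-3, b1=0.9, b2=0.999, eps=1e-8,
63 |               l2=0.0, decoupled_wd=0.0):
64 |     if l2 > 0.0:
65 |         grad = grad + l2 * theta              # L2: penalty enters g_t and gets adapted
66 |     m = b1 * m + (1 - b1) * grad
67 |     v = b2 * v + (1 - b2) * grad ** 2
68 |     theta = theta - alpha * m / (np.sqrt(v) + eps)
69 |     if decoupled_wd > 0.0:
70 |         theta = theta - decoupled_wd * theta  # AdamW: decay applied outside the adaptive step
71 |     return theta, m, v
72 | 
73 | theta = np.array([1.0, -2.0])
74 | g = np.array([0.5, 0.1])
75 | m = v = np.zeros_like(theta)
76 | 
77 | print(adam_step(theta, g, m, v, l2=1e-2)[0])            # L2-regularized Adam
78 | print(adam_step(theta, g, m, v, decoupled_wd=1e-2)[0])  # decoupled weight decay
79 | ```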
--------------------------------------------------------------------------------
/optimization-training-techniques/assets/AMSGrad_01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/optimization-training-techniques/assets/AMSGrad_01.png
--------------------------------------------------------------------------------
/optimization-training-techniques/assets/AMSGrad_02.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/optimization-training-techniques/assets/AMSGrad_02.png
--------------------------------------------------------------------------------
/optimization-training-techniques/assets/AMSGrad_03.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/optimization-training-techniques/assets/AMSGrad_03.png
--------------------------------------------------------------------------------
/optimization-training-techniques/assets/AMSGrad_04.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/optimization-training-techniques/assets/AMSGrad_04.png
--------------------------------------------------------------------------------
/optimization-training-techniques/assets/AMSGrad_05.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/optimization-training-techniques/assets/AMSGrad_05.png
--------------------------------------------------------------------------------
/optimization-training-techniques/assets/AdamW_01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/optimization-training-techniques/assets/AdamW_01.png
--------------------------------------------------------------------------------
/optimization-training-techniques/assets/AdamW_02.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/optimization-training-techniques/assets/AdamW_02.png
--------------------------------------------------------------------------------
/optimization-training-techniques/assets/AdamW_03.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/optimization-training-techniques/assets/AdamW_03.png
--------------------------------------------------------------------------------
/optimization-training-techniques/assets/AdamW_04.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/optimization-training-techniques/assets/AdamW_04.png
--------------------------------------------------------------------------------
/optimization-training-techniques/assets/AdamW_05.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/optimization-training-techniques/assets/AdamW_05.png
--------------------------------------------------------------------------------
/optimization-training-techniques/assets/AdamW_06.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/optimization-training-techniques/assets/AdamW_06.png
--------------------------------------------------------------------------------
/optimization-training-techniques/assets/RAdam_01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/optimization-training-techniques/assets/RAdam_01.png
--------------------------------------------------------------------------------
/optimization-training-techniques/assets/RAdam_02.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/optimization-training-techniques/assets/RAdam_02.png
--------------------------------------------------------------------------------
/optimization-training-techniques/assets/RAdam_03.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/optimization-training-techniques/assets/RAdam_03.png
--------------------------------------------------------------------------------
/optimization-training-techniques/assets/RAdam_04.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/optimization-training-techniques/assets/RAdam_04.png
--------------------------------------------------------------------------------
/optimization-training-techniques/assets/RAdam_05.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/optimization-training-techniques/assets/RAdam_05.png
--------------------------------------------------------------------------------
/optimization-training-techniques/assets/RAdam_06.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/optimization-training-techniques/assets/RAdam_06.png
--------------------------------------------------------------------------------
/optimization-training-techniques/assets/RAdam_07.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/optimization-training-techniques/assets/RAdam_07.png
--------------------------------------------------------------------------------
/optimization-training-techniques/assets/RAdam_08.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/optimization-training-techniques/assets/RAdam_08.png
--------------------------------------------------------------------------------
/optimization-training-techniques/assets/RAdam_09.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/optimization-training-techniques/assets/RAdam_09.png
--------------------------------------------------------------------------------
/optimization-training-techniques/assets/RAdam_10.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/optimization-training-techniques/assets/RAdam_10.png
--------------------------------------------------------------------------------
/optimization-training-techniques/assets/RAdam_11.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/optimization-training-techniques/assets/RAdam_11.png
--------------------------------------------------------------------------------
/optimization-training-techniques/assets/RAdam_12.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/optimization-training-techniques/assets/RAdam_12.png
--------------------------------------------------------------------------------
/optimization-training-techniques/assets/RAdam_13.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/optimization-training-techniques/assets/RAdam_13.png
--------------------------------------------------------------------------------
/optimization-training-techniques/assets/an-overview-of-gradient-descent-optimization-algorithms_01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/optimization-training-techniques/assets/an-overview-of-gradient-descent-optimization-algorithms_01.png
--------------------------------------------------------------------------------
/optimization-training-techniques/assets/an-overview-of-gradient-descent-optimization-algorithms_02.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/optimization-training-techniques/assets/an-overview-of-gradient-descent-optimization-algorithms_02.png
--------------------------------------------------------------------------------
/optimization-training-techniques/assets/an-overview-of-gradient-descent-optimization-algorithms_03.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/optimization-training-techniques/assets/an-overview-of-gradient-descent-optimization-algorithms_03.png
--------------------------------------------------------------------------------
/optimization-training-techniques/assets/an-overview-of-gradient-descent-optimization-algorithms_04.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/optimization-training-techniques/assets/an-overview-of-gradient-descent-optimization-algorithms_04.png
--------------------------------------------------------------------------------
/optimization-training-techniques/assets/lars_01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/optimization-training-techniques/assets/lars_01.png
--------------------------------------------------------------------------------
/optimization-training-techniques/assets/lars_02.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/optimization-training-techniques/assets/lars_02.png
--------------------------------------------------------------------------------
/optimization-training-techniques/assets/lars_03.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/optimization-training-techniques/assets/lars_03.png
--------------------------------------------------------------------------------
/optimization-training-techniques/assets/lars_04.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/optimization-training-techniques/assets/lars_04.png
--------------------------------------------------------------------------------
/optimization-training-techniques/assets/lars_05.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/optimization-training-techniques/assets/lars_05.png
--------------------------------------------------------------------------------
/optimization-training-techniques/assets/lookahead_01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/optimization-training-techniques/assets/lookahead_01.png
--------------------------------------------------------------------------------
/optimization-training-techniques/assets/lookahead_02.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/optimization-training-techniques/assets/lookahead_02.png
--------------------------------------------------------------------------------
/optimization-training-techniques/assets/lookahead_03.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/optimization-training-techniques/assets/lookahead_03.png
--------------------------------------------------------------------------------
/optimization-training-techniques/assets/lookahead_04.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/optimization-training-techniques/assets/lookahead_04.png
--------------------------------------------------------------------------------
/optimization-training-techniques/assets/lookahead_05.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/optimization-training-techniques/assets/lookahead_05.png
--------------------------------------------------------------------------------
/optimization-training-techniques/assets/lookahead_06.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/optimization-training-techniques/assets/lookahead_06.png
--------------------------------------------------------------------------------
/optimization-training-techniques/assets/lookahead_07.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/optimization-training-techniques/assets/lookahead_07.png
--------------------------------------------------------------------------------
/optimization-training-techniques/lars.md:
--------------------------------------------------------------------------------
1 | # Large Batch Training of Convolutional Networks (2017), Yang You et al.
2 |
3 | ###### contributors: [@GitYCC](https://github.com/GitYCC)
4 |
5 | \[[paper](https://arxiv.org/pdf/1708.03888.pdf)\]
6 |
7 | ---
8 |
9 | - Problem: A common way to speed up training of large convolutional networks is to add computational units. With an increase in the number of nodes, the batch size grows. But training with a large batch size often results in lower model accuracy.
10 |
11 | - Increasing the global batch while keeping the same number of epochs means that you have fewer iterations to update weights
12 | - Another problem related to large batch training is the so-called "generalization gap", observed by Keskar et al. (2016). They came to the conclusion that "the lack of generalization ability is due to the fact that large-batch methods tend to converge to sharp minimizers of the training function." They tried a few methods to improve the generalization with data augmentation and warm-starting with a small batch, but they did not find a working solution.
13 |
14 | - Possible Solution: linear learning rate (LR) scaling with warm-up
15 |
16 | - linear LR scaling:
17 |
18 | - The weight updates for batch size $B$ after 2 iterations would be:
19 |
20 | 
21 |
22 | - The weight update for the batch $B_2 = 2 ∗ B$ with learning rate $λ_2$:
23 | 
24 | - will be similar if you take $λ_2 =2∗λ$, assuming that $∇L(x_j,w_{t+1})≈∇L(x_j,w_t)$.
25 |
26 | - LR warm-up: training starts with small LR, and then LR is gradually increased to the target. After the warm-up period (usually a few epochs), you switch to the regular LR policy ("multi-steps", polynomial decay etc).
27 |
28 | - We argue that the current recipe for large batch training (linear learning rate scaling with warm-up) is not general enough and training may diverge.
29 |
30 | - while increasing the learning rate (LR), networks may diverge
31 | - The scaling of Alexnet above 2K is difficult, since the training diverges for larger LRs
32 | - When LR $λ$ is large, the update $||λ ∗ ∇L(w_t)||$ can become larger than $||w||$, and this can cause the divergence.
33 | - We found that the ratio of the L2-norm of weights to the L2-norm of gradients, $||w||/||∇L(w_t)||$, varies significantly between weights and biases, and between different layers.
34 | - 
35 |
36 | - Layer-wise Adaptive Rate Scaling (LARS)
37 |
38 | - To analyze the training stability with large LRs we measured the ratio between the norm of the layer weights and the norm of the gradient update. We observed that if this ratio is too high, the training may become unstable. On the other hand, if the ratio is too small, then the weights don't change fast enough. This ratio varies a lot between different layers, which makes it necessary to use a separate LR for each layer. Thus we propose a novel Layer-wise Adaptive Rate Scaling (LARS) algorithm (a minimal sketch of the per-layer update is given at the end of this summary). There are two notable differences:
39 | - LARS uses a separate learning rate for each layer and not for each weight, which leads to better stability.
40 | - the magnitude of the update is controlled with respect to the weight norm for better control of training speed.
41 | - Using LARS, we scaled Alexnet up to a batch size of 8K, and Resnet-50 to a batch size of 32K without loss in accuracy.
42 | - 
43 | - Algorithm:
44 | - 
45 |
46 |
47 |
48 |
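49 | A minimal sketch of the per-layer LARS update (momentum omitted for brevity; `eta` is the trust coefficient, `gamma` the global LR; all values are illustrative):
50 | 
51 | ```python
52 | import numpy as np
53 | 
54 | def lars_update(weights, grads, gamma=0.1, eta=0.001, weight_decay=0.0):
55 |     """weights, grads: lists of per-layer arrays."""
56 |     new_weights = []
57 |     for w, g in zip(weights, grads):
58 |         # layer-wise local LR: proportional to ||w|| / ||grad|| for this layer
59 |         local_lr = eta * np.linalg.norm(w) / (
60 |             np.linalg.norm(g) + weight_decay * np.linalg.norm(w) + 1e-12)
61 |         new_weights.append(w - gamma * local_lr * (g + weight_decay * w))
62 |     return new_weights
63 | 
64 | layers = [np.random.randn(64, 32), np.random.randn(32)]            # toy weight and bias
65 | grads = [0.01 * np.random.randn(64, 32), 0.5 * np.random.randn(32)]
66 | updated = lars_update(layers, grads, weight_decay=5e-4)
67 | print([float(np.linalg.norm(w - u)) for w, u in zip(layers, updated)])
68 | ```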
--------------------------------------------------------------------------------
/optimization-training-techniques/lookahead.md:
--------------------------------------------------------------------------------
1 | # Lookahead Optimizer: k steps forward, 1 step back (2019), Michael R. Zhang et al.
2 |
3 | ###### contributors: [@GitYCC](https://github.com/GitYCC)
4 |
5 | \[[paper](https://arxiv.org/pdf/1907.08610.pdf)\]
6 |
7 | ---
8 |
9 | - Lookahead first updates the “fast weights” k times using any standard optimizer in its inner loop before updating the “slow weights” once in the direction of the final fast weights (a minimal sketch is given at the end of this summary).
10 | - 
11 | - Standard optimization methods typically require carefully tuned learning rates to prevent oscillation and slow convergence. Lookahead, however, benefits from a larger learning rate in the inner loop. When oscillating in the high curvature directions, the fast weights updates make rapid progress along the low curvature directions. The slow weights help smooth out the oscillations through the parameter interpolation. The combination of fast weights and slow weights improves learning in high curvature directions, reduces variance, and enables Lookahead to converge rapidly in practice.
12 | - Empirical analysis:
13 | - Robustness to the inner optimization algorithm, the number of fast-weight updates k, and the slow-weights learning rate α; therefore, it lessens the need for extensive hyperparameter tuning.
14 | - 
15 | - 
16 | - The slow weights step recovers the outer loop variance and restores the test accuracy.
17 | - 
18 | - Experiments
19 | - CIFAR
20 | - 
21 | - ImageNet
22 | - 
23 | - Penn Treebank and WMT-14 machine translation task
24 | - 
25 |
26 |
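27 | A minimal sketch of the Lookahead loop on a toy quadratic loss, with plain SGD as the inner ("fast") optimizer; `k`, `alpha` and the loss are illustrative choices:
28 | 
29 | ```python
30 | import numpy as np
31 | 
32 | def grad(theta):                       # gradient of f(theta) = 0.5 * ||theta||^2
33 |     return theta
34 | 
35 | def lookahead(theta0, k=5, alpha=0.5, inner_lr=0.1, outer_steps=20):
36 |     slow = theta0.copy()
37 |     for _ in range(outer_steps):
38 |         fast = slow.copy()
39 |         for _ in range(k):             # k fast-weight updates with the inner optimizer
40 |             fast -= inner_lr * grad(fast)
41 |         slow += alpha * (fast - slow)  # one slow-weight step toward the final fast weights
42 |     return slow
43 | 
44 | print(lookahead(np.array([5.0, -3.0])))  # approaches the minimum at the origin
45 | ```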
--------------------------------------------------------------------------------
/speech/assets/attention_mechanism.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/speech/assets/attention_mechanism.jpg
--------------------------------------------------------------------------------
/speech/assets/ctc_01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/speech/assets/ctc_01.png
--------------------------------------------------------------------------------
/speech/assets/ctc_02.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/speech/assets/ctc_02.png
--------------------------------------------------------------------------------
/speech/assets/ctc_03.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/speech/assets/ctc_03.png
--------------------------------------------------------------------------------
/speech/assets/ctc_04.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/speech/assets/ctc_04.png
--------------------------------------------------------------------------------
/speech/assets/ctc_05.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/speech/assets/ctc_05.png
--------------------------------------------------------------------------------
/speech/assets/ctc_06.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/speech/assets/ctc_06.png
--------------------------------------------------------------------------------
/speech/assets/ctc_07.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/speech/assets/ctc_07.jpeg
--------------------------------------------------------------------------------
/speech/assets/ctc_08.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/speech/assets/ctc_08.jpeg
--------------------------------------------------------------------------------
/speech/assets/ctc_09.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/speech/assets/ctc_09.png
--------------------------------------------------------------------------------
/speech/assets/ctc_10.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/speech/assets/ctc_10.png
--------------------------------------------------------------------------------
/speech/assets/ctc_11.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/speech/assets/ctc_11.png
--------------------------------------------------------------------------------
/speech/assets/ctc_12.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/speech/assets/ctc_12.png
--------------------------------------------------------------------------------
/speech/assets/ctc_13.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/speech/assets/ctc_13.png
--------------------------------------------------------------------------------
/speech/assets/ctc_14.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/speech/assets/ctc_14.png
--------------------------------------------------------------------------------
/speech/assets/ctc_15.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/speech/assets/ctc_15.png
--------------------------------------------------------------------------------
/speech/assets/ctc_16.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/speech/assets/ctc_16.png
--------------------------------------------------------------------------------
/speech/assets/tacotron2_01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/speech/assets/tacotron2_01.png
--------------------------------------------------------------------------------
/speech/assets/tacotron2_02.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/speech/assets/tacotron2_02.png
--------------------------------------------------------------------------------
/speech/assets/tacotron2_03.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/speech/assets/tacotron2_03.png
--------------------------------------------------------------------------------
/speech/assets/tacotron2_04.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/speech/assets/tacotron2_04.png
--------------------------------------------------------------------------------
/speech/assets/tacotron2_decoder.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/speech/assets/tacotron2_decoder.png
--------------------------------------------------------------------------------
/speech/assets/tacotron_01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/speech/assets/tacotron_01.png
--------------------------------------------------------------------------------
/speech/assets/tacotron_02.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/speech/assets/tacotron_02.png
--------------------------------------------------------------------------------
/speech/assets/tacotron_03.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/speech/assets/tacotron_03.png
--------------------------------------------------------------------------------
/speech/assets/tacotron_04.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/speech/assets/tacotron_04.png
--------------------------------------------------------------------------------
/speech/assets/waveglow_01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/speech/assets/waveglow_01.png
--------------------------------------------------------------------------------
/speech/assets/waveglow_02.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/speech/assets/waveglow_02.png
--------------------------------------------------------------------------------
/speech/assets/waveglow_03.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/speech/assets/waveglow_03.png
--------------------------------------------------------------------------------
/speech/assets/wavenet_01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/speech/assets/wavenet_01.png
--------------------------------------------------------------------------------
/speech/assets/wavenet_02.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/speech/assets/wavenet_02.png
--------------------------------------------------------------------------------
/speech/assets/wavenet_03.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/speech/assets/wavenet_03.png
--------------------------------------------------------------------------------
/speech/assets/wavenet_04.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/speech/assets/wavenet_04.png
--------------------------------------------------------------------------------
/speech/assets/wavenet_05.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/speech/assets/wavenet_05.png
--------------------------------------------------------------------------------
/time-series/NeuralODE.md:
--------------------------------------------------------------------------------
1 | # Neural Ordinary Differential Equations (2019), Ricky T. Q. Chen et al.
2 |
3 | ###### contributors: [@GitYCC](https://github.com/GitYCC)
4 |
5 | \[[paper](https://arxiv.org/pdf/1806.07366.pdf)\] \[[code](https://github.com/rtqichen/torchdiffeq)\]
6 |
7 | ---
8 |
9 | ### Abstract
10 |
11 | - We introduce a new family of deep neural network models. Instead of specifying a discrete sequence of hidden layers, we parameterize the derivative of the hidden state using a neural network. The output of the network is computed using a black-box differential equation solver. These continuous-depth models have constant memory cost, adapt their evaluation strategy to each input, and can explicitly trade numerical precision for speed.
12 |
13 |
14 |
15 | ### Neural ODE
16 |
17 | **Residual networks as steps of the Euler method**
18 |
19 | - Ordinary differential equation (ODE)
20 |
21 | $$
22 | \frac{d\bold{h}(t)}{dt}=f(\bold{h}(t),t,\theta)
23 | $$
24 |
25 | - Euler discretization:
26 | $$
27 | \bold{h}_{t+1}=\bold{h}_t+f(\bold{h}_t,\theta_t)
28 | $$
29 |
30 | - this is a residual layer
31 |
32 | - However, Euler discretization is not a good ODE solver; modern solvers are more accurate and can adapt their step size.
33 |
34 | - Could we have continuous-depth models instead of specifying a discrete sequence of hidden layers?
35 |
36 |
37 | 
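As a rough illustration (not from the paper), here is a minimal PyTorch sketch contrasting the two views: with step size 1, one explicit Euler step of a learned vector field is exactly the residual update $\bold{h}_{t+1}=\bold{h}_t+f(\bold{h}_t,\theta_t)$. The module `VectorField`, the dimensions, and `n_steps` are placeholder choices.

```python
import torch
import torch.nn as nn

class VectorField(nn.Module):
    """A small network playing the role of f(h, t, theta)."""
    def __init__(self, dim):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(dim + 1, dim), nn.Tanh(), nn.Linear(dim, dim))

    def forward(self, h, t):
        t_col = torch.full_like(h[:, :1], t)          # append scalar time as an extra feature
        return self.net(torch.cat([h, t_col], dim=1))

def euler_forward(f, h0, t0=0.0, t1=1.0, n_steps=10):
    """Explicit Euler: h <- h + dt * f(h, t). With dt = 1 each step is a residual block."""
    h, t = h0, t0
    dt = (t1 - t0) / n_steps
    for _ in range(n_steps):
        h = h + dt * f(h, t)
        t = t + dt
    return h

f = VectorField(dim=4)
h1 = euler_forward(f, torch.randn(8, 4))              # a depth-10 "continuous" forward pass
```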
38 |
39 |
40 |
41 | **Adjoint Sensitivity Method**
42 |
43 | 
44 |
45 | - $\bold{z}(t_0)=\bold{z}(t_1)+\int_{t_1}^{t_0}f(\bold{z}(t),t,\theta)dt$
46 |
47 | - we can use a black-box ODE solver to evaluate the integral above
48 | $$
49 | \bold{z}(t_0)=\text{ODESolve}(\bold{z}(t_1),f=f(\bold{z}(t),t,\theta),t_1,t_0,\theta)
50 | $$
51 |
52 | - If loss $L(\bold{z}(t_1))$, $\frac{d\bold{z}(t)}{dt}=f(\bold{z}(t),t,\theta)$ and $\bold{a}(t)=\frac{dL}{d\bold{z}(t)}$, then
53 | $$
54 | \bold{a}(t_0)=\bold{a}(t_1)-\int_{t_{1}}^{t_{0}}\bold{a}(t)^T\frac{\partial f(\bold{z}(t),t,\theta)}{\partial \bold{z}(t)}dt
55 | $$
56 |
57 | - we can use a black-box ODE solver to evaluate the integral above
58 | $$
59 | \bold{a}(t_0)=\text{ODESolve}(\bold{a}(t_1),f=-\bold{a}(t)^T\frac{\partial f(\bold{z}(t),t,\theta)}{\partial \bold{z}(t)},t_1,t_0,\theta)
60 | $$
61 |
62 | - Computing the gradients with respect to the parameters $\theta$ requires evaluating a third integral, which depends on both $\bold{z}(t)$ and $\bold{a}(t)$:
63 | $$
64 | \frac{dL}{d\theta}=-\int_{t_1}^{t_0}\bold{a}(t)^T\frac{\partial f(\bold{z}(t),t,\theta)}{\partial \theta}dt
65 | $$
66 |
67 | - we can use a black-box ODE solver to evaluate the integral above
68 | $$
69 | \frac{dL}{d\theta}=\text{ODESolve}(\bold{0}_{|\theta|},f=-\bold{a}(t)^T\frac{\partial f(\bold{z}(t),t,\theta)}{\partial \theta},t_1,t_0,\theta)
70 | $$
71 |
72 | 
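The authors' released `torchdiffeq` package (linked above) wraps this machinery. Below is a minimal, hedged usage sketch; the MLP architecture, dimensions, and time span are placeholders. Calling `backward()` triggers the adjoint computation rather than backpropagating through the solver's internal steps.

```python
import torch
import torch.nn as nn
from torchdiffeq import odeint_adjoint as odeint   # constant-memory gradients via the adjoint method

class ODEFunc(nn.Module):
    """dz/dt = f(z(t), t, theta), parameterized by a small MLP (placeholder architecture)."""
    def __init__(self, dim):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(dim, dim), nn.Tanh(), nn.Linear(dim, dim))

    def forward(self, t, z):                        # torchdiffeq expects the signature f(t, z)
        return self.net(z)

func = ODEFunc(dim=4)
z0 = torch.randn(8, 4)
t = torch.tensor([0.0, 1.0])

z1 = odeint(func, z0, t)[-1]                        # z(t1) = ODESolve(z(t0), f, t0, t1)
loss = z1.pow(2).mean()
loss.backward()                                     # dL/dtheta obtained by solving the adjoint ODE backwards in time
```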
73 |
74 |
75 |
76 | ### Replacing residual networks with ODEs for supervised learning
77 |
78 | 
79 |
80 |
81 |
82 | ### A generative latent function time-series model
83 |
84 | 
85 |
86 | - We can train this latent-variable model as a variational autoencoder, with sequence-valued observations. Our recognition net is an RNN, which consumes the data sequentially backwards in time, and outputs $q_φ(z_0|x_1, x_2, . . . , x_N)$.
87 |
88 |
89 |
90 | 
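A hedged sketch of this latent ODE model is below; the recognition GRU, decoder, and layer sizes are placeholder choices, and the ELBO/KL terms are omitted for brevity.

```python
import torch
import torch.nn as nn
from torchdiffeq import odeint

class LatentODEFunc(nn.Module):
    """dz/dt in latent space."""
    def __init__(self, latent_dim=4):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(latent_dim, 32), nn.Tanh(), nn.Linear(32, latent_dim))

    def forward(self, t, z):
        return self.net(z)

class LatentODEVAE(nn.Module):
    """Recognition RNN -> q(z0 | x_1..x_N) -> ODE solve for z(t_i) -> decoder."""
    def __init__(self, obs_dim=2, latent_dim=4, hidden=16):
        super().__init__()
        self.rnn = nn.GRU(obs_dim, hidden, batch_first=True)
        self.to_qz0 = nn.Linear(hidden, 2 * latent_dim)        # mean and log-variance of z0
        self.func = LatentODEFunc(latent_dim)
        self.decoder = nn.Linear(latent_dim, obs_dim)

    def forward(self, x, t):
        _, h = self.rnn(torch.flip(x, dims=[1]))               # consume the sequence backwards in time
        mu, logvar = self.to_qz0(h[-1]).chunk(2, dim=-1)
        z0 = mu + torch.randn_like(mu) * (0.5 * logvar).exp()  # reparameterization trick
        zt = odeint(self.func, z0, t)                          # latent trajectory at the requested times
        return self.decoder(zt).permute(1, 0, 2), mu, logvar   # (batch, time, obs_dim)

model = LatentODEVAE()
x = torch.randn(8, 20, 2)                                      # 8 series of length 20
t = torch.linspace(0.0, 1.0, 20)
x_hat, mu, logvar = model(x, t)
```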
--------------------------------------------------------------------------------
/time-series/assets/NeuralODE_01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/time-series/assets/NeuralODE_01.png
--------------------------------------------------------------------------------
/time-series/assets/NeuralODE_02.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/time-series/assets/NeuralODE_02.png
--------------------------------------------------------------------------------
/time-series/assets/NeuralODE_03.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/time-series/assets/NeuralODE_03.png
--------------------------------------------------------------------------------
/time-series/assets/NeuralODE_04.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/time-series/assets/NeuralODE_04.png
--------------------------------------------------------------------------------
/time-series/assets/NeuralODE_05.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/time-series/assets/NeuralODE_05.png
--------------------------------------------------------------------------------
/time-series/assets/NeuralODE_06.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/time-series/assets/NeuralODE_06.png
--------------------------------------------------------------------------------
/understanding-generalization-transfer/assets/deep-neural-networks-are-easily-fooled_001.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/understanding-generalization-transfer/assets/deep-neural-networks-are-easily-fooled_001.png
--------------------------------------------------------------------------------
/understanding-generalization-transfer/assets/deep-neural-networks-are-easily-fooled_002.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/understanding-generalization-transfer/assets/deep-neural-networks-are-easily-fooled_002.png
--------------------------------------------------------------------------------
/understanding-generalization-transfer/assets/deep-neural-networks-are-easily-fooled_003.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/understanding-generalization-transfer/assets/deep-neural-networks-are-easily-fooled_003.png
--------------------------------------------------------------------------------
/understanding-generalization-transfer/assets/deep-neural-networks-are-easily-fooled_004.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/understanding-generalization-transfer/assets/deep-neural-networks-are-easily-fooled_004.png
--------------------------------------------------------------------------------
/understanding-generalization-transfer/assets/deep-neural-networks-are-easily-fooled_005.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/understanding-generalization-transfer/assets/deep-neural-networks-are-easily-fooled_005.png
--------------------------------------------------------------------------------
/understanding-generalization-transfer/assets/deep-neural-networks-are-easily-fooled_006.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/understanding-generalization-transfer/assets/deep-neural-networks-are-easily-fooled_006.png
--------------------------------------------------------------------------------
/understanding-generalization-transfer/assets/distilling-the-knowledge-in-a-neural-network_001.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/understanding-generalization-transfer/assets/distilling-the-knowledge-in-a-neural-network_001.png
--------------------------------------------------------------------------------
/understanding-generalization-transfer/assets/how-transferable-are-features-in-deep-neural-networks_001.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/understanding-generalization-transfer/assets/how-transferable-are-features-in-deep-neural-networks_001.png
--------------------------------------------------------------------------------
/understanding-generalization-transfer/assets/how-transferable-are-features-in-deep-neural-networks_002.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/understanding-generalization-transfer/assets/how-transferable-are-features-in-deep-neural-networks_002.png
--------------------------------------------------------------------------------
/understanding-generalization-transfer/assets/how-transferable-are-features-in-deep-neural-networks_003.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/understanding-generalization-transfer/assets/how-transferable-are-features-in-deep-neural-networks_003.png
--------------------------------------------------------------------------------
/understanding-generalization-transfer/assets/image.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/understanding-generalization-transfer/assets/image.png
--------------------------------------------------------------------------------
/understanding-generalization-transfer/assets/rethinking_pre-training_and_self-training_01.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/understanding-generalization-transfer/assets/rethinking_pre-training_and_self-training_01.png
--------------------------------------------------------------------------------
/understanding-generalization-transfer/assets/rethinking_pre-training_and_self-training_02.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/understanding-generalization-transfer/assets/rethinking_pre-training_and_self-training_02.png
--------------------------------------------------------------------------------
/understanding-generalization-transfer/assets/rethinking_pre-training_and_self-training_03.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/understanding-generalization-transfer/assets/rethinking_pre-training_and_self-training_03.png
--------------------------------------------------------------------------------
/understanding-generalization-transfer/assets/rethinking_pre-training_and_self-training_04.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/understanding-generalization-transfer/assets/rethinking_pre-training_and_self-training_04.png
--------------------------------------------------------------------------------
/understanding-generalization-transfer/assets/rethinking_pre-training_and_self-training_05.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/understanding-generalization-transfer/assets/rethinking_pre-training_and_self-training_05.png
--------------------------------------------------------------------------------
/understanding-generalization-transfer/assets/rethinking_pre-training_and_self-training_06.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GitYCC/machine-learning-papers-summary/6cce67e2784e808cc9f16f92fde8466667040be6/understanding-generalization-transfer/assets/rethinking_pre-training_and_self-training_06.png
--------------------------------------------------------------------------------
/understanding-generalization-transfer/distilling-the-knowledge-in-a-neural-network.md:
--------------------------------------------------------------------------------
1 | # Distilling the knowledge in a neural network (2015), G. Hinton et al.
2 |
3 | ###### contributors: [@GitYCC](https://github.com/GitYCC)
4 |
5 | \[[paper](http://arxiv.org/pdf/1503.02531)\]
6 |
7 | ---
8 |
9 | * latency and computational resources are very important in deployment stage
10 | * cumbersome model
11 | * ensemble of separately trained models
12 | * large model with strong regularization \(such as dropout\)
13 | * Once the cumbersome model has been trained, we can then use a different kind of training, which we call "distillation" to transfer the knowledge from the cumbersome model to a small model that is more suitable for deployment.
14 |
15 | * teacher model and student model
16 | * hard targets: one-hot encoding of the ground-truth labels
17 | * soft targets: use the class probabilities produced by the cumbersome model to train the distilled model
18 | * for an ensemble, the soft targets are the arithmetic or geometric mean of the individual models' predictive distributions
19 | * larger entropy -> more information than hard targets
20 | * training: needs less data and can use a higher learning rate \(vs. hard targets\)
21 | * soft targets have more information between classes than hard targets
22 | * e.g. soft targets show that a BMW is more similar to a garbage truck than to a carrot
23 | * e.g. in MNIST, soft targets provide the information that "1" is similar to "7"
24 | * e.g. you could remove one class from the MNIST transfer set and distillation still works
25 | * use temperature to increase entropy of soft targets
26 | * softmax:
27 | $$
28 | q_i=exp\{z_i\}/\sum_j exp\{z_j\}
29 | $$
30 |
31 | * softmax with temperature
32 | $$
33 | q_i=exp\{z_i/T\}/\sum_j exp\{z_j/T\}
34 | $$
35 |
36 | * method
37 | * Pretrained cumbersome model
38 | * Train distilled model
39 | * model with temperature
40 | * use same high temperature in training
41 | * use $T=1$ in deployment
42 | * use real data to help train the distilled model:
43 |
44 | weighted average of two different objective functions
45 |
46 | * $C_1$ from the trained cumbersome model at high temperature
47 | * $C_2$ from the real labels with normal cross entropy
48 | * a smaller weight on the second objective \($C_2$\) works better
49 | * $loss=C_1T^2+\alpha C_2$ \(why $T^2$? explained below; a sketch of this loss appears at the end of this summary\)
50 |
51 | * $C_2=\sum_i (-y_i\ln(q_i)-(1-y_i)\ln(1-q_i))$
52 |
53 | * gradient:
54 | $$
55 | \frac{\partial C_2}{\partial z_i}=\frac{1}{T}(q_i-y_i)
56 | $$
57 |
58 | * $C_1=\sum_i (-p_i\ln(q_i)-(1-p_i)\ln(1-q_i))$
59 | * $p_i=exp\{v_i\}/\sum_j exp\{v_j\}$ from cumbersome model
60 | * $\frac{\partial C_1}{\partial z_i}=\frac{1}{T}(q_i-p_i)$
61 | * in the high-temperature limit, and assuming the logits are zero-meaned separately for each example: $\frac{\partial C_1}{\partial z_i}\approx\frac{1}{NT^2}(z_i-v_i)$
62 | * equivalently, distillation approximately matches logits: $C_1\approx \frac{1}{NT^2}\sum\frac{1}{2}(z_i-v_i)^2$
63 | * $C_1T^2$ ensures that the relative contributions of the hard and soft targets remain roughly unchanged if the temperature used for distillation is changed while experimenting with meta-parameters
64 | * Experiments on speech recognition:
65 | * DNN acoustic models with about 85M parameters in total. The directly trained model is the _baseline_, a 10x ensemble of models randomly initialized with different parameter values is the _cumbersome model_, and the model distilled from the ensemble is the _distilled single model_
66 | * For the distillation we tried temperatures of 1,2,5,10 and used a relative weight of 0.5 on the cross-entropy for the hard targets.
67 | * evaluation: frame accuracy \(sample frames with time-box\) and word error rate \(WER\)
68 |
69 | 
70 |
71 | * soft targets as regularizers
72 | * One of our main claims about using soft targets instead of hard targets is that a lot of helpful information can be carried in soft targets that could not possibly be encoded with a single hard target.
73 |
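A minimal sketch of the distillation objective described above, $loss=C_1T^2+\alpha C_2$, assuming the standard multi-class cross entropy for both terms rather than the per-class form written earlier; `T` and `alpha` are meta-parameters to tune.

```python
import torch
import torch.nn.functional as F

def distillation_loss(student_logits, teacher_logits, labels, T=5.0, alpha=0.5):
    """loss = C1 * T**2 + alpha * C2."""
    # C1: cross entropy between the teacher's and student's softened distributions (temperature T)
    soft_teacher = F.softmax(teacher_logits / T, dim=1)
    log_soft_student = F.log_softmax(student_logits / T, dim=1)
    c1 = -(soft_teacher * log_soft_student).sum(dim=1).mean()
    # C2: ordinary cross entropy against the hard ground-truth labels (T = 1)
    c2 = F.cross_entropy(student_logits, labels)
    # multiplying C1 by T**2 keeps its gradient magnitude comparable to C2 when T changes
    return c1 * T ** 2 + alpha * c2

student_logits = torch.randn(16, 10)
teacher_logits = torch.randn(16, 10)
labels = torch.randint(0, 10, (16,))
print(distillation_loss(student_logits, teacher_logits, labels))
```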
--------------------------------------------------------------------------------
/understanding-generalization-transfer/rethinking_pre-training_and_self-training.md:
--------------------------------------------------------------------------------
1 | # Rethinking Pre-training and Self-training (2020), Barret Zoph et al.
2 |
3 | ###### contributors: [@GitYCC](https://github.com/GitYCC)
4 |
5 | \[[paper](https://arxiv.org/pdf/2006.06882.pdf)\]
6 |
7 | ---
8 |
9 | Our study reveals the generality and flexibility of self-training with three additional insights:
10 | - 1) stronger data augmentation and more labeled data further diminish the value of pre-training
11 | - 2) unlike pre-training, self-training is always helpful when using stronger data augmentation, in both low-data and high-data regimes
12 | - 3) in the case that pre-training is helpful, self-training improves upon pre-training
13 |
14 |
15 |
16 | **Methodology**
17 |
18 | - Data Augmentation
19 | - **Augment-S1**: Weakest augmentation: Flips and Crops
20 | - **Augment-S2**: Third strongest augmentation: AutoAugment, Flips and Crops
21 | - **Augment-S3**: Second strongest augmentation: Large Scale Jittering, AutoAugment, Flips and Crops
22 | - **Augment-S4**: Strongest augmentation: Large Scale Jittering, RandAugment, Flips and Crops
23 | - Supervised Pre-training: using the EfficientNet-B7 architecture
24 | - **Rand Init**: Model initialized w/ random weights
25 | - **ImageNet Init**: Model initialized w/ ImageNet pre-trained checkpoint (84.5% top-1)
26 | - **ImageNet++ Init**: Model initialized w/ higher performing ImageNet pre-trained checkpoint (86.9% top-1)
27 | - Self-supervised Pre-training: SimCLR
28 | - Self-training: First, a teacher model is trained on the labeled data (e.g., COCO dataset). Then the teacher model generates pseudo labels on unlabeled data (e.g., ImageNet dataset). Finally, a student is trained to optimize the loss on human labels and pseudo labels jointly.
29 | - the standard loss for self-training: $\hat{L}=L_h+\alpha L_p$ (where: $L_h$ and $L_p$ are human loss and pseudo loss)
30 | - But this standard loss is unstable (NaN loss)
31 | - 
32 | - We thus implement a Loss Normalization method: $\hat{L}=\frac{1}{1+\alpha}(L_h+\alpha\frac{\bar{L_h}}{\bar{L_p}}L_p)$ (where: $\bar{L_h}$ and $\bar{L_p}$ are their respective moving averages over training)
33 |
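A minimal sketch of this Loss Normalization, assuming simple exponential moving averages for $\bar{L_h}$ and $\bar{L_p}$ (the averaging scheme and momentum value are placeholder choices):

```python
class NormalizedSelfTrainingLoss:
    """L_hat = (L_h + alpha * (Lh_avg / Lp_avg) * L_p) / (1 + alpha)."""

    def __init__(self, alpha=1.0, momentum=0.99):
        self.alpha = alpha
        self.momentum = momentum
        self.h_avg = None            # moving average of the human-label loss
        self.p_avg = None            # moving average of the pseudo-label loss

    def __call__(self, loss_human, loss_pseudo):
        h, p = float(loss_human), float(loss_pseudo)   # averages are tracked outside the autograd graph
        self.h_avg = h if self.h_avg is None else self.momentum * self.h_avg + (1 - self.momentum) * h
        self.p_avg = p if self.p_avg is None else self.momentum * self.p_avg + (1 - self.momentum) * p
        scale = self.h_avg / max(self.p_avg, 1e-8)
        return (loss_human + self.alpha * scale * loss_pseudo) / (1 + self.alpha)

criterion = NormalizedSelfTrainingLoss(alpha=1.0)
# total = criterion(loss_on_human_labels, loss_on_pseudo_labels)  # inside the training loop
```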
34 |
35 |
36 | **Experiments**
37 |
38 | - The effects of augmentation and labeled dataset size on pre-training
39 | - 
40 | - observations:
41 | - Supervised pre-training hurts performance when stronger data augmentation is used
42 | - More labeled data diminishes the value of supervised pre-training
43 | - The effects of augmentation and labeled dataset size on self-training
44 | - 
45 | - observations:
46 | - Self-training helps in high data/strong augmentation regimes, even when supervised pre-training hurts
47 | - 
48 | - observations:
49 | - Self-training works across dataset sizes and is additive to supervised pre-training
50 | - Self-supervised pre-training also hurts when self-training helps in high data/strong augmentation regimes
51 | - 
52 |
53 |
54 |
55 | **Discussion**
56 |
57 | - Rethinking pre-training and universal feature representations
58 | - The importance of task alignment
59 | - One interesting result in our experiments is that ImageNet pre-training, even with additional human labels, performs worse than self-training.
60 | - With strong data augmentation (Augment-S4), training with train+aug actually hurts accuracy, but pseudo labels generated by self-training on the same aug dataset significantly improve accuracy.
61 | - 
62 | - **Above both results suggest that noisy (PASCAL) or un-targeted (ImageNet) labeling is worse than targeted pseudo labeling.**
63 | - Limitations: self-training requires more compute than fine-tuning on a pre-trained model
64 | - The scalability, generality and flexibility of self-training
65 |
66 |
--------------------------------------------------------------------------------