├── .gitignore
├── README.md
├── new_paper.md
└── papers
    ├── 1606_progressive_nn.md
    ├── 1606_progressive_nn
    │   ├── main.png
    │   └── results.png
    ├── 1702_cmp.md
    ├── 1702_cmp
    │   ├── f2.png
    │   └── f3.png
    ├── 1703_maml.md
    ├── 1703_maml
    │   ├── a1.png
    │   ├── a2.png
    │   ├── f1.png
    │   ├── f2.png
    │   ├── f3.png
    │   ├── f5.png
    │   └── t1.png
    ├── 1707_distral.md
    ├── 1707_distral
    │   ├── f1.png
    │   └── objective.png
    ├── 1708_reproducible_rl.md
    ├── 1708_rl_survey.md
    ├── 1801_soft_ac.md
    ├── 1801_soft_ac
    │   ├── algorithm.png
    │   ├── f1.png
    │   ├── policy.png
    │   ├── policy_update.png
    │   ├── qvalue_update.png
    │   ├── results_envs.png
    │   ├── results_hyperparams.png
    │   ├── results_improvements.png
    │   ├── results_seeds.png
    │   ├── soft_policy.png
    │   ├── soft_theory.png
    │   └── value_update.png
    ├── 1802_me_trpo.md
    ├── 1802_me_trpo
    │   ├── a1.png
    │   ├── a2.png
    │   ├── f2.png
    │   ├── f4.png
    │   └── f5.png
    ├── 1803_behavioral_cloning.md
    ├── 1803_behavioral_cloning
    │   ├── baselines.png
    │   ├── cooperation.png
    │   ├── f3.png
    │   ├── macro_goals.png
    │   ├── multimodal.png
    │   ├── notation.png
    │   ├── objectives.png
    │   ├── objectives2.png
    │   ├── s1.png
    │   ├── t2.png
    │   └── vrnn.png
    ├── 1803_cnn_vs_rnn.md
    ├── 1803_cnn_vs_rnn
    │   ├── f1.png
    │   ├── f6.png
    │   ├── t1.png
    │   ├── t4.png
    │   └── t5.png
    ├── 1803_smith_part1.md
    ├── 1803_smith_part1
    │   ├── p1.png
    │   └── p2.png
    ├── 1803_sptm.md
    ├── 1803_sptm
    │   ├── f1.png
    │   ├── f2.png
    │   └── t1.png
    ├── 1803_world_models.md
    ├── 1803_world_models
    │   ├── car_exp1.png
    │   ├── cover_exp1.png
    │   ├── f4.png
    │   └── t2.png
    ├── 1804_dora.md
    ├── 1804_dora
    │   ├── a1.png
    │   ├── e_values.png
    │   ├── example.png
    │   ├── f1.png
    │   ├── f13.png
    │   ├── f6.png
    │   ├── t1.png
    │   └── t2.png
    ├── 1804_gotta_learn_fast.md
    ├── 1804_gotta_learn_fast
    │   ├── baselines.png
    │   └── retro.png
    ├── 1805_progress_compress.md
    ├── 1805_progress_compress
    │   ├── adaptors.png
    │   ├── f1.png
    │   ├── f2.png
    │   ├── f3.png
    │   ├── objective_kb.png
    │   ├── objective_task.png
    │   └── t1.png
    ├── 1805_youtube.md
    └── 1805_youtube
        ├── f2.png
        ├── f3.png
        ├── f7.png
        └── reward.png
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 |
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 |
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 |
49 | # Translations
50 | *.mo
51 | *.pot
52 |
53 | # Django stuff:
54 | *.log
55 | local_settings.py
56 |
57 | # Flask stuff:
58 | instance/
59 | .webassets-cache
60 |
61 | # Scrapy stuff:
62 | .scrapy
63 |
64 | # Sphinx documentation
65 | docs/_build/
66 |
67 | # PyBuilder
68 | target/
69 |
70 | # Jupyter Notebook
71 | .ipynb_checkpoints
72 |
73 | # pyenv
74 | .python-version
75 |
76 | # celery beat schedule file
77 | celerybeat-schedule
78 |
79 | # SageMath parsed files
80 | *.sage.py
81 |
82 | # dotenv
83 | .env
84 |
85 | # virtualenv
86 | .venv
87 | venv/
88 | ENV/
89 |
90 | # Spyder project settings
91 | .spyderproject
92 | .spyproject
93 |
94 | # Rope project settings
95 | .ropeproject
96 |
97 | # mkdocs documentation
98 | /site
99 |
100 | # mypy
101 | .mypy_cache/
102 |
103 | .DS_store
104 | .idea
105 | .code
106 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | #### 2018-05
2 | - Progress & Compress: A scalable framework for continual learning [[arxiv](https://arxiv.org/abs/1805.06370)] & [[notes](https://github.com/Scitator/papers/blob/master/papers/1805_progress_compress.md)]
3 | - Playing hard exploration games by watching YouTube [[arxiv](https://arxiv.org/abs/1805.11592)] & [[notes](https://github.com/Scitator/papers/blob/master/papers/1805_youtube.md)]
4 | #### 2018-04
5 | - DORA The Explorer: Directed Outreaching Reinforcement Action-Selection [[arxiv](https://arxiv.org/abs/1804.04012)] & [[notes](https://github.com/Scitator/papers/blob/master/papers/1804_dora.md)]
6 | - Gotta Learn Fast: A New Benchmark for Generalization in RL [[arxiv](https://arxiv.org/abs/1804.03720)] & [[notes](https://github.com/Scitator/papers/blob/master/papers/1804_gotta_learn_fast.md)]
7 |
8 | #### 2018-03
9 | - An Empirical Evaluation of Generic Convolutional and Recurrent Networks for Sequence Modeling [[arxiv](https://arxiv.org/abs/1803.01271)] & [[notes](https://github.com/Scitator/papers/blob/master/papers/1803_cnn_vs_rnn.md)]
10 | - Generative Multi-Agent Behavioral Cloning [[arxiv](https://arxiv.org/abs/1803.07612)] & [[notes](https://github.com/Scitator/papers/blob/master/papers/1803_behavioral_cloning.md)]
11 | - World Models [[arxiv](https://arxiv.org/abs/1803.10122)] & [[notes](https://github.com/Scitator/papers/blob/master/papers/1803_world_models.md)]
12 | - Semi-parametric Topological Memory for Navigation [[arxiv](https://arxiv.org/abs/1803.00653)] & [[notes](https://github.com/Scitator/papers/blob/master/papers/1803_sptm.md)]
13 | - A disciplined approach to neural network hyper-parameters: Part 1 -- learning rate, batch size, momentum, and weight decay [[arxiv](https://arxiv.org/abs/1803.09820)] & [[notes](https://github.com/Scitator/papers/blob/master/papers/1803_smith_part1.md)]
14 |
15 | #### 2018-02
16 |
17 | - Model-Ensemble Trust-Region Policy Optimization [[arxiv](https://arxiv.org/abs/1802.10592)] & [[notes](https://github.com/Scitator/papers/blob/master/papers/1802_me_trpo.md)]
18 |
19 | #### 2018-01
20 | - Soft Actor-Critic: Off-Policy Maximum Entropy Deep Reinforcement Learning with a Stochastic Actor [[arxiv](https://arxiv.org/abs/1801.01290)] & [[notes](https://github.com/Scitator/papers/blob/master/papers/1801_soft_ac.md)]
21 |
22 | #### 2017-08
23 |
24 | - A Brief Survey of Deep Reinforcement Learning [[arxiv](https://arxiv.org/abs/1708.05866)] & [[notes](https://github.com/Scitator/papers/blob/master/papers/1708_rl_survey.md)]
25 | - Reproducibility of Benchmarked Deep Reinforcement Learning Tasks for Continuous Control [[arxiv](https://arxiv.org/abs/1708.04133)] & [[notes](https://github.com/Scitator/papers/blob/master/papers/1708_reproducible_rl.md)]
26 |
27 | #### 2017-07
28 | - Distral: Robust Multitask Reinforcement Learning [[arxiv](https://arxiv.org/abs/1707.04175)] & [[notes](https://github.com/Scitator/papers/blob/master/papers/1707_distral.md)]
29 |
30 | #### 2017-03
31 | - Model-Agnostic Meta-Learning for Fast Adaptation of Deep Networks [[arxiv](https://arxiv.org/abs/1703.03400)] & [[notes](https://github.com/Scitator/papers/blob/master/papers/1703_maml.md)]
32 |
33 | #### 2017-02
34 | - Cognitive Mapping and Planning for Visual Navigation [[arxiv](https://arxiv.org/abs/1702.03920)] & [[notes](https://github.com/Scitator/papers/blob/master/papers/1702_cmp.md)]
35 |
36 | #### 2016-06
37 | - Progressive Neural Networks [[arxiv](https://arxiv.org/abs/1606.04671)] & [[notes](https://github.com/Scitator/papers/blob/master/papers/1606_progressive_nn.md)]
--------------------------------------------------------------------------------
/new_paper.md:
--------------------------------------------------------------------------------
1 | # [Paper](https://arxiv.org)
2 |
3 |
4 | ##### TLDR
5 |
6 |
7 |
8 | -
9 |
10 | ##### Notes
11 |
12 | -
13 |
14 | ##### Afterword
15 |
16 | -
17 |
18 | ##### Interesting links
19 |
20 | 1.
--------------------------------------------------------------------------------
/papers/1606_progressive_nn.md:
--------------------------------------------------------------------------------
1 | # [Progressive Neural Networks](https://arxiv.org/abs/1606.04671)
2 |
3 | ##### TLDR
4 |
5 | Additional article – [Sim-to-Real Robot Learning from Pixels with Progressive Nets](https://arxiv.org/abs/1610.04286)
6 |
7 | Presentation – https://www.youtube.com/watch?v=aWAP_CWEtSI
8 |
9 | Slides – http://juxi.net/workshop/deep-learning-rss-2016/slides/Raia_Hadsell_RSS_DL_workshop.pdf
10 |
11 |
12 |
13 | ###### Idea:
14 |
15 | 
16 |
17 |
18 |
19 | ###### Intuition:
20 |
21 | We have some trained network for task 1 – "column 1" – and we want to transfer its knowledge to a new task. Rather than fine-tune column 1, let's freeze it and make a new clean copy of it – column 2. Then we train only column 2 on the new task, BUT with the inner features from column 1 added via lateral connections.
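
A minimal PyTorch sketch of this frozen-column + lateral-connection idea (the two-layer MLP columns, dimensions and adapter are illustrative, not the paper's exact architecture):

```python
import torch
import torch.nn as nn

class Column(nn.Module):
    """One two-layer column; column 1 is trained on task 1 and then frozen."""
    def __init__(self, in_dim=64, hidden=128, out_dim=4):
        super().__init__()
        self.layer1 = nn.Linear(in_dim, hidden)
        self.layer2 = nn.Linear(hidden, hidden)
        self.head = nn.Linear(hidden, out_dim)

    def forward(self, x):
        h1 = torch.relu(self.layer1(x))
        h2 = torch.relu(self.layer2(h1))
        return self.head(h2), h1              # also expose the inner features

class Column2(nn.Module):
    """Second column for task 2: its own weights plus a lateral adapter from column 1."""
    def __init__(self, column1: Column, in_dim=64, hidden=128, out_dim=4):
        super().__init__()
        self.column1 = column1
        for p in self.column1.parameters():
            p.requires_grad = False           # column 1 stays frozen
        self.layer1 = nn.Linear(in_dim, hidden)
        self.layer2 = nn.Linear(hidden, hidden)
        self.lateral = nn.Linear(hidden, hidden)  # adapter for column-1 features
        self.head = nn.Linear(hidden, out_dim)

    def forward(self, x):
        with torch.no_grad():
            _, h1_col1 = self.column1(x)      # frozen inner features from column 1
        h1 = torch.relu(self.layer1(x))
        h2 = torch.relu(self.layer2(h1) + self.lateral(h1_col1))
        return self.head(h2)
```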
22 |
23 | ###### Results:
24 |
25 | Additionally, we can analyse which blocks are actually used under different knowledge-transfer setups.
26 |
27 | 
--------------------------------------------------------------------------------
/papers/1606_progressive_nn/main.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Scitator/papers/39f00789d1f762eda7975623d79322a20f323c40/papers/1606_progressive_nn/main.png
--------------------------------------------------------------------------------
/papers/1606_progressive_nn/results.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Scitator/papers/39f00789d1f762eda7975623d79322a20f323c40/papers/1606_progressive_nn/results.png
--------------------------------------------------------------------------------
/papers/1702_cmp.md:
--------------------------------------------------------------------------------
1 | # [Cognitive Mapping and Planning for Visual Navigation](https://arxiv.org/abs/1702.03920)
2 |
3 | ##### TLDR
4 |
5 | Authors suggest new NN methods for navigation in novel unseen environments. They use *Mapper* to transform the visual input (frames from drone cameras) into some top-down belief map of the world (latent spatial memory) and *Planner* for decision making and goals achievement. All is trained **end-to-end**.
6 |
7 | - looks like very interesting area of research, because:
8 | - computer vision and environment representation problem
9 | - memorisation problems (why am I thinking about NLP with their memory networks?)
10 | - decision making, navigation and planning (and yes, it's RL)
11 | - lot's of additional material in Appendix
12 | - **All needed notes (don't miss the videos!) ->** [project website](https://sites.google.com/view/cognitive-mapping-and-planning/)
13 | - [source code](https://github.com/tensorflow/models/tree/master/research/cognitive_mapping_and_planning)
14 | - [slides](https://people.eecs.berkeley.edu/~sgupta/pdf/cmp_slides.pdf)
15 |
16 | ##### Notes
17 |
18 | Looks like authors did all my job, cause I found their project notes quite good.
19 |
20 | Nevertheless, I need to mention that the whole pipeline was trained with the [DAGGER](https://arxiv.org/abs/1011.0686) algorithm.
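
A minimal sketch of the DAGGER loop (the helper names `collect_states`, `expert_policy` and `learner.fit` are illustrative placeholders, not the authors' code):

```python
def dagger(env, learner, expert_policy, collect_states, n_iters=10):
    """Roll out the current learner, relabel the visited states with expert
    actions, and retrain the learner on the aggregated dataset."""
    dataset = []
    for _ in range(n_iters):
        # 1. run the current learner and record the states it visits
        states = collect_states(env, learner)
        # 2. query the expert for the action it would take in each visited state
        dataset += [(s, expert_policy(s)) for s in states]
        # 3. supervised learning on the whole aggregated dataset (behaviour-cloning step)
        learner.fit(dataset)
    return learner
```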
21 |
22 | ###### Architectures
23 |
24 | 
25 |
26 | 
27 |
28 | So, *Mapper* builds a 2D multi-scale metric map that is used by *Planner* for end-to-end trainable planning.
29 |
30 | ###### Interesting
31 |
32 | - NN mapper works better than analytic one.
33 |
34 | ###### Future work
35 |
36 | - more experiments with dynamic environments (when people are moving around?)
37 |
38 | ##### Afterword and Questions
39 |
40 | I find their approach to visualising the belief state of the agent quite interesting, especially when we have the GT mask.
41 |
42 | - This method operates over a discrete grid, but how can we adapt it for non-grid environments?
43 | - What was the reward function? Distance to the goal? Or was it imitation learning only?
44 | - It's a bit unclear how they translate a *semantic task* into something with geometric meaning (it looks like geometric tasks are more applicable to the grid navigation problem).
45 | - Can we use memory networks for belief state creation?
46 | - Can we use attention mechanism or something like AlphaGo MCTS for better hierarchical planning?
47 |
48 | ##### Interesting links
49 |
50 | 1. How to transfer simulation-learned knowledge to the real world - [CAD2RL: Real Single-Image Flight without a Single Real Image](https://arxiv.org/abs/1611.04201)
51 | 2. DAGGER - [A Reduction of Imitation Learning and Structured Prediction to No-Regret Online Learning](https://arxiv.org/abs/1011.0686)
52 | 3. "Bi-linear sampling allows to back-propagate gradients" - [Spatial Transformer Networks](https://arxiv.org/abs/1506.02025)
53 |
--------------------------------------------------------------------------------
/papers/1702_cmp/f2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Scitator/papers/39f00789d1f762eda7975623d79322a20f323c40/papers/1702_cmp/f2.png
--------------------------------------------------------------------------------
/papers/1702_cmp/f3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Scitator/papers/39f00789d1f762eda7975623d79322a20f323c40/papers/1702_cmp/f3.png
--------------------------------------------------------------------------------
/papers/1703_maml.md:
--------------------------------------------------------------------------------
1 | # [Model-Agnostic Meta-Learning for Fast Adaptation of Deep Networks](https://arxiv.org/abs/1703.03400)
2 |
3 | ##### TLDR
4 |
5 | Currently, meta-learning is a very hot topic. The ability to adapt to different problems and correctly transfer learned experience from one task to another is crucial. To address this issue, the authors suggest a new meta-learning method that can be used both in classification and regression and, even more interestingly, in reinforcement learning too.
6 |
7 | - very simple and intuitive approach
8 | - also looks very quick in comparison with the others
9 | - source code available
10 |
11 | ##### Notes
12 |
13 | ###### Methodology
14 |
15 | Thanks to the authors, the paper has all the necessary algorithms and figures to understand it well:
16 |
17 | 
18 |
19 | 
20 |
21 | More precisely, setup for supervised or reinforcement learning (with REINFORCE algorithm):
22 |
23 | 
24 |
25 | So, here is the main idea:
26 |
27 | - take N tasks T_i with loss functions L_i that you want to be good at
28 | - for each task, adapt a sub-model theta_i with a few gradient steps on that task; sample a new dataset D_i of (features, targets) for the meta-learner
29 | - then train the shared model theta so that it is, on average, the best starting point for the theta_i's, as measured on the D_i's
30 |
31 | Interesting detail: in the original algorithm the final meta-update optimises theta through the theta_i's, which requires second derivatives. Nevertheless, since ReLU networks are locally almost linear, a first-order approximation works nearly as well.
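
A minimal MAML sketch on a tiny functional MLP (MSE regression; `params` is a list of tensors with `requires_grad=True`; shapes and names are illustrative, not the authors' code):

```python
import torch

def forward(params, x):
    """Tiny functional MLP used for illustration: params = [W1, b1, W2, b2]."""
    h = torch.relu(x @ params[0] + params[1])
    return h @ params[2] + params[3]

def maml_meta_step(params, tasks, inner_lr=0.01, meta_lr=0.001, second_order=False):
    """One meta-update over a batch of tasks; each task is a tuple
    (x_support, y_support, x_query, y_query). With second_order=False this is
    the first-order approximation discussed above."""
    meta_grads = [torch.zeros_like(p) for p in params]
    for x_s, y_s, x_q, y_q in tasks:
        # inner loop: one gradient step on the support set -> adapted theta_i
        loss_s = ((forward(params, x_s) - y_s) ** 2).mean()
        grads = torch.autograd.grad(loss_s, params, create_graph=second_order)
        theta_i = [p - inner_lr * g for p, g in zip(params, grads)]
        # outer objective: adapted parameters evaluated on the query set
        loss_q = ((forward(theta_i, x_q) - y_q) ** 2).mean()
        task_grads = torch.autograd.grad(loss_q, params)
        meta_grads = [mg + tg for mg, tg in zip(meta_grads, task_grads)]
    # meta-update of the shared initialisation theta (plain SGD here)
    with torch.no_grad():
        return [(p - meta_lr * mg / len(tasks)).requires_grad_(True)
                for p, mg in zip(params, meta_grads)]
```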
32 |
33 | ###### Results & Experiments
34 |
35 | ###### Regression
36 |
37 | 
38 |
39 | 
40 |
41 | ###### Classification
42 |
43 | 
44 |
45 | Interestingly, the first-order approximation is still very accurate.
46 |
47 | ###### RL
48 |
49 | 
50 |
51 | ##### Afterword
52 |
53 | MAML looks like a very simple but powerful approach to meta-learning. Many experimental setups: non-trivial regression, few-shot classification and velocity adaptation with RL. Lots of interesting ideas, and everything is open-sourced. Just great work, you know.
54 |
55 | ##### Interesting links
56 |
57 | 1. Source code for [regression and classification](https://github.com/cbfinn/maml), [reinforcement learning](https://github.com/cbfinn/maml_rl)
58 | 2. [Actor-Mimic: Deep Multitask and Transfer Reinforcement Learning](https://arxiv.org/abs/1511.06342)
59 | 3. [Optimization as a Model for Few-Shot Learning](https://openreview.net/forum?id=rJY0-Kcll)
--------------------------------------------------------------------------------
/papers/1703_maml/a1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Scitator/papers/39f00789d1f762eda7975623d79322a20f323c40/papers/1703_maml/a1.png
--------------------------------------------------------------------------------
/papers/1703_maml/a2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Scitator/papers/39f00789d1f762eda7975623d79322a20f323c40/papers/1703_maml/a2.png
--------------------------------------------------------------------------------
/papers/1703_maml/f1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Scitator/papers/39f00789d1f762eda7975623d79322a20f323c40/papers/1703_maml/f1.png
--------------------------------------------------------------------------------
/papers/1703_maml/f2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Scitator/papers/39f00789d1f762eda7975623d79322a20f323c40/papers/1703_maml/f2.png
--------------------------------------------------------------------------------
/papers/1703_maml/f3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Scitator/papers/39f00789d1f762eda7975623d79322a20f323c40/papers/1703_maml/f3.png
--------------------------------------------------------------------------------
/papers/1703_maml/f5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Scitator/papers/39f00789d1f762eda7975623d79322a20f323c40/papers/1703_maml/f5.png
--------------------------------------------------------------------------------
/papers/1703_maml/t1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Scitator/papers/39f00789d1f762eda7975623d79322a20f323c40/papers/1703_maml/t1.png
--------------------------------------------------------------------------------
/papers/1707_distral.md:
--------------------------------------------------------------------------------
1 | # [Distral: Robust Multitask Reinforcement Learning](https://arxiv.org/abs/1707.04175)
2 |
3 | ##### TLDR
4 |
5 | Presentation – https://www.youtube.com/watch?v=scf7Przmh7c
6 |
7 | ###### How it looks:
8 | 
9 |
10 | ###### Objective:
11 |
12 | 
13 |
14 | ###### Intuition:
15 | We try to learn one common (distilled) policy for several different levels of the game (simulations, etc.). So we add an additional regularisation term (the cKL one) to keep each level-specific policy close to the common one. (A similar idea to the PPO one?)
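
A hedged sketch of what such a per-task loss could look like: a REINFORCE term plus a KL pull towards the shared policy and an entropy bonus. All names and coefficients are illustrative, not the paper's exact objective (that one is in the figure above):

```python
import torch
import torch.nn.functional as F

def distral_task_loss(task_logits, distilled_logits, actions, returns,
                      c_kl=0.5, c_ent=0.01):
    """Policy-gradient loss for one task policy pi_i, regularised towards pi_0."""
    log_pi_i = F.log_softmax(task_logits, dim=-1)
    log_pi_0 = F.log_softmax(distilled_logits, dim=-1).detach()
    logp_a = log_pi_i.gather(-1, actions.unsqueeze(-1)).squeeze(-1)
    pg_loss = -(logp_a * returns).mean()                          # REINFORCE term
    kl = (log_pi_i.exp() * (log_pi_i - log_pi_0)).sum(-1).mean()  # KL(pi_i || pi_0)
    entropy = -(log_pi_i.exp() * log_pi_i).sum(-1).mean()
    return pg_loss + c_kl * kl - c_ent * entropy
```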
--------------------------------------------------------------------------------
/papers/1707_distral/f1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Scitator/papers/39f00789d1f762eda7975623d79322a20f323c40/papers/1707_distral/f1.png
--------------------------------------------------------------------------------
/papers/1707_distral/objective.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Scitator/papers/39f00789d1f762eda7975623d79322a20f323c40/papers/1707_distral/objective.png
--------------------------------------------------------------------------------
/papers/1708_reproducible_rl.md:
--------------------------------------------------------------------------------
1 | # [Reproducibility of Benchmarked Deep Reinforcement Learning Tasks for Continuous Control](https://arxiv.org/abs/1708.04133)
2 |
3 | ##### TLDR
4 |
5 | **Reproducibility** *refers to the* **ability of a researcher to duplicate the results of a prior study** *using the same materials as were used by the original investigator. (...) Reproducibility is a minimum necessary condition for a finding to be believable and informative.* The paper about the most important issue of the DL/RL field.
6 |
7 | - Really interesting to see the gap between the reproduced results and the published ones
8 | - Looks like we also need code review for accepted papers.
9 | - New DL field challenge - **reproducibility**:
10 | - NIPS 2017 - https://nurture.ai/nips-challenge/
11 | - ICLR 2018 - http://www.cs.mcgill.ca/~jpineau/ICLR2018-ReproducibilityChallenge.html
12 | - RL needs standard **well-tuned baseline** algorithms
13 |
14 | ##### Notes
15 |
16 | Main aspects of reproducibility (top 5):
17 |
18 | - code
19 | - dependencies (DL framework version, for example)
20 | - availability of datasets (NDA problem, etc) and simulators ($500 for MuJoCo)
21 | - experimental setup setting (train/test split, random seeds, etc)
22 | - computation resources (the story of Google and 5k gpus)
23 |
24 | How to measure RL agent performance?
25 |
26 | - ~~Run N experiments, take the K best - you are done~~ (but that is how it works now)
27 | - Run N*100 experiments and measure the reward mean and std (for each environment)
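
A tiny sketch of the second option, assuming per-(environment, seed) episode returns have already been collected:

```python
import numpy as np

# rewards[env][seed] -> list of episode returns for that (environment, seed) run
def report(rewards):
    """Aggregate performance across many seeds instead of cherry-picking the best run."""
    for env, per_seed in rewards.items():
        means = np.array([np.mean(r) for r in per_seed.values()])
        print(f"{env}: reward = {means.mean():.1f} +/- {means.std():.1f} "
              f"over {len(means)} seeds")
```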
28 |
29 | ##### Afterword
30 |
31 | Questions:
32 |
33 | - Does Google have `from tensorflow.private import rl_golden_seeds`?
34 | - How can the TRPO'15 implementation be better than the TRPO'17 one?
35 |
36 | ##### Interesting links
37 |
38 | Video: https://vimeo.com/252185490
39 |
40 | Presentation: https://drive.google.com/file/d/1ANKnC-EFDbbPr04xHl7LmEBtD6dKw4vK/view
41 |
42 | Review by [Sergey Ovcharenko](https://github.com/dudevil) (in russian): https://www.youtube.com/watch?v=FVX7qLe-fZ8
--------------------------------------------------------------------------------
/papers/1708_rl_survey.md:
--------------------------------------------------------------------------------
1 | # [A Brief Survey of Deep Reinforcement Learning](https://arxiv.org/abs/1708.05866)
2 |
3 | ##### TLDR - ESSENTIAL READING
4 |
5 | **RL** *is a step towards building autonomous systems with a higher level understanding of the visual world*. **DL** *enables **RL** to scale to decision-making problems that were previously intractable.* Survey about last papers and current challenges in this field.
6 |
7 | - Great article to check your knowledge of RL field or to start to understand it. All necessary theory such as MDP/POMDP and state/qvalue/policy functions included.
8 | - Really cool paper to confirm your PhD research proposals :)
9 | - Lots of interesting links.
10 |
11 | ##### Notes
12 |
13 | MDP:
14 |
15 | - A set of states *S*, plus a distribution of starting states p(s_0).
16 | - A set of actions *A*.
17 | - Transition dynamics *T*(s_{t+1} | s_t, a_t) that map a state-action pair at time t onto a distribution of states at time t + 1.
18 | - An immediate/instantaneous reward function *R*(s_t, a_t, s_{t+1}).
19 | - A discount factor *γ* ∈ [0, 1], where lower values place more emphasis on immediate rewards.
20 |
21 | RL challenges:
22 |
23 | - The optimal policy must be inferred by **trial-and-error interaction** with the environment. The **only learning signal** the agent receives is the **reward**.
24 | - The **observations** of the agent **depend on its actions** and can contain strong temporal correlations.
25 | - Agents must deal with **long-range time dependencies**: Often the consequences of an action only materialise after many transitions of the environment. This is known as the (temporal) credit assignment problem [135].
26 |
27 | Dynamic Programming:
28 |
29 | - **SARSA** - *on-policy*, uses transitions generated by the behavioral policy:
30 |
31 |   Q^π(s_t, a_t) ← Q^π(s_t, a_t) + α [r_t + γ Q^π(s_{t+1}, a_{t+1}) − Q^π(s_t, a_t)]
32 | - **Q-learning** - *off-policy*, directly approximates Q*:
33 |
34 |   Q^π(s_t, a_t) ← Q^π(s_t, a_t) + α [r_t + γ max_a Q^π(s_{t+1}, a) − Q^π(s_t, a_t)]
35 | - To find Q* from an arbitrary Q^π, we use **generalised policy iteration**.
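
The two tabular updates above as a minimal NumPy sketch (`Q` is an array indexed by discrete state and action ids):

```python
import numpy as np

def q_learning_update(Q, s, a, r, s_next, alpha=0.1, gamma=0.99):
    """Off-policy: bootstrap on the greedy action max_a' Q[s', a']."""
    td_target = r + gamma * np.max(Q[s_next])
    Q[s, a] += alpha * (td_target - Q[s, a])
    return Q

def sarsa_update(Q, s, a, r, s_next, a_next, alpha=0.1, gamma=0.99):
    """On-policy: bootstrap on the action a_next actually taken by the behavior policy."""
    td_target = r + gamma * Q[s_next, a_next]
    Q[s, a] += alpha * (td_target - Q[s, a])
    return Q
```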
36 |
37 | Policy Search:
38 |
39 | - Perhaps the greatest advantage of gradient-free policy search is that it can also optimise non-differentiable policies.
40 |
41 | Planning and Learning:
42 |
43 | - **Model-free** RL methods learn directly from interactions with the environment, but **model-based** RL methods can simulate transitions using the learned model,
44 | resulting in **increased sample efficiency**.
45 |
46 | Value functions:
47 |
48 | - The combination of the **duelling DQN with prioritised experience replay** is one of the state-of-the-art techniques in **discrete action** settings.
49 | - **NAF** is one of several state-of-the-art techniques in **continuous control** problems [40].
50 | - **Metz et al.** [81] used this idea in order to construct the sequential DQN,
51 | allowing them to discretise a large action space and **outperform NAF in continuous control problems**.
52 |
53 | Policy Search:
54 |
55 | - Recent work has reignited interest in **evolutionary methods** for RL as they can potentially be distributed at larger scales than techniques that rely on gradients [116].
56 | - The combination of **TRPO** and **GAE** remains one of the state-of-the-art RL techniques in **continuous control.** Also **PPO** (w/ or w/o **GAE**).
57 | - On-policy methods can be more stable, whilst off-policy methods can be more data efficient.
58 |
59 | Current Research and challenges:
60 |
61 | - Hierarchical RL and the discovery and generalisation of goals.
62 | - Imitation learning and Inverse RL: *...generative adversarial imitation learning (GAIL) was later extended to allow IRL to be applied even when receiving expert trajectories from a different visual viewpoint to that of the RL agent [131].*
63 | - Multi-agent RL: *...investigate the effects of learning and sequential decision making in game theory [48, 71].*
64 | - Memory and attention: *...it is possible to add a differentiable memory to the DQN, which allows it to more flexibly process information in its “working memory” [96].*
65 | - Transfer learning.
66 |
67 | ##### Afterword
68 |
69 | Things to think about:
70 |
71 | - Meta learning.
72 | - Multi-agent RL and self-play.
73 | - Learning to remember (memory & attention).
74 | - Imitation learning and Inverse RL & GANs.
75 |
76 | ##### Interesting links
77 |
78 | 5. [Dzmitry Bahdanau, Philemon Brakel, Kelvin Xu, Anirudh Goyal, Ryan Lowe, Joelle Pineau, Aaron Courville, and Yoshua Bengio. An Actor-Critic Algorithm for Sequence Prediction. In ICLR, 2017.](https://arxiv.org/abs/1607.07086)
79 |
80 |
81 | -
82 |
83 | 7. [Nir Baram, Oron Anschel, and Shie Mannor. Model-Based Adversarial Imitation Learning. In NIPS Workshop on Deep Reinforcement Learning, 2016.](https://arxiv.org/abs/1612.02179)
84 |
85 |
86 | -
87 |
88 | 29. [Yan Duan, John Schulman, Xi Chen, Peter L Bartlett, Ilya Sutskever, and Pieter Abbeel. RL2 : Fast Reinforcement Learning via Slow Reinforcement Learning. In NIPS Workshop on Deep Reinforcement Learning, 2016.](https://arxiv.org/abs/1611.02779)
89 |
90 |
91 | -
92 |
93 | 40. [Shixiang Gu, Timothy Lillicrap, Ilya Sutskever, and Sergey Levine. Continuous Deep Q-Learning with Model-Based Acceleration. In ICLR, 2016.](https://arxiv.org/abs/1603.00748)
94 |
95 |
96 | -
97 |
98 |
99 | 40. [Shixiang Gu, Timothy Lillicrap, Zoubin Ghahramani, Richard E Turner, Bernhard Schölkopf, and Sergey Levine. Interpolated Policy Gradient: Merging On-Policy and Off-Policy Gradient Estimation for Deep Reinforcement Learning. In NIPS, 2017.](https://arxiv.org/abs/1706.00387)
100 |
101 |
102 | -
103 |
104 |
105 | 48. [Johannes Heinrich and David Silver. Deep Reinforcement Learning from Self-Play in Imperfect-Information Games. 2016.](https://arxiv.org/abs/1603.01121)
106 |
107 |
108 | -
109 |
110 |
111 | 71. [Joel Z Leibo, Vinicius Zambaldi, Marc Lanctot, Janusz Marecki, and Thore Graepel. Multi-Agent Reinforcement Learning in Sequential Social Dilemmas. In AAMAS, 2017.](https://arxiv.org/abs/1702.03037)
112 |
113 |
114 | -
115 |
116 | 78. [Yuxi Li. Deep Reinforcement Learning: An Overview. arXiv:1701.07274, 2017.](https://arxiv.org/abs/1701.07274)
117 |
118 |
119 | -
120 |
121 |
122 | 81. [Luke Metz, Julian Ibarz, Navdeep Jaitly, and James Davidson. Discrete Sequential Prediction of Continuous Actions for Deep RL. arXiv:1705.05035, 2017.](https://arxiv.org/abs/1705.05035)
123 |
124 |
125 | -
126 |
127 |
128 | 96. [Junhyuk Oh, Valliappa Chockalingam, Satinder Singh, and Honglak Lee. Control of Memory, Active Perception, and Action in Minecraft. In ICLR, 2016.](https://arxiv.org/abs/1605.09128)
129 |
130 |
131 | -
132 |
133 | 98. [Emilio Parisotto and Ruslan Salakhutdinov. Neural Map: Structured Memory for Deep Reinforcement Learning. arXiv:1702.08360, 2017.](https://arxiv.org/abs/1702.08360)
134 |
135 |
136 | -
137 |
138 | 105. [Alexander Pritzel, Benigno Uria, Sriram Srinivasan, Adrià Puigdomènech, Oriol Vinyals, Demis Hassabis, Daan Wierstra, and Charles Blundell. Neural Episodic Control. In ICML, 2017.](https://arxiv.org/abs/1703.01988)
139 |
140 |
141 | -
142 |
143 | 105. [Marc’Aurelio Ranzato, Sumit Chopra, Michael Auli, and Wojciech Zaremba. Sequence Level Training with Recurrent Neural Networks. In ICLR, 2016.](https://arxiv.org/abs/1511.06732)
144 |
145 |
146 | -
147 |
148 | 116. [Tim Salimans, Jonathan Ho, Xi Chen, and Ilya Sutskever. Evolution Strategies as a Scalable Alternative to Reinforcement Learning. arXiv:1703.03864, 2017.](https://arxiv.org/abs/1703.03864)
149 |
150 |
151 | -
152 |
153 | 123. [John Schulman, Philipp Moritz, Sergey Levine, Michael Jordan, and Pieter Abbeel. High-Dimensional Continuous Control using Generalized Advantage Estimation. In ICLR, 2016.](https://arxiv.org/abs/1506.02438)
154 |
155 |
156 | -
157 |
158 | 131. [Bradley C Stadie, Pieter Abbeel, and Ilya Sutskever. Third Person Imitation Learning. In ICLR, 2017.](https://arxiv.org/abs/1703.01703)
159 |
160 |
161 | -
162 |
163 | 150. [Harm Vanseijen and Rich Sutton. A Deeper Look at Planning as Learning from Replay. In ICML, 2015.](http://proceedings.mlr.press/v37/vanseijen15.pdf)
164 |
165 |
166 | -
167 |
168 | 156. [Jane X Wang, Zeb Kurth-Nelson, Dhruva Tirumala, Hubert Soyer, Joel Z Leibo, Rémi Munos, Charles Blundell, Dharshan Kumaran, and Matt Botvinick. Learning to Reinforcement Learn. In CogSci, 2017.](https://arxiv.org/abs/1611.05763)
--------------------------------------------------------------------------------
/papers/1801_soft_ac.md:
--------------------------------------------------------------------------------
1 | # [Soft Actor-Critic: Off-Policy Maximum Entropy Deep Reinforcement Learning with a Stochastic Actor](https://arxiv.org/abs/1801.01290)
2 |
3 | ##### TLDR
4 |
5 | Lots of interesting stuff, but not presented well -> criticism and lots of questions.
6 |
7 | The authors present a new off-policy method. Combining off-policy updates with an actor-critic formulation, they achieve a stable and sample-efficient algorithm for continuous state-action-space environments (looks like the Grail has been found - need to check with an implementation). The main motivation - applying RL to real-world tasks. [ Berkeley's goal? ]
8 |
9 | - quite a simple idea with maximum entropy RL ( ~ just a slightly tuned DDPG )
10 | - looks more **robust** than DDPG
11 | - still can't understand the usage of the additional value net
12 | - can be used with reparametrization trick (performance boost)
13 | - another article with appendix (nice trend, just like it)
14 |
15 | ##### Notes
16 |
17 | ###### Theory
18 |
19 | What do we already have? The typical maximum entropy objective (aka REINFORCE, etc.):
20 |
21 | 
22 |
23 | So, let's reformulate Q-learning a bit:
24 |
25 | 
26 |
27 | additionally, policy change:
28 |
29 | 
30 |
31 | ----
32 |
33 | ###### Practice
34 |
35 | Now, let's use ~~typical~~ q-learning. How to train it all in practice?
36 |
37 | 
38 |
39 | 
40 |
41 | 
42 |
43 |
44 |
45 | And that's all! After that, the ~~typical~~ DDPG works:
46 |
47 |
48 |
49 | 
50 |
51 |
52 |
53 | For the policy, a weighted Gaussian mixture was used:
54 |
55 | 
56 |
57 | UPD: it's possible that the *mixture* of Gaussians was used just for the generality of the approach. It would be interesting to see the learned mixture weights. (It's possible that for the tested envs only one main Gaussian was learned and the others got ~0 weights.)
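
A rough sketch of the three losses above (the version with a separate value net). `policy`, `qf`, `vf`, `vf_target` are assumed network callables and `policy.sample` is assumed to return a reparameterised action plus its log-probability; this follows the figures, not the authors' exact code:

```python
import torch
import torch.nn.functional as F

def sac_losses(batch, policy, qf, vf, vf_target, alpha=1.0, gamma=0.99):
    s, a, r, s_next, done = batch
    # value net target: E_{a~pi}[ Q(s, a) - alpha * log pi(a|s) ]
    a_new, log_pi = policy.sample(s)
    v_target = qf(s, a_new) - alpha * log_pi
    vf_loss = F.mse_loss(vf(s), v_target.detach())
    # Q net target: r + gamma * V_target(s')
    q_target = r + gamma * (1 - done) * vf_target(s_next)
    qf_loss = F.mse_loss(qf(s, a), q_target.detach())
    # policy loss via the reparameterisation trick
    policy_loss = (alpha * log_pi - qf(s, a_new)).mean()
    return vf_loss, qf_loss, policy_loss
```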
58 |
59 | ---
60 |
61 | ###### Results
62 |
63 | SAC vs SOTAs: rewards on different environments
64 |
65 | 
66 |
67 |
68 |
69 | Stability check: reward curves for different seed values
70 |
71 | 
72 |
73 | Improvements value
74 |
75 | 
76 |
77 | And hyperparameters value
78 |
79 | 
80 |
81 |
82 |
83 | ##### Afterword
84 |
85 | - A bit hacky theory - it's all about the tabular setting, but in practice DL is used.
86 | - Still have questions about the value function approximation usage - I don't see any training stabilisation in the plots.
87 | - Why do the authors use a Gaussian mixture as the policy? It is worse than just one Gaussian, by their own plot! [?!]
88 | - Nevertheless, it looks like interesting low-cost improvement of the DDPG.
89 |
90 | ##### Interesting links
91 |
92 | P. Henderson, R. Islam, P. Bachman, J. Pineau, D. Precup, and D. Meger. [Deep reinforcement learning that matters](https://arxiv.org/abs/1709.06560). arXiv preprint arXiv:1709.06560, 2017.
93 |
94 | T. Haarnoja, H. Tang, P. Abbeel, and S. Levine. [Reinforcement learning with deep energy-based policies](https://arxiv.org/abs/1702.08165). arXiv preprint arXiv:1702.08165, 2017.
95 |
96 | ---
97 |
98 | [NIPS video](https://vimeo.com/252185258)
99 |
100 | Review by [Aleksey Grinchuk](https://github.com/AlexGrinch) (in russian): https://www.youtube.com/watch?v=NiTJOw1aST4
--------------------------------------------------------------------------------
/papers/1801_soft_ac/algorithm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Scitator/papers/39f00789d1f762eda7975623d79322a20f323c40/papers/1801_soft_ac/algorithm.png
--------------------------------------------------------------------------------
/papers/1801_soft_ac/f1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Scitator/papers/39f00789d1f762eda7975623d79322a20f323c40/papers/1801_soft_ac/f1.png
--------------------------------------------------------------------------------
/papers/1801_soft_ac/policy.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Scitator/papers/39f00789d1f762eda7975623d79322a20f323c40/papers/1801_soft_ac/policy.png
--------------------------------------------------------------------------------
/papers/1801_soft_ac/policy_update.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Scitator/papers/39f00789d1f762eda7975623d79322a20f323c40/papers/1801_soft_ac/policy_update.png
--------------------------------------------------------------------------------
/papers/1801_soft_ac/qvalue_update.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Scitator/papers/39f00789d1f762eda7975623d79322a20f323c40/papers/1801_soft_ac/qvalue_update.png
--------------------------------------------------------------------------------
/papers/1801_soft_ac/results_envs.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Scitator/papers/39f00789d1f762eda7975623d79322a20f323c40/papers/1801_soft_ac/results_envs.png
--------------------------------------------------------------------------------
/papers/1801_soft_ac/results_hyperparams.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Scitator/papers/39f00789d1f762eda7975623d79322a20f323c40/papers/1801_soft_ac/results_hyperparams.png
--------------------------------------------------------------------------------
/papers/1801_soft_ac/results_improvements.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Scitator/papers/39f00789d1f762eda7975623d79322a20f323c40/papers/1801_soft_ac/results_improvements.png
--------------------------------------------------------------------------------
/papers/1801_soft_ac/results_seeds.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Scitator/papers/39f00789d1f762eda7975623d79322a20f323c40/papers/1801_soft_ac/results_seeds.png
--------------------------------------------------------------------------------
/papers/1801_soft_ac/soft_policy.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Scitator/papers/39f00789d1f762eda7975623d79322a20f323c40/papers/1801_soft_ac/soft_policy.png
--------------------------------------------------------------------------------
/papers/1801_soft_ac/soft_theory.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Scitator/papers/39f00789d1f762eda7975623d79322a20f323c40/papers/1801_soft_ac/soft_theory.png
--------------------------------------------------------------------------------
/papers/1801_soft_ac/value_update.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Scitator/papers/39f00789d1f762eda7975623d79322a20f323c40/papers/1801_soft_ac/value_update.png
--------------------------------------------------------------------------------
/papers/1802_me_trpo.md:
--------------------------------------------------------------------------------
1 | # [Model-Ensemble Trust-Region Policy Optimization](https://arxiv.org/abs/1802.10592)
2 |
3 | ##### TLDR
4 |
5 | Model-free RL methods are easy to set up but have very high sample complexity, so they can't really be applied to real-world problems. Model-based methods have low sample complexity but need careful tuning for each new environment. The paper proposes an approach that combines them into ME-TRPO, to get easy learning with low sample complexity.
6 |
7 | - paper with appendix
8 | - interesting solution for real-world problems or envs with low fps
9 |
10 | ##### Notes
11 |
12 | *Vanilla* model-based RL:
13 |
14 | - model learning
15 |   - collect samples from the environment
16 |   - fit a dynamics model to the observations
17 | - policy optimization stage
18 |   - search for an optimal policy in the learned dynamics model
19 |
20 | Problems
21 |
22 | - fails/overfits if there is not enough data from the real environment
23 | - SL tricks don't work because the data is not i.i.d.
24 |
25 | Proposed solution
26 |
27 | - use an ensemble of pseudo-env models (aka a bootstrap over the real env data?) - a minimal sketch below
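
A minimal sketch of that ensemble-of-dynamics-models idea (architecture, sizes and the bootstrap resampling are illustrative, not the paper's exact setup):

```python
import torch
import torch.nn as nn

def make_dynamics_ensemble(obs_dim, act_dim, n_models=5, hidden=64):
    """Ensemble of pseudo-env models: each one predicts s_{t+1} from (s_t, a_t)."""
    return [nn.Sequential(nn.Linear(obs_dim + act_dim, hidden), nn.ReLU(),
                          nn.Linear(hidden, obs_dim)) for _ in range(n_models)]

def fit_ensemble(models, states, actions, next_states, epochs=10, lr=1e-3):
    """Each model gets its own bootstrap resample of the real-env data."""
    n = states.shape[0]
    for model in models:
        idx = torch.randint(0, n, (n,))                 # bootstrap sample
        opt = torch.optim.Adam(model.parameters(), lr=lr)
        for _ in range(epochs):
            pred = model(torch.cat([states[idx], actions[idx]], dim=-1))
            loss = ((pred - next_states[idx]) ** 2).mean()
            opt.zero_grad()
            loss.backward()
            opt.step()
    return models
```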
28 |
29 | ---
30 |
31 | Algorithms
32 |
33 | 
34 |
35 | 
36 |
37 | Tricks
38 |
39 | - policy validation - keep improving the policy only while it beats the old one on at least N% of the pseudo-envs (stop otherwise)
40 |
41 | ---
42 |
43 | Comparison with SOTAs
44 |
45 | 
46 |
47 | Effect of the number of pseudo-envs
48 |
49 | 
50 |
51 | ---
52 |
53 | Future work
54 |
55 | - use ME-TRPO to explore difficult states where ME disagree
56 | - apply for real-world robotics
57 |
58 | ---
59 |
60 | Appendix
61 |
62 | - for each env we need to specify pseudo-env reward function
63 |
64 | Training process
65 |
66 | 
67 |
68 | *what about the policy validation trick?*
69 |
70 | ##### Afterword
71 |
72 | Questions
73 |
74 | - What about training time? -> Appendix (1h for Swimmer, 5d for Humanoid)
75 | - How many samples from pseudo-envs does ME-TRPO need?
76 | - Can we replace TRPO with REINFORCE/PPO/other-method?
77 | - What about non-MuJoCo envs? [Yeap, NIPS: Learning to Run env]
78 |
79 | ##### Interesting links
80 |
81 | 1. Stefan Depeweg, José Miguel Hernández-Lobato, Finale Doshi-Velez, and Steffen Udluft. [Learning and policy search in stochastic dynamical systems with bayesian neural networks](https://arxiv.org/abs/1605.07127). In International Conference on Learning Representations (ICLR2017), 2017.
82 | 2. Nikhil Mishra, Pieter Abbeel, and Igor Mordatch. [Prediction and control with temporal segment models](https://arxiv.org/abs/1703.04070). arXiv preprint arXiv:1703.04070, 2017.
83 |
84 |
85 |
86 | Review by [cydoroga](tpg.cydoroga@gmail.com) (in russian): https://youtu.be/nDsDzADmSzk
--------------------------------------------------------------------------------
/papers/1802_me_trpo/a1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Scitator/papers/39f00789d1f762eda7975623d79322a20f323c40/papers/1802_me_trpo/a1.png
--------------------------------------------------------------------------------
/papers/1802_me_trpo/a2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Scitator/papers/39f00789d1f762eda7975623d79322a20f323c40/papers/1802_me_trpo/a2.png
--------------------------------------------------------------------------------
/papers/1802_me_trpo/f2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Scitator/papers/39f00789d1f762eda7975623d79322a20f323c40/papers/1802_me_trpo/f2.png
--------------------------------------------------------------------------------
/papers/1802_me_trpo/f4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Scitator/papers/39f00789d1f762eda7975623d79322a20f323c40/papers/1802_me_trpo/f4.png
--------------------------------------------------------------------------------
/papers/1802_me_trpo/f5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Scitator/papers/39f00789d1f762eda7975623d79322a20f323c40/papers/1802_me_trpo/f5.png
--------------------------------------------------------------------------------
/papers/1803_behavioral_cloning.md:
--------------------------------------------------------------------------------
1 | # [Generative Multi-Agent Behavioral Cloning](https://arxiv.org/abs/1803.07612)
2 |
3 | ##### TLDR [Imitation Learning]
4 |
5 | Sometimes all we have is pre-collected demonstration data, from which we should learn an optimal policy. Nevertheless, things can go wrong if we want our agent to be creative. Everything becomes even more complex when we have many agents and each of them has to coordinate with the others. To solve all these problems, the authors suggest a combination of a VRNN (an RNN with a VAE) with an additional goal-oriented RNN.
6 |
7 | - article with [source code](https://github.com/ezhan94/gen-MA-BC) and [demo](http://basketball-ai.com)
8 |
9 | 
10 |
11 | ##### Notes
12 |
13 | ###### Generative multi-agent behavioral cloning
14 |
15 | - *simplest* form of imitation learning
16 | - desired policy does not necessarily perfectly mimic the demonstrations, but rather recover the generating distribution from which the demonstrations were sampled
17 | - we also need our agent to behave coherently over long time horizons
18 | - multimodal agents
19 |
20 | ###### *VRNNs
21 |
22 | 
23 |
24 | ###### Notation
25 |
26 | 
27 |
28 | ###### Learning Objective
29 |
30 | Because of the problem formulation, we can use several tricks - instead of predicting actions, we can predict the next state or the probability of the next state.
31 |
32 | 
33 |
34 | And with respect to multi-agent:
35 |
36 | 
37 |
38 | ###### Model
39 |
40 | Additional assumption - macro-goals:
41 | - provide a tractable way to capture coordination between agents
42 | - encode long-term intents of the agents and enable long-term planning at a higher-level timescale.
43 |
44 | 
45 |
46 | 
47 |
48 | ###### Experiments & Results
49 |
50 | Baselines
51 |
52 | 
53 |
54 | Comparison
55 |
56 | 
57 |
58 | Cooperation
59 |
60 | 
61 |
62 | Multimodal behavior
63 |
64 | 
65 |
66 | ##### Afterword
67 |
68 | - The ball is a lie
69 | - As for me, setup with RNN over VRNN looks too complex
70 | - What about CNN/TCN/Attention-like and GAN usage instead of RNNs for generation?
71 | - Would be interesting to see this approach for music creation (instruments as agents)
72 | - Can this approach be applied to continuous-action-space environments? (For this problem I see only a finite number of states and goals.)
73 |
74 | ##### Interesting links
75 |
76 | 1. [A Recurrent Latent Variable Model for Sequential Data](https://arxiv.org/abs/1506.02216)
77 |
78 | Review (in russian): https://youtu.be/L0vM1Z0YL5g
--------------------------------------------------------------------------------
/papers/1803_behavioral_cloning/baselines.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Scitator/papers/39f00789d1f762eda7975623d79322a20f323c40/papers/1803_behavioral_cloning/baselines.png
--------------------------------------------------------------------------------
/papers/1803_behavioral_cloning/cooperation.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Scitator/papers/39f00789d1f762eda7975623d79322a20f323c40/papers/1803_behavioral_cloning/cooperation.png
--------------------------------------------------------------------------------
/papers/1803_behavioral_cloning/f3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Scitator/papers/39f00789d1f762eda7975623d79322a20f323c40/papers/1803_behavioral_cloning/f3.png
--------------------------------------------------------------------------------
/papers/1803_behavioral_cloning/macro_goals.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Scitator/papers/39f00789d1f762eda7975623d79322a20f323c40/papers/1803_behavioral_cloning/macro_goals.png
--------------------------------------------------------------------------------
/papers/1803_behavioral_cloning/multimodal.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Scitator/papers/39f00789d1f762eda7975623d79322a20f323c40/papers/1803_behavioral_cloning/multimodal.png
--------------------------------------------------------------------------------
/papers/1803_behavioral_cloning/notation.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Scitator/papers/39f00789d1f762eda7975623d79322a20f323c40/papers/1803_behavioral_cloning/notation.png
--------------------------------------------------------------------------------
/papers/1803_behavioral_cloning/objectives.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Scitator/papers/39f00789d1f762eda7975623d79322a20f323c40/papers/1803_behavioral_cloning/objectives.png
--------------------------------------------------------------------------------
/papers/1803_behavioral_cloning/objectives2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Scitator/papers/39f00789d1f762eda7975623d79322a20f323c40/papers/1803_behavioral_cloning/objectives2.png
--------------------------------------------------------------------------------
/papers/1803_behavioral_cloning/s1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Scitator/papers/39f00789d1f762eda7975623d79322a20f323c40/papers/1803_behavioral_cloning/s1.png
--------------------------------------------------------------------------------
/papers/1803_behavioral_cloning/t2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Scitator/papers/39f00789d1f762eda7975623d79322a20f323c40/papers/1803_behavioral_cloning/t2.png
--------------------------------------------------------------------------------
/papers/1803_behavioral_cloning/vrnn.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Scitator/papers/39f00789d1f762eda7975623d79322a20f323c40/papers/1803_behavioral_cloning/vrnn.png
--------------------------------------------------------------------------------
/papers/1803_cnn_vs_rnn.md:
--------------------------------------------------------------------------------
1 | # [An Empirical Evaluation of Generic Convolutional and Recurrent Networks for Sequence Modeling](https://arxiv.org/abs/1803.01271)
2 |
3 | ##### TLDR
4 |
5 | As we know, RNNs are commonly used for sequential data modeling in the DL field. Baseline LSTM-like architectures have been the standard approach for this task. Nevertheless, time goes fast and nowadays we see a lot of experiments with CNN and Attention models for sequential data processing. With this in mind, the authors present a systematic comparison between CNNs (more concretely, TCNs - Temporal Convolutional Networks) and RNN architectures on a variety of tasks.
6 |
7 | - appendix with all interesting stuff and hyperparameters
8 | - article with [source code](https://github.com/locuslab/TCN) - great! Pytorch - great x2! clear code implementation - great x200!
9 | - article from the 1st-year PhD student - really cool achievement (article, appendix, code - all best practices for DL research, as for me)
10 |
11 | ##### Notes
12 |
13 | After LSTM came out, there were only a few significant improvements in applying RNNs to sequence modeling. So the DL community started to combine CNN and RNN architectures: Quasi-RNN, Dilated RNN. Nevertheless, the CNN-only approach was not fully investigated.
14 |
15 | Intro to [TCNs](https://arxiv.org/abs/1611.05267):
16 | - the convolutions in the architecture are causal, meaning that there is no information “leakage” from future to past
17 | - the architecture can take a sequence of any length and map it to an output sequence of the same length, just as with an RNN.
18 | - much simpler than WaveNet (no skip connections across layers, conditioning, context stacking, or gated activations)
19 |
20 |
21 | 
22 |
23 | Advantages:
24 |
25 | - parallelism
26 | - flexible receptive field size
27 |
28 |
29 | - stable gradients
30 | - low memory requirement for training
31 | - variable length inputs
32 |
33 | Disadvantages:
34 |
35 | - data storage during evaluation (production)
36 | - potential parameter change for a transfer of domain (as for me, RNNs still have such problems)
37 |
38 | ###### Comparison
39 |
40 | 
41 |
42 | Interesting fact - grid search was used to find a good set of hyperparameters for the recurrent models.
43 |
44 | 
45 |
46 | ###### Improvements
47 |
48 | 
49 |
50 | 
51 |
52 | ##### Afterword
53 |
54 | - If you know the length distribution of your data, it's time to try TCNs for your problem (as for me, pure dynamic lengths are almost never used in practical applications)
55 | - It would be interesting to see experiments with pretrained W2V embeddings for the LM tasks
56 | - Given the TCN advantages, they can be a real competitor to RNNs on these tasks.
57 |
58 | ##### Interesting links
59 |
60 | - [Temporal Convolutional Networks for Action Segmentation and Detection](https://arxiv.org/abs/1611.05267)
61 | - [source code](https://github.com/locuslab/TCN)
62 |
63 |
--------------------------------------------------------------------------------
/papers/1803_cnn_vs_rnn/f1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Scitator/papers/39f00789d1f762eda7975623d79322a20f323c40/papers/1803_cnn_vs_rnn/f1.png
--------------------------------------------------------------------------------
/papers/1803_cnn_vs_rnn/f6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Scitator/papers/39f00789d1f762eda7975623d79322a20f323c40/papers/1803_cnn_vs_rnn/f6.png
--------------------------------------------------------------------------------
/papers/1803_cnn_vs_rnn/t1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Scitator/papers/39f00789d1f762eda7975623d79322a20f323c40/papers/1803_cnn_vs_rnn/t1.png
--------------------------------------------------------------------------------
/papers/1803_cnn_vs_rnn/t4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Scitator/papers/39f00789d1f762eda7975623d79322a20f323c40/papers/1803_cnn_vs_rnn/t4.png
--------------------------------------------------------------------------------
/papers/1803_cnn_vs_rnn/t5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Scitator/papers/39f00789d1f762eda7975623d79322a20f323c40/papers/1803_cnn_vs_rnn/t5.png
--------------------------------------------------------------------------------
/papers/1803_smith_part1.md:
--------------------------------------------------------------------------------
1 | # [A disciplined approach to neural network hyper-parameters: Part 1 -- learning rate, batch size, momentum, and weight decay](https://arxiv.org/abs/1803.09820)
2 |
3 | ##### TLDR
4 |
5 | 
6 |
7 | 
8 |
9 | ##### Interesting links
10 |
11 | code - https://github.com/lnsmith54/hyperParam1
--------------------------------------------------------------------------------
/papers/1803_smith_part1/p1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Scitator/papers/39f00789d1f762eda7975623d79322a20f323c40/papers/1803_smith_part1/p1.png
--------------------------------------------------------------------------------
/papers/1803_smith_part1/p2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Scitator/papers/39f00789d1f762eda7975623d79322a20f323c40/papers/1803_smith_part1/p2.png
--------------------------------------------------------------------------------
/papers/1803_sptm.md:
--------------------------------------------------------------------------------
1 | # [Semi-parametric Topological Memory for Navigation](https://arxiv.org/abs/1803.00653)
2 |
3 | ##### TLDR
4 |
5 | Problem - navigation in novel unseen environments. Idea - let's build a topological (not metric, as usual) map of the environment and use it to confidently navigate towards goals. The map graph is built from trajectories of a random agent walk. Inference and planning are done by associating the current observation with a graph vertex (a kNN approach), then Dijkstra's algorithm finds a path to the goal vertex, and finally a locomotion network takes an action to reach the next subgoal (the next vertex towards the goal on the Dijkstra path).
6 |
7 | - new approach, no grids, self-built graphs - that's good
8 | - the graph is based on a random agent - that's bad (even the authors write about it); nevertheless, this looks like only the first article of a future research line, so maybe next year we will see better approaches to environment graph building
9 | - weak baseline, or not fully tuned (again, the authors point this out themselves)
10 | - but still, the main idea is excellent!
11 | - [project page](https://sites.google.com/view/SPTM)
12 | - [source code](https://github.com/nsavinov/SPTM)
13 |
14 | ##### Notes
15 |
16 | So, a bit more concrete:
17 |
18 | 1. Firstly, we send a random agent into the new environment to generate samples and trajectories. This process ends on a time or frame limit.
19 | 2. Then, with some heuristics and siamese networks, we create our environment graph.
20 | 3. and build the *Retrieval* network (to compare different observations and quickly find the current position in the graph).
21 | 4. After that, there is a phase of *Locomotion* network training, which uses the collected samples and pseudo-goals to learn how to navigate through the graph.
22 | 5. Now that all is done, we can do inference (a rough sketch follows this list):
23 |    1. from the goal and current observations, the *Retrieval* network gives us graph vertices.
24 |    2. then Dijkstra comes in: we plan our route and select the next subgoal - a waypoint state.
25 |    3. finally, the *Locomotion* network tries to predict how to get to this waypoint state.
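
A rough sketch of that inference loop (all helper names - `retrieval_net`, `locomotion_net`, `node_obs` - are illustrative placeholders, not the authors' code):

```python
import networkx as nx

def sptm_act(obs, goal_obs, graph, node_obs, retrieval_net, locomotion_net, k=5):
    """Localise the current and goal observations in the graph, plan with Dijkstra,
    and let the locomotion net output an action towards the next waypoint."""
    def localise(o):
        # kNN-style lookup in "retrieval similarity" space -> closest graph vertex
        scores = {v: retrieval_net(o, node_obs[v]).item() for v in graph.nodes}
        return max(scores, key=scores.get)

    v_cur, v_goal = localise(obs), localise(goal_obs)
    path = nx.dijkstra_path(graph, v_cur, v_goal)        # high-level plan
    waypoint = path[min(k, len(path) - 1)]               # subgoal a few vertices ahead
    return locomotion_net(obs, node_obs[waypoint])       # low-level action
```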
26 |
27 | 
28 |
29 | 
30 |
31 | ###### Comparison with RL baselines
32 |
33 | Yeap, a non-RL approach (all nets are trained semi-supervised) vs the RL approaches.
34 |
35 | 
36 |
37 | ##### Afterword
38 |
39 | In the end, this is a new look at the problem of navigation, but something confuses me.
40 |
41 | - what happens in the case of a large environment?
42 | - the kNN usage looks like a bit of a hack to me. 5 minutes of a random agent was enough to just overfit to every test environment. kNN and Dijkstra after that just give us a typical graph search?
43 |
44 | But what is really cool is this split into high-level planning with the graph and Dijkstra, and local planning with *Locomotion*. A really interesting approach.
45 |
46 | ##### Interesting links
47 |
48 | - [project page](https://sites.google.com/view/SPTM)
49 | - [source code](https://github.com/nsavinov/SPTM)
--------------------------------------------------------------------------------
/papers/1803_sptm/f1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Scitator/papers/39f00789d1f762eda7975623d79322a20f323c40/papers/1803_sptm/f1.png
--------------------------------------------------------------------------------
/papers/1803_sptm/f2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Scitator/papers/39f00789d1f762eda7975623d79322a20f323c40/papers/1803_sptm/f2.png
--------------------------------------------------------------------------------
/papers/1803_sptm/t1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Scitator/papers/39f00789d1f762eda7975623d79322a20f323c40/papers/1803_sptm/t1.png
--------------------------------------------------------------------------------
/papers/1803_world_models.md:
--------------------------------------------------------------------------------
1 | # [World Models](https://arxiv.org/abs/1803.10122)
2 |
3 | ##### TLDR
4 |
5 | A really interesting view on model-based methods with a logical split of the model into parts (world model, memory, controller).
6 |
7 | - exciting demo, lots of ideas, much to think about.
8 | - VAE for environment modelling - nice, waiting for an opportunity to test this idea on a sample-inefficient environment.
9 | - RNN trained on latent representations - so we can sample environment trajectories from the model itself
10 | - quick and simple controller for action selection
11 |
12 | ##### Notes
13 |
14 | So, the idea behind the paper is quite simple (and I would say it takes some inspiration from how humans think).
15 |
16 | Firstly, let's look at learning environment representations from another point of view. It's common to encode observations with respect to action selection and reward maximization. But we can simply learn a compact state representation with an AE or, as the authors suggest, a VAE. This way, we encode not only the information needed for action selection - we compress all the information from the original observation. So, through the autoencoding process we obtain the **world model**.
17 |
18 | Okay, we can encode each environment state into some compact representation. But what about transitions? In every environment there is some probability of getting to state *s_{t+1}* from state *s_t*. For this purpose the **memory** model exists. The authors use an RNN with a mixture of Gaussians (an MDN-RNN) to model *P(z_{t+1} | a_t, z_t, h_t)*, where *z* is our encoded state from the world model.
19 |
20 | After that, we only need the **controller** model to select an action from the current *z_t* (and the memory's hidden state *h_t*).
21 |
22 | A bit more about the models in the figures below.
23 |
24 | 
25 |
26 | 
27 |
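Putting the three parts together at rollout time looks roughly like the sketch below. This is my own pseudocode, not the authors' implementation; `vae.encode`, `mdn_rnn.initial_state`/`mdn_rnn.step` and the flat controller weights `W_c`, `b_c` are assumed interfaces.

```python
import numpy as np

def rollout(env, vae, mdn_rnn, W_c, b_c, max_steps=1000):
    """One episode with the V (VAE), M (MDN-RNN) and C (linear controller) split."""
    obs = env.reset()
    h = mdn_rnn.initial_state()                 # memory model hidden state (flat vector here)
    total_reward = 0.0
    for _ in range(max_steps):
        z = vae.encode(obs)                      # world model: compress observation to latent z_t
        a = W_c @ np.concatenate([z, h]) + b_c   # controller: a_t = W_c [z_t, h_t] + b_c
        obs, reward, done, _ = env.step(a)       # old gym-style 4-tuple API
        h = mdn_rnn.step(z, a, h)                # memory: h_{t+1} from (z_t, a_t, h_t)
        total_reward += reward
        if done:
            break
    return total_reward
```
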
28 | All looks good, but we want something more. We want to learn in dreams and transfer this knowledge to the real world (okay, real simulations). Can we do this? Yes, we can!
29 |
30 | It is a bit of a cheat: the authors use an environment whose main goal is to survive as long as you can. This way, they don't need to simulate the reward function, only the *done* signal. Nevertheless, the pipeline looks very similar to the previous one.
31 |
32 | 
33 |
34 | Unfortunately, something can go wrong - the agent can learn a policy that cheats in the dream environment, so we need to make the dream more stochastic and hardcore. And it works:
35 |
36 | 
37 |
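Concretely, "making the dream more stochastic" boils down to a temperature parameter when sampling the next latent from the MDN-RNN. A hedged sketch of that sampling step, with mixture logits `pi`, means `mu` and scales `sigma` as assumed inputs:

```python
import numpy as np

def sample_next_z(pi, mu, sigma, tau=1.0, rng=np.random):
    """Sample z_{t+1} from the mixture of Gaussians predicted by the MDN-RNN.

    pi: (K,) mixture logits, mu/sigma: (K, dim_z). Larger tau makes the dream
    noisier and harder for the controller to exploit.
    """
    logits = pi / tau                              # temperature on the mixture weights
    probs = np.exp(logits - logits.max())
    probs /= probs.sum()
    k = rng.choice(len(probs), p=probs)            # pick a mixture component
    noise = rng.randn(*mu[k].shape) * np.sqrt(tau) # extra noise inside the component
    return mu[k] + sigma[k] * noise
```
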
38 | By the way, both of these experiments beat the Gym leaderboard for their environments. So, take a look at the ideas.
39 |
40 | ##### Afterword
41 |
42 | While reading this article I came to an idea: if we can simulate *done* for the environment, can we also simulate some advantage-like reward function? Something like a measure of the usefulness of a transition from one state to another?
43 |
44 | As for me, it is quite interesting to see more and more approaches to transferring simulation-learned knowledge to the real world.
45 |
46 | ##### Interesting links
47 |
48 | Demo - [worldmodels.github.io](https://worldmodels.github.io/)
--------------------------------------------------------------------------------
/papers/1803_world_models/car_exp1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Scitator/papers/39f00789d1f762eda7975623d79322a20f323c40/papers/1803_world_models/car_exp1.png
--------------------------------------------------------------------------------
/papers/1803_world_models/cover_exp1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Scitator/papers/39f00789d1f762eda7975623d79322a20f323c40/papers/1803_world_models/cover_exp1.png
--------------------------------------------------------------------------------
/papers/1803_world_models/f4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Scitator/papers/39f00789d1f762eda7975623d79322a20f323c40/papers/1803_world_models/f4.png
--------------------------------------------------------------------------------
/papers/1803_world_models/t2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Scitator/papers/39f00789d1f762eda7975623d79322a20f323c40/papers/1803_world_models/t2.png
--------------------------------------------------------------------------------
/papers/1804_dora.md:
--------------------------------------------------------------------------------
1 | # [DORA The Explorer: Directed Outreaching Reinforcement Action-Selection](https://arxiv.org/abs/1804.04012)
2 |
3 | ##### TLDR - perfect article naming
4 |
5 | When it comes to exploration in RL, it's common to use additional heuristics such as *counters*. Nevertheless, there are a number of disadvantages to simple "state" counters. So, the authors propose E-values, a generalization of counters over state-action trajectories. The idea looks quite interesting and can be applied to continuous MDPs.
6 |
7 | - again, a lot of details in the appendix
8 | - tricky theory
9 | - only two practical results: MountainCar and Freeway
10 |
11 | ##### Notes
12 |
13 | Basic intuition behind counters: for each state in our MDP, let's define an additional *parameter* - a counter *C_s*. When our agent comes to state *s*, we increment the *C_s* counter. Returning to exploration - we assume that less visited states (those with low *C_s* values) are more interesting for exploration purposes. A minimal code sketch of this classic trick follows below.
14 |
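For reference, the classic counter trick is just an intrinsic bonus that decays with visits; a minimal tabular sketch (the 1/sqrt(C_s) form is a common convention, not something specific to this paper):

```python
from collections import defaultdict

class CountBonus:
    """Classic count-based exploration bonus for tabular states."""
    def __init__(self, beta=0.1):
        self.beta = beta
        self.counts = defaultdict(int)   # C_s for every visited state

    def __call__(self, state):
        self.counts[state] += 1
        return self.beta / self.counts[state] ** 0.5  # bonus shrinks as C_s grows
```
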
15 | So, what can go wrong? Let's check an example:
16 |
17 | 
18 |
19 | Here we can see some intermediate "parent" state *s0* with *k* "child" states. The problem: we can visit the "parent" state *s0* a lot of times without ever visiting all of its *k* "child" states. From some "grandparent" state *s*, state *s0* then looks highly explored, which is completely wrong.
20 |
21 | ---
22 |
23 | So, can we do better?
24 |
25 | Looks like yes. Let's look at exploration from a "reward" point of view. For each state-action pair we can define a function *E(s_t, a_t)* that captures some kind of exploration rate for this state-action pair. Just like a value function, this E function can be learned with a discount factor and the SARSA algorithm. Long story short, a shortcut from the article with the main idea:
26 |
27 | 
28 |
29 | How can it help?
30 |
31 | 
32 |
33 | Here we assume that *gamma_E = 0*, so *E(s, a) = (1 - alpha)^n* and hence *n = log_{1-alpha} E(s, a)*. As we can see, the counter for the parent state increases monotonically with the number of cycles (each cycle visiting all *k* "child" states).
34 |
35 | After that, the authors propose the logarithm of *E*-values as a generalization of counters and replace the standard counter with its corresponding generalized counter (*log_{1-alpha} E*). The update itself is sketched below.
36 |
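In code, the E-value update is just SARSA with a zero "reward" and values initialised to 1, and the generalized counter is its logarithm; a minimal tabular sketch of my reading of the paper:

```python
import numpy as np

def update_e_values(E, s, a, s_next, a_next, alpha=0.1, gamma_e=0.9):
    """SARSA-style E-value update with zero 'reward'; E is initialised to 1 everywhere."""
    E[s, a] = (1 - alpha) * E[s, a] + alpha * gamma_e * E[s_next, a_next]
    return E

def generalized_counter(E, s, a, alpha=0.1):
    """Counter equivalent of an E-value: n = log_{1-alpha} E(s, a)."""
    return np.log(E[s, a]) / np.log(1 - alpha)
```
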
37 | ---
38 |
39 | How to apply?
40 |
41 | It's a bit tricky, but we use some additional assumptions.
42 |
43 | 
44 |
45 | 
46 |
47 | Now, main algorithm looks quite simple:
48 |
49 | 
50 |
51 | ---
52 |
53 | But where are the neural nets and all kinds of approximations? Finally, a bit of MountainCar (appendix only) and Freeway experiments.
54 |
55 | 
56 |
57 | 
58 |
59 | ##### Afterword
60 |
61 | The idea looks really interesting and reasonable for *finite* MDPs, but...
62 |
63 | - It would be interesting to see a Figure 1 example where the cycle consists of visiting **not** all of the leaves.
64 | - Not much info about practical approximation and neural network usage for continuous MDPs. As for me, that's the most interesting part.
65 |
66 | ##### Interesting links
67 |
68 | 1. Marc G. Bellemare, Sriram Srinivasan, Georg Ostrovski, Tom Schaul, David Saxton, Remi Munos. [Unifying Count-Based Exploration and Intrinsic Motivation](https://arxiv.org/abs/1606.01868)
69 |
70 | Implementation - https://github.com/nathanwang000/deep_exploration_with_E_network
71 |
72 | Review by [VakhrameevaLiza](https://github.com/VakhrameevaLiza) (in Russian): https://www.youtube.com/watch?v=WEg-Cd0y6wQ
--------------------------------------------------------------------------------
/papers/1804_dora/a1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Scitator/papers/39f00789d1f762eda7975623d79322a20f323c40/papers/1804_dora/a1.png
--------------------------------------------------------------------------------
/papers/1804_dora/e_values.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Scitator/papers/39f00789d1f762eda7975623d79322a20f323c40/papers/1804_dora/e_values.png
--------------------------------------------------------------------------------
/papers/1804_dora/example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Scitator/papers/39f00789d1f762eda7975623d79322a20f323c40/papers/1804_dora/example.png
--------------------------------------------------------------------------------
/papers/1804_dora/f1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Scitator/papers/39f00789d1f762eda7975623d79322a20f323c40/papers/1804_dora/f1.png
--------------------------------------------------------------------------------
/papers/1804_dora/f13.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Scitator/papers/39f00789d1f762eda7975623d79322a20f323c40/papers/1804_dora/f13.png
--------------------------------------------------------------------------------
/papers/1804_dora/f6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Scitator/papers/39f00789d1f762eda7975623d79322a20f323c40/papers/1804_dora/f6.png
--------------------------------------------------------------------------------
/papers/1804_dora/t1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Scitator/papers/39f00789d1f762eda7975623d79322a20f323c40/papers/1804_dora/t1.png
--------------------------------------------------------------------------------
/papers/1804_dora/t2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Scitator/papers/39f00789d1f762eda7975623d79322a20f323c40/papers/1804_dora/t2.png
--------------------------------------------------------------------------------
/papers/1804_gotta_learn_fast.md:
--------------------------------------------------------------------------------
1 | # [Gotta Learn Fast: A New Benchmark for Generalization in RL](https://arxiv.org/abs/1804.03720)
2 |
3 | ##### TLDR
4 |
5 | Why are we so in love with RL? Because we can train on the test set, of course. But those times are over. OpenAI researchers decided to create a proper benchmark environment with a train/test split of Sonic game levels. Baselines included.
6 |
7 | - cool new environment - competitive, requires level memorization by the agent
8 | - finally, we have train/test split in RL (mixed feelings still)
9 | - appendix included
10 |
11 | ##### Notes
12 |
13 | ###### Environment
14 |
15 | So, a bit more about new environment.
16 |
17 | 
18 |
19 | Typical tricks like a stochastic frame skip were also used; a sketch of such a wrapper is below.
20 |
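For reference, a stochastic frame-skip wrapper can look like the sketch below (my own gym-style wrapper, not the benchmark's exact implementation):

```python
import random
import gym

class StochasticFrameSkip(gym.Wrapper):
    """Repeat each action for a random number of frames to add stochasticity."""
    def __init__(self, env, min_skip=2, max_skip=4):
        super().__init__(env)
        self.min_skip = min_skip
        self.max_skip = max_skip

    def step(self, action):
        total_reward = 0.0
        for _ in range(random.randint(self.min_skip, self.max_skip)):
            obs, reward, done, info = self.env.step(action)
            total_reward += reward
            if done:
                break
        return obs, total_reward, done, info
```
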
21 | ###### Baselines
22 |
23 | Additionally, authors present several baselines:
24 | - human with 2h pretrain
25 | - PPO
26 | - Rainbow (DQN)
27 | - JERK (greedy algorithm)
28 |
29 | 
30 |
31 | ###### Whats the point
32 |
33 | So, why is it so good?
34 |
35 | - Firstly, we have a new RL environment that is quite different from the others. For this one, we need to create RL algorithms with new kinds of memorization techniques and planning.
36 | - Secondly, we have open-sourced baselines. As for me, the code is not very user-friendly; nevertheless, you can read it and get the main ideas for your own trials.
37 | - Lastly, we have a competition (contest) - a new OpenAI challenge for RL researchers to solve this environment.
38 |
39 |
40 | ##### Afterword
41 |
42 | - No time for questions - it's time to solve the contest.
43 |
44 | ##### Interesting links
45 |
46 | Retro contest - https://contest.openai.com.
47 |
48 | 1. [Progressive Neural Networks](https://arxiv.org/abs/1606.04671)
49 | 1. [PathNet: Evolution Channels Gradient Descent in Super Neural Networks](https://arxiv.org/abs/1701.08734)
--------------------------------------------------------------------------------
/papers/1804_gotta_learn_fast/baselines.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Scitator/papers/39f00789d1f762eda7975623d79322a20f323c40/papers/1804_gotta_learn_fast/baselines.png
--------------------------------------------------------------------------------
/papers/1804_gotta_learn_fast/retro.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Scitator/papers/39f00789d1f762eda7975623d79322a20f323c40/papers/1804_gotta_learn_fast/retro.png
--------------------------------------------------------------------------------
/papers/1805_progress_compress.md:
--------------------------------------------------------------------------------
1 | # [Progress & Compress: A scalable framework for continual learning](https://arxiv.org/abs/1805.06370)
2 |
3 | ##### TLDR
4 |
5 | ###### Idea:
6 |
7 | An updated version of progressive networks. Rather than adding a new column every time, let's reformulate the task and keep only one knowledge column plus a tiny task-specific column (progress phase) that is then used to refresh the parameters of the knowledge one (compress phase).
8 |
9 | 
10 |
11 | ###### Compress phase objective:
12 |
13 | 
14 |
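As far as I understand it, the compress phase combines distillation of the active column into the knowledge base with an (online) EWC-style quadratic penalty; a rough numpy sketch under that assumption (`logits_active`, `logits_kb`, `fisher`, `theta_old` are assumed tensors, not the paper's exact notation):

```python
import numpy as np

def softmax(x):
    e = np.exp(x - x.max(axis=-1, keepdims=True))
    return e / e.sum(axis=-1, keepdims=True)

def compress_loss(logits_active, logits_kb, theta_kb, theta_old, fisher, ewc_weight=1.0):
    """Distill the active column into the knowledge base, plus an EWC-style penalty.

    KL(pi_active || pi_kb) pulls the KB towards the newly learned policy;
    the Fisher-weighted quadratic term protects what the KB learned before.
    """
    p = softmax(logits_active)   # teacher: task-specific (active) column
    q = softmax(logits_kb)       # student: knowledge-base column
    kl = np.sum(p * (np.log(p + 1e-8) - np.log(q + 1e-8)), axis=-1).mean()
    ewc = 0.5 * np.sum(fisher * (theta_kb - theta_old) ** 2)
    return kl + ewc_weight * ewc
```
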
15 | ###### Progress phase objective:
16 |
17 | 
18 |
19 | ###### Results:
20 |
21 | 
22 |
23 | 
24 |
25 | 
26 |
27 | ###### Progressive net addition, a bit about adaptors:
28 |
29 | 
30 |
31 | ##### Interesting links
32 |
33 | 1. [Hybrid computing using a neural network with dynamic external memory](https://www.nature.com/articles/nature20101)
34 | 2. [On quadratic penalties in elastic weight consolidation](https://arxiv.org/abs/1712.03847?context=cs)
35 | 3. [Overcoming catastrophic forgetting in neural networks](https://arxiv.org/abs/1612.00796) (EWC)
--------------------------------------------------------------------------------
/papers/1805_progress_compress/adaptors.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Scitator/papers/39f00789d1f762eda7975623d79322a20f323c40/papers/1805_progress_compress/adaptors.png
--------------------------------------------------------------------------------
/papers/1805_progress_compress/f1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Scitator/papers/39f00789d1f762eda7975623d79322a20f323c40/papers/1805_progress_compress/f1.png
--------------------------------------------------------------------------------
/papers/1805_progress_compress/f2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Scitator/papers/39f00789d1f762eda7975623d79322a20f323c40/papers/1805_progress_compress/f2.png
--------------------------------------------------------------------------------
/papers/1805_progress_compress/f3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Scitator/papers/39f00789d1f762eda7975623d79322a20f323c40/papers/1805_progress_compress/f3.png
--------------------------------------------------------------------------------
/papers/1805_progress_compress/objective_kb.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Scitator/papers/39f00789d1f762eda7975623d79322a20f323c40/papers/1805_progress_compress/objective_kb.png
--------------------------------------------------------------------------------
/papers/1805_progress_compress/objective_task.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Scitator/papers/39f00789d1f762eda7975623d79322a20f323c40/papers/1805_progress_compress/objective_task.png
--------------------------------------------------------------------------------
/papers/1805_progress_compress/t1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Scitator/papers/39f00789d1f762eda7975623d79322a20f323c40/papers/1805_progress_compress/t1.png
--------------------------------------------------------------------------------
/papers/1805_youtube.md:
--------------------------------------------------------------------------------
1 | # [Playing hard exploration games by watching YouTube](https://arxiv.org/abs/1805.11592)
2 |
3 | ##### TLDR
4 |
5 | ###### Idea:
6 |
7 | Hard exploration games are... complicated. But if humans can learn how to beat them, agents should be able to as well. The main question is how to prepare the available data for learning.
8 |
9 | Firstly, encode all the available replays by learning to predict the temporal distance between two different frames. (The authors use classification, but could we do it with metric learning?) This way we can obtain good representations of the frames.
10 |
11 | 
12 |
13 | Secondly, you can learn to imitate the replays with a simple reward function like:
14 |
15 | 
16 |
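My reading of that reward: place checkpoint embeddings every few frames along the demonstration and pay a fixed bonus the first time the agent's embedding gets close enough to the next checkpoint. A sketch under that assumption (the spacing, threshold, bonus value and the embedding function `phi` are placeholders, not the paper's exact numbers):

```python
import numpy as np

class ImitationReward:
    """Sequential checkpoint reward along a single demonstration (a sketch)."""
    def __init__(self, demo_frames, phi, spacing=16, threshold=0.5, bonus=0.5):
        # Embed every `spacing`-th demo frame as a checkpoint to reach in order.
        self.checkpoints = [phi(f) for f in demo_frames[::spacing]]
        self.phi = phi
        self.threshold = threshold
        self.bonus = bonus
        self.next_idx = 0

    def __call__(self, obs):
        if self.next_idx >= len(self.checkpoints):
            return 0.0
        e = self.phi(obs)
        target = self.checkpoints[self.next_idx]
        # Cosine similarity between agent and checkpoint embeddings.
        sim = np.dot(e, target) / (np.linalg.norm(e) * np.linalg.norm(target) + 1e-8)
        if sim > self.threshold:
            self.next_idx += 1   # checkpoint consumed, move on to the next one
            return self.bonus
        return 0.0
```
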
17 | So, it looks something like this.
18 |
19 | 
20 |
21 | That's it. (But it is not that simple.)
22 |
23 | ###### Results:
24 |
25 | 
--------------------------------------------------------------------------------
/papers/1805_youtube/f2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Scitator/papers/39f00789d1f762eda7975623d79322a20f323c40/papers/1805_youtube/f2.png
--------------------------------------------------------------------------------
/papers/1805_youtube/f3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Scitator/papers/39f00789d1f762eda7975623d79322a20f323c40/papers/1805_youtube/f3.png
--------------------------------------------------------------------------------
/papers/1805_youtube/f7.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Scitator/papers/39f00789d1f762eda7975623d79322a20f323c40/papers/1805_youtube/f7.png
--------------------------------------------------------------------------------
/papers/1805_youtube/reward.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Scitator/papers/39f00789d1f762eda7975623d79322a20f323c40/papers/1805_youtube/reward.png
--------------------------------------------------------------------------------