├── doc ├── aamas │ ├── TODO │ ├── figures │ │ └── preview │ │ │ └── rooms-learnt-200.eps │ ├── code.tex │ ├── taxi.tex │ ├── Makefile │ ├── abstract.tex │ ├── conclusions.tex │ ├── macros.tex │ ├── rooms.tex │ ├── abstract-submission.txt │ ├── texify │ ├── algo.tex │ ├── paper.tex │ ├── small-world-theory.tex │ ├── background.tex │ ├── experiments.tex │ └── intro.tex ├── ewrl-abstract │ ├── library.bib │ ├── code.tex │ ├── taxi.tex │ ├── abstract.tex │ ├── Makefile │ ├── conclusions.tex │ ├── macros.tex │ ├── experiments.tex │ ├── intro.tex │ ├── report.tex │ ├── theory.tex │ ├── rooms.tex │ ├── texify │ ├── comments.txt │ └── ewrl.bib ├── project-report │ ├── report_tmp.bbl │ ├── code.tex │ ├── report_tmp.brf │ ├── taxi.tex │ ├── abstract.tex │ ├── Makefile │ ├── macros.tex │ ├── conclusions.tex │ ├── report_tmp.blg │ ├── report.tex │ ├── intro.tex │ ├── texify │ ├── experiments.tex │ └── theory.tex ├── iisc-ravindran │ ├── figures │ │ └── us-map.jpg │ ├── learning-in-a-small-world.pdf │ ├── learning-in-a-small-world.ppt │ └── src │ │ ├── graph-proof-1.tex │ │ ├── taxi.tex │ │ ├── graph-proof-2.tex │ │ ├── graph-proof.tex │ │ ├── graph-proof-3.tex │ │ ├── rooms.tex │ │ └── rooms-sw-options.tex ├── rise-22-08-11 │ ├── Makefile │ └── macros.tex ├── rise-30-12-11 │ ├── Makefile │ └── macros.tex ├── proposal │ ├── Makefile │ ├── report.tex │ └── intro.tex └── ewrl-poster │ ├── Makefile │ ├── macros.tex │ ├── tangocolors.sty │ ├── rooms.tex │ ├── texify │ ├── poster.tex │ ├── ewrl.bib │ ├── column1.tex │ ├── column2.tex │ └── beamerthemeI6pd2.sty ├── domains ├── taxi2.txt ├── taxi1.txt ├── rooms1.txt ├── rooms-scale │ ├── tiny1.tsv │ ├── small1.tsv │ └── medium1.tsv └── rooms-complex │ ├── rooms1.txt │ ├── rooms2.txt │ ├── rooms4.txt │ ├── rooms6.txt │ └── rooms8.txt ├── src ├── Agents │ ├── __init__.py │ ├── RandomAgent.py │ ├── SARSA.py │ ├── QLearning.py │ ├── MacroQ.py │ └── IntraOptionQ.py ├── Environments │ ├── __init__.py │ ├── TaxiOptions.py │ ├── RoomsOptions.py │ ├── ArbitraryNavigationOptions.py │ └── ArbitraryNavigation.py ├── measure_.sh ├── measure.sh ├── rooms-options.sh ├── convert.py ├── rooms-complex.sh ├── util.py ├── rooms-options-gen.sh ├── rooms-scale.sh ├── Runner.py ├── make_options.py ├── main.py ├── ProgressBar.py ├── Agent.py └── Environment.py ├── IDEAS ├── .gitignore ├── scripts ├── add_x ├── timeavg ├── plot └── avg └── README /doc/aamas/TODO: -------------------------------------------------------------------------------- 1 | Double check references 2 | -------------------------------------------------------------------------------- /domains/taxi2.txt: -------------------------------------------------------------------------------- 1 | 3 2 | (0,0) (1,0) 3 | (1,0,0,1) 4 | -------------------------------------------------------------------------------- /doc/ewrl-abstract/library.bib: -------------------------------------------------------------------------------- 1 | /home/teju/Documents/library.bib -------------------------------------------------------------------------------- /src/Agents/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Agents 3 | """ 4 | 5 | from Agent import * 6 | 7 | -------------------------------------------------------------------------------- /domains/taxi1.txt: -------------------------------------------------------------------------------- 1 | 5 5 2 | 4 1 0 0 4 3 | 0 1 0 0 0 4 | 0 0 0 0 0 5 | 1 0 1 0 0 6 | 5 0 1 4 0 7 | 
-------------------------------------------------------------------------------- /src/Environments/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Environment Base Class 3 | """ 4 | 5 | from Environment import * 6 | 7 | -------------------------------------------------------------------------------- /doc/project-report/report_tmp.bbl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arunchaganty/Small-World-RL/HEAD/doc/project-report/report_tmp.bbl -------------------------------------------------------------------------------- /doc/iisc-ravindran/figures/us-map.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arunchaganty/Small-World-RL/HEAD/doc/iisc-ravindran/figures/us-map.jpg -------------------------------------------------------------------------------- /doc/aamas/figures/preview/rooms-learnt-200.eps: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arunchaganty/Small-World-RL/HEAD/doc/aamas/figures/preview/rooms-learnt-200.eps -------------------------------------------------------------------------------- /doc/iisc-ravindran/learning-in-a-small-world.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arunchaganty/Small-World-RL/HEAD/doc/iisc-ravindran/learning-in-a-small-world.pdf -------------------------------------------------------------------------------- /doc/iisc-ravindran/learning-in-a-small-world.ppt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arunchaganty/Small-World-RL/HEAD/doc/iisc-ravindran/learning-in-a-small-world.ppt -------------------------------------------------------------------------------- /src/measure_.sh: -------------------------------------------------------------------------------- 1 | iters=10 2 | PYTHONOPTIMIZE=3 python2 ./main.py $iters 10000 "IntraOptionQ" "TaxiOptions:../data/taxi1.txt:random-node:20" > /dev/null 3 | 4 | -------------------------------------------------------------------------------- /IDEAS: -------------------------------------------------------------------------------- 1 | * Run on domains with > 1 reward 2 | * Run on continuous domain; motion planning problem 3 | * Theoretical results / motivation of extension to non-lattice worlds 4 | -------------------------------------------------------------------------------- /src/measure.sh: -------------------------------------------------------------------------------- 1 | commitish=`git log -n1 | head -n1`; 2 | 3 | /usr/bin/time -f "%e %M %I" ./measure_.sh 2> .a 4 | stats=`cat .a`; 5 | rm .a; 6 | 7 | echo $commitish $stats 8 | -------------------------------------------------------------------------------- /doc/aamas/code.tex: -------------------------------------------------------------------------------- 1 | 2 | \section{Code} 3 | \label{sec:code} 4 | % \lstset{language=TCL, basicstyle=\small, showstringspaces=false, numbers=left, numberstyle=\tiny } 5 | % \lstinputlisting{many2one.tcl} 6 | -------------------------------------------------------------------------------- /doc/ewrl-abstract/code.tex: -------------------------------------------------------------------------------- 1 | 2 | \section{Code} 3 | \label{sec:code} 4 | % \lstset{language=TCL, basicstyle=\small, showstringspaces=false, 
numbers=left, numberstyle=\tiny } 5 | % \lstinputlisting{many2one.tcl} 6 | -------------------------------------------------------------------------------- /doc/project-report/code.tex: -------------------------------------------------------------------------------- 1 | 2 | \section{Code} 3 | \label{sec:code} 4 | % \lstset{language=TCL, basicstyle=\small, showstringspaces=false, numbers=left, numberstyle=\tiny } 5 | % \lstinputlisting{many2one.tcl} 6 | -------------------------------------------------------------------------------- /src/Agents/RandomAgent.py: -------------------------------------------------------------------------------- 1 | """ 2 | Implements the random agent. 3 | """ 4 | 5 | from Agent import * 6 | import random 7 | 8 | class RandomAgent(Agent): 9 | def act( self, state, reward, episode_ended ): 10 | action = random.choice( self.Q[ state ] ) 11 | return action 12 | 13 | -------------------------------------------------------------------------------- /domains/rooms1.txt: -------------------------------------------------------------------------------- 1 | 11 11 2 | 0 0 0 0 0 1 0 0 0 0 0 3 | 0 0 0 0 0 1 0 0 0 0 0 4 | 0 0 0 0 0 0 0 0 0 0 0 5 | 0 0 0 0 0 1 0 0 0 0 0 6 | 0 0 0 0 0 1 0 0 0 0 0 7 | 1 0 1 1 1 1 0 0 0 0 0 8 | 0 0 0 0 0 1 1 1 0 1 1 9 | 0 0 0 0 0 1 0 0 0 0 0 10 | 0 0 0 0 0 1 0 0 0 0 0 11 | 0 0 0 0 0 0 0 0 0 0 0 12 | 0 0 0 0 0 1 0 0 0 0 0 13 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Temporary Files 2 | *.swp 3 | *.swo 4 | *.pyc 5 | *.pyo 6 | *.o 7 | cscope.out 8 | tags 9 | 10 | # TeX 11 | *.log 12 | *.aux 13 | *.out 14 | *.nav 15 | *.toc 16 | *.snm 17 | *.pdf 18 | *.dvi 19 | *_tmp.* 20 | 21 | # Data 22 | *.dot 23 | *.png 24 | *.mat 25 | *.dat 26 | output/ 27 | output-*/ 28 | data-*/ 29 | 30 | # Included library 31 | src/networkx 32 | 33 | # Temporary stuff 34 | attic/ 35 | -------------------------------------------------------------------------------- /domains/rooms-scale/tiny1.tsv: -------------------------------------------------------------------------------- 1 | F F F F F F F F F F F 2 | F F F F F F F F F F F 3 | F F F F F F F F F F F F F F F 4 | F F F F F F F F F F F F 5 | F F F F F F F F F F F F 6 | F F F F F F F F F F F F 7 | F F F F F F F F F F F F 8 | F 9 | F F F F F F F F F F F F 10 | F F F F F F F F F F F F 11 | F F F F F F F F F F F F 12 | F F 13 | F F F 14 | -------------------------------------------------------------------------------- /doc/project-report/report_tmp.brf: -------------------------------------------------------------------------------- 1 | \backcite {SuttonPrecupSingh1998}{{1}{1}{section.1.1}} 2 | \backcite {Stolle}{{1}{1}{section.1.1}} 3 | \backcite {Simsek}{{1}{1}{section.1.1}} 4 | \backcite {Simsek2005}{{1}{1}{section.1.1}} 5 | \backcite {Kleinberg}{{1}{1}{section.1.1}} 6 | \backcite {Simsek}{{3}{2}{figure.1.2}} 7 | \backcite {SuttonPrecupSingh1998,BartoMahadevan}{{3}{2}{figure.1.2}} 8 | \backcite {Simsek}{{5}{3}{figure.1.5}} 9 | -------------------------------------------------------------------------------- /doc/aamas/taxi.tex: -------------------------------------------------------------------------------- 1 | \begin{tikzpicture} 2 | % Grid 3 | \draw[step=1,color=gray] (0,0) grid (5,5); 4 | 5 | % Walls 6 | \draw[line width=1.5pt] (2,5) -- (2,3); 7 | \draw[line width=1.5pt] (1,0) -- (1,2); 8 | \draw[line width=1.5pt] (3,0) -- (3,2); 9 | 10 | % Pads 11 | \draw (0.5,4.5) node {R}; 12 | \draw 
(0.5,0.5) node {Y}; 13 | \draw (3.5,0.5) node {B}; 14 | \draw (4.5,4.5) node {G}; 15 | \end{tikzpicture} 16 | -------------------------------------------------------------------------------- /doc/project-report/taxi.tex: -------------------------------------------------------------------------------- 1 | \begin{tikzpicture} 2 | % Grid 3 | \draw[step=1,color=gray] (0,0) grid (5,5); 4 | 5 | % Walls 6 | \draw[line width=1.5pt] (2,5) -- (2,3); 7 | \draw[line width=1.5pt] (1,0) -- (1,2); 8 | \draw[line width=1.5pt] (3,0) -- (3,2); 9 | 10 | % Pads 11 | \draw (0.5,4.5) node {R}; 12 | \draw (0.5,0.5) node {Y}; 13 | \draw (3.5,0.5) node {B}; 14 | \draw (4.5,4.5) node {G}; 15 | \end{tikzpicture} 16 | -------------------------------------------------------------------------------- /doc/rise-22-08-11/Makefile: -------------------------------------------------------------------------------- 1 | # Author: Arun Chaganty 2 | # 3 | 4 | # $Id$ 5 | FILE=presentation 6 | OUTPUT=$(shell basename $(PWD)) 7 | FIGURES= 8 | 9 | $(OUTPUT).pdf: $(FILE).tex ${FIGURES} 10 | pdflatex --file-line-error --interaction=nonstopmode $< 11 | mv $(FILE).pdf $@ 12 | 13 | ${FIGURES}: %.pdf : %.eps 14 | epstopdf --autorotate=All $^ 15 | 16 | .PHONY: clean 17 | 18 | clean: 19 | rm -rf $(FILE).{aux,dvi,out,log,nav,snm,toc} 20 | 21 | -------------------------------------------------------------------------------- /doc/rise-30-12-11/Makefile: -------------------------------------------------------------------------------- 1 | # Author: Arun Chaganty 2 | # 3 | 4 | # $Id$ 5 | FILE=presentation 6 | OUTPUT=$(shell basename $(PWD)) 7 | FIGURES= 8 | 9 | $(OUTPUT).pdf: $(FILE).tex ${FIGURES} 10 | pdflatex --file-line-error --interaction=nonstopmode $< 11 | mv $(FILE).pdf $@ 12 | 13 | ${FIGURES}: %.pdf : %.eps 14 | epstopdf --autorotate=All $^ 15 | 16 | .PHONY: clean 17 | 18 | clean: 19 | rm -rf $(FILE).{aux,dvi,out,log,nav,snm,toc} 20 | 21 | -------------------------------------------------------------------------------- /src/rooms-options.sh: -------------------------------------------------------------------------------- 1 | ITERS=10 2 | ENSEMBLES=10 3 | EPOCHS=40000 4 | 5 | DD="rooms-options" 6 | OD="options-rooms" 7 | tmp_prefix="rc1" 8 | 9 | # Make the directory 10 | if [ ! -e $DD ]; then mkdir $DD; fi; 11 | 12 | n=200 13 | for o in $OD/*.options; do 14 | scheme="load" 15 | echo "Running options from $o..." 
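# Environment spec on the next line (assumed, inferred from the other run scripts): RoomsOptions:<domain-file>:<scheme>:<n-options>:<options-file>;
# scheme "load" replays the pre-generated options in $o instead of generating new ones.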
16 | PYTHONOPTIMIZE=3 python2 ./main.py $ITERS $ENSEMBLES $EPOCHS "MacroQ" "RoomsOptions:../domains/rooms1.txt:$scheme:$n:$o" $tmp_prefix 17 | mv "$tmp_prefix-return.dat" $DD/$(basename $o .options).return 18 | done; 19 | -------------------------------------------------------------------------------- /domains/rooms-complex/rooms1.txt: -------------------------------------------------------------------------------- 1 | 16 16 2 | 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3 | 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 4 | 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 5 | 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 6 | 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 7 | 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 8 | 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 9 | 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 10 | 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 11 | 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 12 | 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 13 | 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 14 | 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 15 | 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 16 | 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 17 | 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 18 | -------------------------------------------------------------------------------- /domains/rooms-complex/rooms2.txt: -------------------------------------------------------------------------------- 1 | 16 16 2 | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 3 | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 4 | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 5 | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 6 | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 7 | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 8 | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 9 | 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 10 | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 11 | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 12 | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 13 | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 14 | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 15 | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 16 | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 17 | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 18 | -------------------------------------------------------------------------------- /domains/rooms-complex/rooms4.txt: -------------------------------------------------------------------------------- 1 | 16 16 2 | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 3 | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 4 | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 5 | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 6 | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 7 | 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 8 | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 9 | 0 0 0 0 0 0 0 1 1 1 1 1 1 1 0 1 10 | 1 1 1 0 1 1 1 1 0 0 0 0 0 0 0 0 11 | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 12 | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 13 | 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 14 | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 15 | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 16 | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 17 | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 18 | -------------------------------------------------------------------------------- /domains/rooms-complex/rooms6.txt: -------------------------------------------------------------------------------- 1 | 16 16 2 | 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 3 | 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 4 | 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 5 | 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 6 | 0 0 0 0 0 1 0 0 0 0 0 1 1 1 0 1 7 | 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 8 | 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 9 | 1 1 1 0 1 1 0 0 0 0 0 0 0 0 0 0 10 | 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 11 | 0 0 0 0 0 1 1 1 0 1 1 1 0 0 0 0 12 | 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 13 | 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 14 | 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 15 | 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 16 | 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 17 | 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 18 | -------------------------------------------------------------------------------- /domains/rooms-complex/rooms8.txt: 
-------------------------------------------------------------------------------- 1 | 16 16 2 | 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 3 | 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 4 | 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 5 | 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 6 | 0 0 0 0 0 1 0 0 0 0 0 1 1 1 0 1 7 | 0 0 0 0 0 1 1 1 1 0 1 1 0 0 0 0 8 | 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 9 | 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 10 | 1 1 1 0 1 1 0 0 0 0 0 1 0 0 0 0 11 | 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 12 | 0 0 0 0 0 1 0 0 0 0 0 1 1 0 1 1 13 | 0 0 0 0 0 1 1 0 1 1 1 1 0 0 0 0 14 | 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 15 | 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 16 | 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 17 | 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 18 | -------------------------------------------------------------------------------- /doc/ewrl-abstract/taxi.tex: -------------------------------------------------------------------------------- 1 | \documentclass{article} 2 | \usepackage{standalone} 3 | \usepackage{tikz} 4 | \usetikzlibrary{external} 5 | \tikzexternalize % activate! 6 | 7 | \begin{document} 8 | \begin{tikzpicture} 9 | % Grid 10 | \draw[step=1,color=gray] (0,0) grid (5,5); 11 | 12 | % Walls 13 | \draw[line width=1.5pt] (2,5) -- (2,3); 14 | \draw[line width=1.5pt] (1,0) -- (1,2); 15 | \draw[line width=1.5pt] (3,0) -- (3,2); 16 | 17 | % Pads 18 | \draw (0.5,4.5) node {R}; 19 | \draw (0.5,0.5) node {Y}; 20 | \draw (3.5,0.5) node {B}; 21 | \draw (4.5,4.5) node {G}; 22 | \end{tikzpicture} 23 | \end{document} 24 | -------------------------------------------------------------------------------- /doc/iisc-ravindran/src/graph-proof-1.tex: -------------------------------------------------------------------------------- 1 | \documentclass{article} 2 | \usepackage{tikz} 3 | \usetikzlibrary{external} 4 | \usetikzlibrary{arrows} 5 | %\tikzexternalize % activate! 6 | 7 | \begin{document} 8 | \begin{tikzpicture}[] 9 | % Grid 10 | \draw[clip] (-1,-1) rectangle (11,11); 11 | \draw[step=1,color=lightgray] (0,0) grid (10,10); 12 | \foreach \xpos in {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10} 13 | { 14 | \foreach \ypos in {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10} 15 | { 16 | \draw [color=lightgray,fill=lightgray,opacity=1.0] (\xpos,\ypos) circle (0.1); 17 | }; 18 | }; 19 | 20 | \end{tikzpicture} 21 | \end{document} 22 | -------------------------------------------------------------------------------- /doc/iisc-ravindran/src/taxi.tex: -------------------------------------------------------------------------------- 1 | \documentclass{article} 2 | \usepackage{tikz} 3 | \usetikzlibrary{external} 4 | \usetikzlibrary{arrows} 5 | %\tikzexternalize % activate! 
6 | 7 | \begin{document} 8 | \begin{tikzpicture} 9 | % Grid 10 | \draw[step=1,color=gray] (0,0) grid (5,5); 11 | 12 | % Walls 13 | \draw[line width=1.5pt] (2,5) -- (2,3); 14 | \draw[line width=1.5pt] (1,0) -- (1,2); 15 | \draw[line width=1.5pt] (3,0) -- (3,2); 16 | 17 | % Pads 18 | \draw (0.5,4.5) node {R}; 19 | \draw (0.5,0.5) node {Y}; 20 | \draw (3.5,0.5) node {B}; 21 | \draw (4.5,4.5) node {G}; 22 | \end{tikzpicture} 23 | \end{document} 24 | -------------------------------------------------------------------------------- /src/Agents/SARSA.py: -------------------------------------------------------------------------------- 1 | """ 2 | RL Framework 3 | Authors: Arun Chaganty 4 | Implements SARSA 5 | """ 6 | 7 | from Agent import * 8 | 9 | class SARSA(ValueAgent): 10 | """ 11 | Implements SARSA 12 | """ 13 | 14 | def update_Q(self, state, action, state_, action_, reward): 15 | if not state: 16 | return 17 | 18 | q = self.get_value( state, action ) 19 | 20 | if not state_: 21 | q += self.alpha * (reward - q) 22 | else: 23 | q_ = self.get_value( state_, action_ ) 24 | q += self.alpha * (reward + self.gamma * q_ - q) 25 | 26 | self.set_value( state, action, q ) 27 | 28 | -------------------------------------------------------------------------------- /src/convert.py: -------------------------------------------------------------------------------- 1 | from Environment import * 2 | import pickle 3 | import collections 4 | 5 | def convert_option( o ): 6 | # Detect need for conversion 7 | o.pi = dict( [ (s,((a,pr),)) for (s,(a,pr)) in o.pi.items() ] ) 8 | return o 9 | 10 | def main(in_fname, out_fname): 11 | O = pickle.load( open( in_fname ) ) 12 | O = map( convert_option, O ) 13 | pickle.dump(O, open( out_fname, "w" )) 14 | 15 | if __name__ == "__main__": 16 | import sys 17 | if len( sys.argv ) <> 3: 18 | print "Usage: %s "%( sys.argv[0] ) 19 | sys.exit( 1 ) 20 | 21 | in_fname = sys.argv[1] 22 | out_fname = sys.argv[2] 23 | 24 | main( in_fname, out_fname ) 25 | 26 | -------------------------------------------------------------------------------- /src/Agents/QLearning.py: -------------------------------------------------------------------------------- 1 | """ 2 | RL Framework 3 | Authors: Arun Chaganty 4 | Implements the Q-Learning algorithm 5 | """ 6 | 7 | from Agent import * 8 | 9 | class QLearning(ValueAgent): 10 | """ 11 | Q-Learning algorithm 12 | """ 13 | 14 | def update_Q(self, state, action, state_, action_, reward): 15 | if not state: 16 | return 17 | 18 | q = self.get_value( state, action ) 19 | 20 | if not state_: 21 | q += self.alpha * (reward - q) 22 | else: 23 | q_ = max( ( pr for (a_,pr) in self.Q[state_] ) ) 24 | q += self.alpha * (reward + self.gamma * q_ - q) 25 | 26 | self.set_value( state, action, q ) 27 | 28 | -------------------------------------------------------------------------------- /src/rooms-complex.sh: -------------------------------------------------------------------------------- 1 | ITERS=2 2 | ENSEMBLES=2 3 | EPOCHS=400 4 | 5 | DD="rooms-complex" 6 | tmp_prefix="rc1" 7 | 8 | # Make the directory 9 | if [ ! -e $DD ]; then mkdir $DD; fi; 10 | 11 | # At 16x16, total number of options is 2^8 = 256; Run at 100, 200 12 | cmplx=1 13 | 14 | for n in 100 200; do 15 | scheme="small-world" 16 | # Run for a bunch of 'r' 17 | for r in 0.75 1.0 1.5 2.0 3.0 4.0; do 18 | echo "Running $scheme(r=$r) with $n options..." 
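# Spec on the next line (assumed, mirroring rooms-options.sh): RoomsOptions:<domain-file>:small-world:<n-options>:<r>,
# where r is the exponent used when sampling option endpoints (probability ~ distance^-r).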
19 | PYTHONOPTIMIZE=3 python2 ./main.py $ITERS $ENSEMBLES $EPOCHS "MacroQ" "RoomsOptions:../domains/rooms-complex/rooms$cmplx.txt:$scheme:$n:$r" $tmp_prefix 20 | mv "$tmp_prefix-return.dat" $DD/$cmplx-$n-$r.return 21 | done; 22 | done; 23 | -------------------------------------------------------------------------------- /doc/ewrl-abstract/abstract.tex: -------------------------------------------------------------------------------- 1 | \begin{abstract} 2 | 3 | Understanding how we are able to perform such a diverse set of complex tasks has 4 | been a central question for the Artificial Intelligence community. We 5 | hypothesise that the key to solving such tasks lies more in finding a set of 6 | sub-tasks that can easily span the set of all possible tasks, rather than 7 | finding the best sub-tasks for the goal. We model this hypothesis using the 8 | framework of reinforcement learning, and define the sub-tasks based on 9 | Kleinberg's small world model. Our preliminary results suggest that this 10 | hypothesis may indeed be valid. Further experimentation needs to be done to say 11 | so more confidently. 12 | 13 | \end{abstract} 14 | -------------------------------------------------------------------------------- /doc/project-report/abstract.tex: -------------------------------------------------------------------------------- 1 | \begin{abstract} 2 | 3 | Understanding how we are able to perform such a diverse set of complex tasks has 4 | been a central question for the Artificial Intelligence community. We 5 | hypothesise that the key to solving such tasks lies more in finding a set of 6 | sub-tasks that can easily span the set of all possible tasks, rather than 7 | finding the best sub-tasks for the goal. We model this hypothesis using the 8 | framework of reinforcement learning, and define the sub-tasks based on 9 | Kleinberg's small world model. Our preliminary results suggest that this 10 | hypothesis may indeed be valid. Further experimentation needs to be done to say 11 | so more confidently. 
12 | 13 | \end{abstract} 14 | -------------------------------------------------------------------------------- /scripts/add_x: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # vim:ft=python 3 | 4 | def main(*args): 5 | if len(args) == 1: 6 | replace = False 7 | filename = args[0] 8 | elif len(args) == 2 and args[0] == "-w": 9 | replace = True 10 | filename = args[1] 11 | else: 12 | print "Usage: %s -w "%(sys.argv[0]) 13 | sys.exit(1) 14 | 15 | i = 0 16 | s = "" 17 | for line in open(filename).readlines(): 18 | if replace: 19 | s += "%d %s"%( i, line ) 20 | else: 21 | print i, line, 22 | i += 1 23 | if replace: 24 | open(filename,'w').write(s) 25 | 26 | if __name__ == "__main__": 27 | import sys 28 | main(*sys.argv[1:]) 29 | -------------------------------------------------------------------------------- /doc/proposal/Makefile: -------------------------------------------------------------------------------- 1 | # Author: Arun Chaganty 2 | # 3 | 4 | # $Id$ 5 | FILE = report 6 | OUTPUT = $(shell basename $(PWD)) 7 | FIGURES = 8 | SECTIONS = intro.tex 9 | 10 | all: $(OUTPUT).pdf 11 | 12 | $(OUTPUT).pdf: $(FILE).tex ${FIGURES} $(SECTIONS) 13 | texify < $< > $(FILE)_tmp.tex 14 | pdflatex --file-line-error --interaction=nonstopmode $(FILE)_tmp.tex 15 | bibtex $(FILE)_tmp 16 | pdflatex --file-line-error --interaction=nonstopmode $(FILE)_tmp.tex 17 | mv $(FILE)_tmp.pdf $@ 18 | 19 | $(FILE).pdf: $(FILE).tex ${FIGURES} 20 | pdflatex --file-line-error --interaction=nonstopmode $(FILE).tex 21 | 22 | ${FIGURES}: %.pdf : %.eps 23 | epstopdf --autorotate=All $^ 24 | 25 | .PHONY: clean 26 | 27 | clean: 28 | rm -rf *.{aux,dvi,out,bbl,blg,brf,log} 29 | rm -rf $(FILE)_tmp.tex 30 | 31 | -------------------------------------------------------------------------------- /doc/project-report/Makefile: -------------------------------------------------------------------------------- 1 | # Author: Arun Chaganty 2 | # 3 | 4 | # $Id$ 5 | FILE = report 6 | OUTPUT = $(shell basename $(PWD)) 7 | FIGURES = 8 | SECTIONS = macros.tex abstract.tex intro.tex theory.tex experiments.tex conclusions.tex\ 9 | taxi.tex 10 | 11 | all: $(OUTPUT).pdf 12 | 13 | $(OUTPUT).pdf: $(FILE).tex ${FIGURES} $(SECTIONS) 14 | ./texify < $< > $(FILE)_tmp.tex 15 | pdflatex --file-line-error --interaction=nonstopmode $(FILE)_tmp.tex 16 | bibtex $(FILE)_tmp 17 | pdflatex --file-line-error --interaction=nonstopmode $(FILE)_tmp.tex 18 | mv $(FILE)_tmp.pdf $@ 19 | 20 | $(FILE).pdf: $(FILE).tex ${FIGURES} 21 | pdflatex --file-line-error --interaction=nonstopmode $(FILE).tex 22 | 23 | ${FIGURES}: %.pdf : %.eps 24 | epstopdf --autorotate=All $^ 25 | 26 | .PHONY: clean 27 | 28 | clean: 29 | rm -f *.{aux,bbl,blg,brf,out,log} 30 | rm -f $(FILE)_tmp.* 31 | 32 | -------------------------------------------------------------------------------- /doc/aamas/Makefile: -------------------------------------------------------------------------------- 1 | # Author: Arun Chaganty 2 | # 3 | 4 | # $Id$ 5 | FILE = paper 6 | OUTPUT = $(shell basename $(PWD)) 7 | FIGURES = 8 | SECTIONS = macros.tex abstract.tex intro.tex background.tex theory.tex algo.tex experiments.tex conclusions.tex\ 9 | taxi.tex small-world-theory.tex 10 | 11 | 12 | all: $(OUTPUT).pdf 13 | 14 | $(OUTPUT).pdf: $(FILE).tex ${FIGURES} $(SECTIONS) 15 | ./texify < $< > $(FILE)_tmp.tex 16 | pdflatex --file-line-error --interaction=nonstopmode $(FILE)_tmp.tex 17 | bibtex $(FILE)_tmp 18 | pdflatex --file-line-error --interaction=nonstopmode 
$(FILE)_tmp.tex 19 | mv $(FILE)_tmp.pdf $@ 20 | 21 | $(FILE).pdf: $(FILE).tex ${FIGURES} 22 | pdflatex --file-line-error --interaction=nonstopmode $(FILE).tex 23 | 24 | ${FIGURES}: %.pdf : %.eps 25 | epstopdf --autorotate=All $^ 26 | 27 | .PHONY: clean 28 | 29 | clean: 30 | rm -f *.{aux,bbl,blg,brf,out,log} 31 | rm -f $(FILE)_tmp.* 32 | 33 | -------------------------------------------------------------------------------- /doc/ewrl-poster/Makefile: -------------------------------------------------------------------------------- 1 | # Author: Arun Chaganty 2 | # 3 | 4 | # $Id$ 5 | FILE = poster 6 | OUTPUT = $(shell basename $(PWD)) 7 | FIGURES = figures/rooms-exp.pdf figures/rooms-algos-200.pdf 8 | FIGURES_CUSTOM = figures/rooms-options.pdf 9 | SECTIONS = macros.tex column1.tex column2.tex 10 | 11 | all: $(OUTPUT).pdf 12 | 13 | $(OUTPUT).pdf: $(FILE).tex ${FIGURES} ${FIGURES_CUSTOM} $(SECTIONS) 14 | ./texify < $< > $(FILE)_tmp.tex 15 | pdflatex --file-line-error --interaction=nonstopmode $(FILE)_tmp.tex 16 | mv $(FILE)_tmp.pdf $@ 17 | 18 | $(FILE).pdf: $(FILE).tex ${FIGURES} 19 | pdflatex --file-line-error --interaction=nonstopmode $(FILE).tex 20 | 21 | ${FIGURES}: %.pdf : %.eps 22 | epstopdf --autorotate=All $^ 23 | 24 | figures/rooms-options.pdf: rooms.tex 25 | pdflatex -shell-escape $^; 26 | mv $(^:.tex=-figure0.pdf) $@; 27 | rm -f $(^:.tex=-)* 28 | rm -f $(^:.tex=.auxlock) 29 | 30 | .PHONY: clean 31 | 32 | clean: 33 | rm -f *.{aux,bbl,blg,brf,out,log,toc,nav,snm} 34 | rm -f $(FILE)_tmp.* 35 | 36 | -------------------------------------------------------------------------------- /domains/rooms-scale/small1.tsv: -------------------------------------------------------------------------------- 1 | F F F F F F F F F 2 | F F F F F F F F F 3 | F F F F F F F F F F F F F F 4 | F F F F F F 5 | F F F F F F F F F F F F F F F F F 6 | F F F F F F F F F F F F F F F F F 7 | F F F F F F F F F F F F F F F F F F F 8 | F F F F F F F F F F F F F F F F F F 9 | F F F F F F F F F F F F F F F F F F F F 10 | F F F F F F F F 11 | F F F F F F F F F F F F F F F F F F F 12 | F F F F F F F F F F F F F F F F F F 13 | F F F F F F F F F F F F F F F F F F 14 | F F F F F F F F F F F F F F 15 | F F F F F F F F F F F F F F F F F F 16 | F F F F F F F F F F F F F F F F F F 17 | F F F F F F F F F F F F F F F F F F 18 | F F F F F F F F F F F F F F F F F F 19 | F F F F F F F F F F F F F F F F F F 20 | F F F F F F F F F F F F F F F F F F 21 | F F F F F F F F F F F F F F F F F F 22 | F F F 23 | F F F F F F F F F F F F F F F F F F 24 | F F F F F F F F F F F F F F F F F 25 | F F F F F F F F F F F F F F F F F F 26 | F F 27 | F F F F F F F F F F F F F F F F F F F F F 28 | -------------------------------------------------------------------------------- /doc/ewrl-abstract/Makefile: -------------------------------------------------------------------------------- 1 | # Author: Arun Chaganty 2 | # 3 | 4 | # $Id$ 5 | FILE = report 6 | OUTPUT = $(shell basename $(PWD)) 7 | FIGURES = 8 | FIGURES_CUSTOM = figures/rooms-options.pdf 9 | SECTIONS = macros.tex abstract.tex intro.tex theory.tex experiments.tex conclusions.tex\ 10 | taxi.tex 11 | 12 | all: $(OUTPUT).pdf 13 | 14 | $(OUTPUT).pdf: $(FILE).tex ${FIGURES} ${FIGURES_CUSTOM} $(SECTIONS) 15 | ./texify < $< > $(FILE)_tmp.tex 16 | pdflatex --file-line-error --interaction=nonstopmode $(FILE)_tmp.tex 17 | bibtex $(FILE)_tmp 18 | pdflatex --file-line-error --interaction=nonstopmode $(FILE)_tmp.tex 19 | mv $(FILE)_tmp.pdf $@ 20 | 21 | $(FILE).pdf: $(FILE).tex ${FIGURES} 22 | pdflatex 
--file-line-error --interaction=nonstopmode $(FILE).tex 23 | 24 | ${FIGURES}: %.pdf : %.eps 25 | epstopdf --autorotate=All $^ 26 | 27 | figures/rooms-options.pdf: rooms.tex 28 | pdflatex -shell-escape $^; 29 | mv $(^:.tex=-figure0.pdf) $@; 30 | rm -f $(^:.tex=-)* 31 | rm -f $(^:.tex=.auxlock) 32 | 33 | .PHONY: clean 34 | 35 | clean: 36 | rm -f *.{aux,bbl,blg,brf,out,log} 37 | rm -f $(FILE)_tmp.* 38 | 39 | -------------------------------------------------------------------------------- /doc/project-report/macros.tex: -------------------------------------------------------------------------------- 1 | % Section References 2 | \newcommand{\secref}[1] {\hyperref[#1]{Section~\ref*{#1}}} 3 | \newcommand{\eqnref}[1] {Equation \eqref{#1}} 4 | \newcommand{\thmref}[1] {Theorem \ref{#1}} 5 | \newcommand{\lmref}[1] {Lemma \ref{#1}} 6 | \newcommand{\algoref}[1] {\hyperref[#1]{Algorithm~\ref*{#1}}} 7 | \renewcommand{\algorithmiccomment}[1]{\textit{// #1}} 8 | %\theoremstyle{plain} \newtheorem{thm}{Theorem} 9 | 10 | %Math Operators 11 | \DeclareMathOperator {\argmax} {argmax} 12 | \DeclareMathOperator {\sgn} {sgn} 13 | \DeclareMathOperator {\trace} {tr} 14 | 15 | \newcommand{\ud}{\, \mathrm{d}} 16 | \newcommand{\diff}[1] {\frac{\partial}{\, \partial #1}} 17 | \newcommand{\diffn}[2] {\frac{\partial^{#2}}{\, \partial {#1}^{#2}}} 18 | \newcommand{\tuple}[1] {\langle #1 \rangle} 19 | 20 | %Short hand 21 | \newcommand{\states} {\mathcal{S}} 22 | \newcommand{\actions} {\mathcal{A}} 23 | \newcommand{\rewards} {\mathcal{R}} 24 | \newcommand{\graph} {\mathcal{G}} 25 | \newcommand{\policy} {\pi} 26 | \newcommand{\initset} {\mathcal{I}} 27 | \newcommand{\stopcond} {\beta} 28 | \newcommand{\option} {\tuple{ \initset,\policy,\stopcond} } 29 | \newcommand{\options} {\mathcal{O}} 30 | -------------------------------------------------------------------------------- /doc/aamas/abstract.tex: -------------------------------------------------------------------------------- 1 | \begin{abstract} 2 | 3 | Understanding how we are able to perform a diverse set of complex tasks 4 | has been a central question for the Artificial Intelligence community. 5 | One popular approach is to use temporal abstraction as a framework to 6 | capture the notion of subtasks. However, this transfers the problem to 7 | finding the right subtasks, which is still an open problem. Existing 8 | approaches for subtask generation require too much knowledge of the 9 | environment, and the abstractions they create can overwhelm the agent. 10 | We propose a simple algorithm inspired by small world networks to learn 11 | subtasks while solving a task that requires virtually no information of 12 | the environment. Additionally, we show that the subtasks we learn can be 13 | easily composed by the agent to solve any other task; more formally, we 14 | prove that any task can be solved using only a logarithmic combination 15 | of these subtasks and primitive actions. Experimental results show that 16 | the subtasks we generate outperform other popular subtask generation 17 | schemes on standard domains. 18 | 19 | %\draft{What is our contribution?} 20 | % Relevance to lifelong learning? 
21 | 22 | \end{abstract} 23 | -------------------------------------------------------------------------------- /doc/ewrl-abstract/conclusions.tex: -------------------------------------------------------------------------------- 1 | \section{Conclusions and Future Work} 2 | \label{sec:conclusions} 3 | 4 | We have emperically shown that agents using options generating to make the 5 | state-space a `small world' converge reliably to near-optimal behaviour for a 6 | single domain. We clearly need to experiment on several other domains, 7 | especially larger ones. 8 | 9 | Our approach easily extends to stochastic domains, where the shortest path would 10 | naturally extend to the optimal policy between the two points. It would also be 11 | easy to find options based on a state space graph constructed from several 12 | trajectories, though such an approach leads to interesting questions about the 13 | dimensionality of the state space, and which value of $r$ that would need to be 14 | used. 15 | 16 | Another interesting direction to take this work forward would be to explore the 17 | dynamics of these options; would it be possible to merge or remove options 18 | dynamically? If so, it would imply this approach would scale very gracefully, 19 | and might make for a good cognitive model. 20 | 21 | % Small World options > random 22 | % Small World options good enough 23 | 24 | % Further experimentation 25 | % Dimensionality 26 | % Defining in stochastic domains 27 | -------------------------------------------------------------------------------- /doc/project-report/conclusions.tex: -------------------------------------------------------------------------------- 1 | \section{Conclusions and Future Work} 2 | \label{sec:conclusions} 3 | 4 | We have emperically shown that agents using options generating to make the 5 | state-space a `small world' converge reliably to near-optimal behaviour for a 6 | single domain. We clearly need to experiment on several other domains, 7 | especially larger ones. 8 | 9 | Our approach easily extends to stochastic domains, where the shortest path would 10 | naturally extend to the optimal policy between the two points. It would also be 11 | easy to find options based on a state space graph constructed from several 12 | trajectories, though such an approach leads to interesting questions about the 13 | dimensionality of the state space, and which value of $r$ that would need to be 14 | used. 15 | 16 | Another interesting direction to take this work forward would be to explore the 17 | dynamics of these options; would it be possible to merge or remove options 18 | dynamically? If so, it would imply this approach would scale very gracefully, 19 | and might make for a good cognitive model. 
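% Note: in Kleinberg's d-dimensional lattice model the critical exponent is r = d, which ties the choice of r above to the dimensionality of the state space.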
20 | 21 | % Small World options > random 22 | % Small World options good enough 23 | 24 | % Further experimentation 25 | % Dimensionality 26 | % Defining in stochastic domains 27 | -------------------------------------------------------------------------------- /src/Agents/MacroQ.py: -------------------------------------------------------------------------------- 1 | """ 2 | Implements the Macro Q-Learning Algorithm 3 | """ 4 | 5 | from Agent import * 6 | from Environment import * 7 | from numpy import random 8 | 9 | class MacroQ(OptionValueAgent): 10 | """ 11 | Implements the Q-Learning 12 | """ 13 | 14 | def update_Q(self, state, action, state_, action_, reward): 15 | """Update the Q function 16 | @state - old state (sequence) 17 | @action - old action 18 | @state_ - current state 19 | @action_ - current action 20 | @reward - reward (sequence) 21 | """ 22 | 23 | if not state: 24 | return 25 | 26 | q = self.get_value( state, action ) 27 | 28 | # If action is an option, 29 | if isinstance( action, Option ): 30 | if state_: 31 | state_ = state_[-1][0] 32 | k = len(reward) 33 | reward = np.sum( np.exp( self.gamma * np.ones( k ), np.arange( k ) ) * np.array( reward ) ) 34 | else: 35 | k = 1 36 | 37 | if not state_: 38 | q += self.alpha * (reward - q) 39 | else: 40 | q_ = max( ( pr for (a_,pr) in self.Q[state_] ) ) 41 | q += self.alpha * (reward + np.power(self.gamma, k) * q_ - q) 42 | 43 | self.set_value( state, action, q ) 44 | 45 | -------------------------------------------------------------------------------- /scripts/timeavg: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # vim:ft=python 3 | 4 | # Average over files specified in argument 5 | # Assumes first 'n' (default=1) columns are keys 6 | 7 | import operator 8 | import sys 9 | 10 | def tuple_add(t1, t2): 11 | return tuple(map(lambda x: reduce(operator.add, x), zip(t1,t2))) 12 | 13 | def read_from_file(filename): 14 | if filename == "-": 15 | f = sys.stdin 16 | else: 17 | f = open( filename ) 18 | 19 | for line in f.readlines(): 20 | values = tuple(map(float, line.split())) 21 | yield values 22 | 23 | f.close() 24 | 25 | def time_average(seq, window): 26 | seq = list(seq) 27 | assert( len(seq) > window ) 28 | for i in xrange(window, len(seq) - window): 29 | #yield reduce( tuple_add, seq[i:i+window] ) 30 | yield map( lambda v: v*1/float(window), reduce( tuple_add, seq[i:i+window] ) ) 31 | 32 | def main(*args): 33 | if len(args) == 2: 34 | filename = str(args[0]) 35 | window = int(args[1]) 36 | else: 37 | print "Usage %s "%(sys.argv[0]) 38 | sys.exit(1) 39 | 40 | values = read_from_file(filename) 41 | values = time_average(values, window) 42 | for v in values: 43 | print ' '.join(map(str,v)) 44 | 45 | if __name__ == "__main__": 46 | import sys 47 | main(*sys.argv[1:]) 48 | -------------------------------------------------------------------------------- /doc/project-report/report_tmp.blg: -------------------------------------------------------------------------------- 1 | This is BibTeX, Version 0.99d (TeX Live 2010/Arch Linux) 2 | Capacity: max_strings=35307, hash_size=35307, hash_prime=30011 3 | The top-level auxiliary file: report_tmp.aux 4 | The style file: alpha.bst 5 | Database file #1: library.bib 6 | Warning--empty journal in BartoMahadevan 7 | Warning--empty year in BartoMahadevan 8 | Warning--empty journal in Kleinberg 9 | Warning--empty year in Kleinberg 10 | Warning--empty year in Simsek 11 | Warning--empty year in Stolle 12 | You've used 6 entries, 13 | 2543 
wiz_defined-function locations, 14 | 593 strings with 5463 characters, 15 | and the built_in function-call counts, 2504 in all, are: 16 | = -- 254 17 | > -- 115 18 | < -- 5 19 | + -- 38 20 | - -- 38 21 | * -- 164 22 | := -- 408 23 | add.period$ -- 18 24 | call.type$ -- 6 25 | change.case$ -- 37 26 | chr.to.int$ -- 6 27 | cite$ -- 12 28 | duplicate$ -- 107 29 | empty$ -- 165 30 | format.name$ -- 45 31 | if$ -- 515 32 | int.to.chr$ -- 1 33 | int.to.str$ -- 0 34 | missing$ -- 6 35 | newline$ -- 33 36 | num.names$ -- 18 37 | pop$ -- 52 38 | preamble$ -- 1 39 | purify$ -- 43 40 | quote$ -- 0 41 | skip$ -- 92 42 | stack$ -- 0 43 | substring$ -- 128 44 | swap$ -- 28 45 | text.length$ -- 5 46 | text.prefix$ -- 1 47 | top$ -- 0 48 | type$ -- 48 49 | warning$ -- 6 50 | while$ -- 29 51 | width$ -- 8 52 | write$ -- 72 53 | (There were 6 warnings) 54 | -------------------------------------------------------------------------------- /doc/ewrl-abstract/macros.tex: -------------------------------------------------------------------------------- 1 | % Section References 2 | \newcommand{\secref}[1] {\hyperref[#1]{Section~\ref*{#1}}} 3 | \newcommand{\eqnref}[1] {Equation \eqref{#1}} 4 | \newcommand{\thmref}[1] {Theorem \ref{#1}} 5 | \newcommand{\lmref}[1] {Lemma \ref{#1}} 6 | \newcommand{\algoref}[1] {\hyperref[#1]{Algorithm~\ref*{#1}}} 7 | \renewcommand{\algorithmiccomment}[1]{\textit{// #1}} 8 | %\theoremstyle{plain} \newtheorem{thm}{Theorem} 9 | 10 | %Math Operators 11 | \DeclareMathOperator {\argmax} {argmax} 12 | %\DeclareMathOperator {\Pr} {Pr} 13 | \DeclareMathOperator {\sgn} {sgn} 14 | \DeclareMathOperator {\trace} {tr} 15 | \DeclareMathOperator {\connected} {connected} 16 | \DeclareMathOperator {\dist} {d_l} 17 | 18 | \newcommand{\ud}{\, \mathrm{d}} 19 | \newcommand{\diff}[1] {\frac{\partial}{\, \partial #1}} 20 | \newcommand{\diffn}[2] {\frac{\partial^{#2}}{\, \partial {#1}^{#2}}} 21 | \newcommand{\tuple}[1] {\langle #1 \rangle} 22 | 23 | %Short hand 24 | \newcommand{\mdp} {\ensuremath{\mathcal{M}}} 25 | \newcommand{\states} {\mathcal{S}} 26 | \newcommand{\actions} {\mathcal{A}} 27 | \newcommand{\transitions} {\mathcal{P}} 28 | \newcommand{\rewards} {\mathcal{R}} 29 | \newcommand{\graph} {\mathcal{G}} 30 | \newcommand{\policy} {\pi} 31 | \newcommand{\initset} {\mathcal{I}} 32 | \newcommand{\stopcond} {\beta} 33 | \newcommand{\option} {\tuple{ \initset,\policy,\stopcond} } 34 | \newcommand{\options} {\mathcal{O}} 35 | -------------------------------------------------------------------------------- /src/util.py: -------------------------------------------------------------------------------- 1 | """ 2 | RL Framework 3 | Authors: Arun Chaganty 4 | General utility functions 5 | """ 6 | 7 | import sys 8 | import numpy as np 9 | from ProgressBar import ProgressBar 10 | 11 | def normalise( dist ): 12 | return dist / np.sum( dist ) 13 | 14 | def choose( dist ): 15 | vs, dist = zip( *dist ) 16 | dist = normalise( dist ) 17 | idx = np.random.multinomial( 1, dist ).argmax() 18 | return vs[ idx ] 19 | 20 | def progressIter( fn, lst ): 21 | 22 | progress = ProgressBar( 0, len(lst), mode='fixed' ) 23 | oldprog = str(progress) 24 | 25 | for n in lst: 26 | v = fn( n ) 27 | 28 | if v: 29 | progress.update_amount(v) 30 | else: 31 | progress.increment_amount() 32 | if oldprog != str(progress): 33 | print progress, "\r", 34 | sys.stdout.flush() 35 | oldprog=str(progress) 36 | print '\n' 37 | 38 | def progressMap( fn, lst ): 39 | 40 | progress = ProgressBar( 0, len(lst), mode='fixed' ) 41 | oldprog = str(progress) 42 | 
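    # like progressIter above, but collect each fn(n) result and return the list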
43 | out = [] 44 | 45 | for n in lst: 46 | v = fn( n ) 47 | out.append( v ) 48 | 49 | progress.increment_amount() 50 | if oldprog != str(progress): 51 | print progress, "\r", 52 | sys.stdout.flush() 53 | oldprog=str(progress) 54 | print '\n' 55 | 56 | return out 57 | 58 | -------------------------------------------------------------------------------- /doc/rise-22-08-11/macros.tex: -------------------------------------------------------------------------------- 1 | % Section References 2 | \newcommand{\secref}[1] {\hyperref[#1]{Section~\ref*{#1}}} 3 | \newcommand{\thmref}[1] {\hyperref[#1]{Theorem~\eqref*{#1}}} 4 | \newcommand{\lmref}[1] {\hyperref[#1]{Lemma~\eqref*{#1}}} 5 | \newcommand{\algoref}[1] {\hyperref[#1]{Algorithm~\ref*{#1}}} 6 | \newcommand{\eqnref}[1] {Equation \eqref{#1}} 7 | \renewcommand{\algorithmiccomment}[1]{\textit{// #1}} 8 | %\theoremstyle{plain} \newtheorem{thm}{Theorem} 9 | 10 | %Math Operators 11 | \DeclareMathOperator {\argmax} {argmax} 12 | %\DeclareMathOperator {\Pr} {Pr} 13 | \DeclareMathOperator {\sgn} {sgn} 14 | \DeclareMathOperator {\trace} {tr} 15 | \DeclareMathOperator {\connected} {connected} 16 | \DeclareMathOperator {\dist} {d_l} 17 | 18 | \newcommand{\ud}{\, \mathrm{d}} 19 | \newcommand{\diff}[1] {\frac{\partial}{\, \partial #1}} 20 | \newcommand{\diffn}[2] {\frac{\partial^{#2}}{\, \partial {#1}^{#2}}} 21 | \newcommand{\tuple}[1] {\langle #1 \rangle} 22 | 23 | %Short hand 24 | \newcommand{\mdp} {\ensuremath{\mathcal{M}}} 25 | \newcommand{\states} {\mathcal{S}} 26 | \newcommand{\actions} {\mathcal{A}} 27 | \newcommand{\transitions} {\mathcal{P}} 28 | \newcommand{\rewards} {\mathcal{R}} 29 | \newcommand{\graph} {\mathcal{G}} 30 | \newcommand{\policy} {\pi} 31 | \newcommand{\initset} {\mathcal{I}} 32 | \newcommand{\stopcond} {\beta} 33 | \newcommand{\option} {\tuple{ \initset,\policy,\stopcond} } 34 | \newcommand{\options} {\mathcal{O}} 35 | -------------------------------------------------------------------------------- /doc/rise-30-12-11/macros.tex: -------------------------------------------------------------------------------- 1 | % Section References 2 | \newcommand{\secref}[1] {\hyperref[#1]{Section~\ref*{#1}}} 3 | \newcommand{\thmref}[1] {\hyperref[#1]{Theorem~\eqref*{#1}}} 4 | \newcommand{\lmref}[1] {\hyperref[#1]{Lemma~\eqref*{#1}}} 5 | \newcommand{\algoref}[1] {\hyperref[#1]{Algorithm~\ref*{#1}}} 6 | \newcommand{\eqnref}[1] {Equation \eqref{#1}} 7 | \renewcommand{\algorithmiccomment}[1]{\textit{// #1}} 8 | %\theoremstyle{plain} \newtheorem{thm}{Theorem} 9 | 10 | %Math Operators 11 | \DeclareMathOperator {\argmax} {argmax} 12 | %\DeclareMathOperator {\Pr} {Pr} 13 | \DeclareMathOperator {\sgn} {sgn} 14 | \DeclareMathOperator {\trace} {tr} 15 | \DeclareMathOperator {\connected} {connected} 16 | \DeclareMathOperator {\dist} {d_l} 17 | 18 | \newcommand{\ud}{\, \mathrm{d}} 19 | \newcommand{\diff}[1] {\frac{\partial}{\, \partial #1}} 20 | \newcommand{\diffn}[2] {\frac{\partial^{#2}}{\, \partial {#1}^{#2}}} 21 | \newcommand{\tuple}[1] {\langle #1 \rangle} 22 | 23 | %Short hand 24 | \newcommand{\mdp} {\ensuremath{\mathcal{M}}} 25 | \newcommand{\states} {S} 26 | \newcommand{\actions} {A} 27 | \newcommand{\transitions} {P} 28 | \newcommand{\rewards} {R} 29 | \newcommand{\Rewards} {\mathcal{R}} 30 | \newcommand{\graph} {\mathcal{G}} 31 | \newcommand{\policy} {\pi} 32 | \newcommand{\initset} {\mathcal{I}} 33 | \newcommand{\stopcond} {\beta} 34 | \newcommand{\option} {\tuple{ \initset,\policy,\stopcond} } 35 | \newcommand{\options} {\mathcal{O}} 36 | 
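% Shorthand reminder: \option is the options-framework tuple \tuple{\initset,\policy,\stopcond} -- initiation set, option policy, and termination condition (Sutton, Precup & Singh).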
-------------------------------------------------------------------------------- /src/rooms-options-gen.sh: -------------------------------------------------------------------------------- 1 | PYTHON=python2 2 | ITERS=3 3 | EPOCHS="1e5 1e6 1e7" 4 | r=0.75 5 | 6 | DD="options-rooms" 7 | tmp_prefix="tmp1" 8 | 9 | # Make the directory 10 | if [ ! -e $DD ]; then mkdir $DD; fi; 11 | 12 | for n in 100; do 13 | for scheme in "optimal-betweenness"; do 14 | echo "Building $n $scheme options..." 15 | PYTHONOPTIMIZE=3 $PYTHON ./make_options.py $e $n "$scheme" "MacroQ" "Rooms:../domains/rooms1.txt" $tmp_prefix 16 | mv "$tmp_prefix.options" "$DD/$scheme-$e.options" 17 | done; 18 | for scheme in "optimal-small-world"; do 19 | echo "Building $n $scheme(r=$r) options..." 20 | PYTHONOPTIMIZE=3 $PYTHON ./make_options.py $e $n "$scheme:$r" "MacroQ" "Rooms:../domains/rooms1.txt" $tmp_prefix 21 | mv "$tmp_prefix.options" "$DD/$scheme-$e-$r.options" 22 | done; 23 | 24 | for e in $EPOCHS; do 25 | # Betweenness 26 | for scheme in "betweenness"; do 27 | echo "Building $n $scheme options..." 28 | PYTHONOPTIMIZE=3 $PYTHON ./make_options.py $e $n "$scheme" "MacroQ" "Rooms:../domains/rooms1.txt" $tmp_prefix 29 | mv "$tmp_prefix.options" "$DD/$scheme-$e.options" 30 | done; 31 | for scheme in "small-world"; do 32 | echo "Building $n $scheme(r=$r) options..." 33 | PYTHONOPTIMIZE=3 $PYTHON ./make_options.py $e $n "$scheme:$r" "MacroQ" "Rooms:../domains/rooms1.txt" $tmp_prefix 34 | mv "$tmp_prefix.options" "$DD/$scheme-$e-$r.options" 35 | done; 36 | done; 37 | done; 38 | -------------------------------------------------------------------------------- /doc/aamas/conclusions.tex: -------------------------------------------------------------------------------- 1 | \section{Conclusions and Future Work} 2 | \label{sec:conclusions} 3 | 4 | % Contributions 5 | % - new scheme for generating options 6 | We have devised a new scheme to generate options based on small world 7 | network model. The options generated satisfy an intuitive criteria, that 8 | the subtasks learnt should be easily composed to solve any other task. 9 | The options greatly improve the connectivity properties of the domain, 10 | without leading to a state space blow up. Finally, they are interesting 11 | from a theoretical perspective, as they require only a logarithmic 12 | number of decisions required in a learning task. 13 | 14 | % - absolutely model-free 15 | Experiments run on standard domains show significantly faster learning 16 | rates using small world options. At the same time, we have shown that 17 | learning small world options can be cheaper than learning bottleneck 18 | options, using a natural algorithm that extracts options from a handful 19 | of tasks it has solved. Another advantage of the scheme is that is does 20 | not require a model of the MDP. 21 | 22 | % Further work 23 | % - dynamically add/remove options 24 | % - figuring out r 25 | As future work, we would like to characterise what the exponent $r$ 26 | should be in a general domain. Given the ease with which options can be 27 | discovered, it would be interesting to experiment with a dynamic scheme 28 | that adds options on the fly, while solving tasks. 
29 | 30 | -------------------------------------------------------------------------------- /src/rooms-scale.sh: -------------------------------------------------------------------------------- 1 | ITERS=2 2 | ENSEMBLES=2 3 | EPOCHS=400 4 | 5 | DD="rooms-scale" 6 | scale="huge1" 7 | tmp_prefix="rc-$scale" 8 | 9 | # Make the directory 10 | if [ ! -e $DD ]; then mkdir $DD; fi; 11 | 12 | # Run without options 13 | scheme=none 14 | echo "Running $scheme..." 15 | PYTHONOPTIMIZE=3 python2 ./main.py $ITERS $ENSEMBLES $EPOCHS "MacroQ" "RoomsOptions:../domains/rooms-scale/$scale.tsv:$scheme" $tmp_prefix 16 | mv "$tmp_prefix-return.dat" $DD/$scale-$scheme.return 17 | mv "$tmp_prefix-decisions.dat" $DD/$scale-$scheme.decisions 18 | 19 | N="30% 50% 70 90%" 20 | for n in $N; do 21 | for scheme in "betweenness" "random-path"; do 22 | echo "Running $scheme with $n options..." 23 | PYTHONOPTIMIZE=3 python2 ./main.py $ITERS $ENSEMBLES $EPOCHS "MacroQ" "RoomsOptions:../domains/rooms-scale/$scale.tsv:$scheme:$n" $tmp_prefix 24 | mv "$tmp_prefix-return.dat" $DD/$scale-$scheme-$n.return 25 | mv "$tmp_prefix-decisions.dat" $DD/$scale-$scheme-$n.decisions 26 | done; 27 | 28 | scheme="small-world" 29 | # Run for a bunch of 'r' 30 | for r in 0.75 1.0 2.0; do 31 | echo "Running $scheme(r=$r) with $n options..." 32 | PYTHONOPTIMIZE=3 python2 ./main.py $ITERS $ENSEMBLES $EPOCHS "MacroQ" "RoomsOptions:../domains/rooms-scale/$scale.tsv:$scheme:$n:$r" $tmp_prefix 33 | mv "$tmp_prefix-return.dat" $DD/$scale-$scheme-$n-$r.return 34 | mv "$tmp_prefix-decisions.dat" $DD/$scale-$scheme-$n-$r.decisions 35 | done; 36 | done; 37 | -------------------------------------------------------------------------------- /doc/ewrl-abstract/experiments.tex: -------------------------------------------------------------------------------- 1 | %\section{Empirical Performance} 2 | \label{sec:experiments} 3 | % Experimental results 4 | 5 | % Graphs 6 | \begin{figure}[h] 7 | \centering 8 | \subfigure[$P_2$-Distributed Options]{ 9 | \includegraphics[height=2in]{figures/rooms-options} 10 | \label{fig:rooms-options} 11 | } 12 | \subfigure[Cumulative Return]{ 13 | \includegraphics[height=2.2in]{figures/rooms-algos-200} 14 | \label{fig:rooms-performance} 15 | } 16 | \label{fig:rooms} 17 | \caption{Results on the Rooms Domain} 18 | \end{figure} 19 | 20 | % Brief description of Rooms, Taxi and Arbitrary Navigation domains. 21 | We trained agents using the MacroQ algorithm, and tested it on three 22 | standard domains, Taxi, Rooms and a $20\times20$ arbitrary navigation 23 | task, and compared the performance of agents using options generated 24 | (i) using betweenness centrality, (ii) using $P_0$ (uniformly random 25 | paths), (iii) using paths between nodes selected using $P_{r>0}$, and 26 | (iv) using a combination of (i) and (iii). The performance of 27 | $P_r$-distributed options was worst when $r=0$, and increased to peak 28 | at a particular $r$ before decreasing again; behaviour also observed 29 | in Kleinberg's work. With appropriate $r$, $P_r$-distributed options 30 | accumulate significant return, and outperform bottleneck-based methods 31 | on navigation tasks. 
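% P_r here: an option from u targets v with probability proportional to \|u-v\|^{-r} (Kleinberg's construction), so P_0 corresponds to uniformly random paths.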
32 | 33 | % Observations 34 | % When goal state and start state are further apart, the options that stand out 35 | % are more 36 | 37 | -------------------------------------------------------------------------------- /doc/ewrl-abstract/intro.tex: -------------------------------------------------------------------------------- 1 | %\section{Introduction} 2 | \label{sec:intro} 3 | 4 | % Motivation 5 | Understanding how we are able to perform a diverse set of complex 6 | tasks has been a central question for the Artificial Intelligence 7 | community. We hypothesise that the key to this ability lies in finding 8 | a set of composable subtasks that ``easily'' span the set of all 9 | tasks. Drawing parallels from Kleinberg's work on the small-world 10 | phenomenon in social networks \cite{Kleinberg}, we model our 11 | hypothesis using the options framework from reinforcement learning 12 | \cite{SuttonPrecupSingh1998}, and prove that given well-distributed 13 | subtasks, an agent can perform any task using only a logarithmic 14 | combination of subtasks and primitive actions. We support our 15 | hypothesis with experimental results. 16 | 17 | % General Introduction 18 | The options framework provides extended actions with predefined 19 | policies as an abstraction for subtasks. There has been substantial 20 | work in learning options, mainly focussed around identifying 21 | `bottlenecks', regions that the agent tends to visit frequently 22 | \cite{McGovern2001}, either empirically as in \cite{McGovern2001}, or, 23 | more recently, using graph theoretic methods like betweenness 24 | centrality \cite{Simsek} or graph partitions \cite{Menache}, with the 25 | intuition that they will help the agent move between strongly 26 | connected components, and thus help in effective exploration. This 27 | does not meet our criteria of composability (tasks solved as series of 28 | subtasks) and universality ({\em any} state should be efficiently 29 | reachable). 30 | -------------------------------------------------------------------------------- /doc/iisc-ravindran/src/graph-proof-2.tex: -------------------------------------------------------------------------------- 1 | \documentclass{article} 2 | \usepackage{tikz} 3 | \usetikzlibrary{external} 4 | \usetikzlibrary{arrows} 5 | %\tikzexternalize % activate! 
6 | 7 | \begin{document} 8 | \begin{tikzpicture}[] 9 | \tikzset{style={font=\Large}} 10 | % Grid 11 | \draw[clip] (-1,-1) rectangle (11,11); 12 | \draw[step=1,color=lightgray] (0,0) grid (10,10); 13 | \foreach \xpos in {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10} 14 | { 15 | \foreach \ypos in {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10} 16 | { 17 | \draw [color=lightgray,fill=lightgray,opacity=0.7] (\xpos,\ypos) circle (0.1); 18 | }; 19 | }; 20 | 21 | % Some long paths 22 | 23 | \node (v1) [circle,draw,color=darkgray,fill=black,opacity=0.7] at (1,3) {}; 24 | \node (v2) [circle,draw,color=darkgray,fill=black,opacity=0.7] at (5,6) {}; 25 | \draw [overlay,-latex,very thick] (v1) to [bend left] node[above left, color=black, opacity=1.0] {$\frac{1}{\Theta(\log n) \|u-v\|^{r} }$} (v2); 26 | 27 | \node (v3) [circle,draw,color=darkgray,fill=black,opacity=0.7] at (4,4) {}; 28 | \node (v4) [circle,draw,color=darkgray,fill=black,opacity=0.7] at (6,2) {}; 29 | \draw [overlay,-latex,very thick] (v3) to [bend left] node[above right, color=black, opacity=1.0] {$\sim \frac{1}{(2\sqrt{2})^{2} \log 10}$} (v4); 30 | 31 | \node (v5) [circle,draw,color=darkgray,fill=darkgray] at (3,2) {}; 32 | \node (v6) [circle,draw,color=darkgray,fill=darkgray] at (4,1) {}; 33 | \draw [overlay,-latex,very thick] (v5) to [bend right] node[below left, color=black, opacity=1.0] {$\sim \frac{1}{(\sqrt{2})^{2} \log 10}$} (v6); 34 | 35 | \end{tikzpicture} 36 | \end{document} 37 | -------------------------------------------------------------------------------- /doc/ewrl-poster/macros.tex: -------------------------------------------------------------------------------- 1 | % Section References 2 | \newcommand{\secref}[1] {\hyperref[#1]{Section~\ref*{#1}}} 3 | \newcommand{\eqnref}[1] {Equation \eqref{#1}} 4 | \newcommand{\thmref}[1] {Theorem \ref{#1}} 5 | \newcommand{\lmref}[1] {Lemma \ref{#1}} 6 | \newcommand{\algoref}[1] {\hyperref[#1]{Algorithm~\ref*{#1}}} 7 | %\renewcommand{\algorithmiccomment}[1]{\textit{// #1}} 8 | %\theoremstyle{plain} \newtheorem{thm}{Theorem} 9 | 10 | %Math Operators 11 | \DeclareMathOperator {\argmax} {argmax} 12 | \DeclareMathOperator {\sgn} {sgn} 13 | \DeclareMathOperator {\trace} {tr} 14 | \DeclareMathOperator{\E} {E} 15 | \DeclareMathOperator{\Var} {Var} 16 | 17 | \renewcommand{\Re} {\mathbb{R}} 18 | 19 | \newcommand{\ud}{\, \mathrm{d}} 20 | \newcommand{\diff}[1] {\frac{\partial}{\, \partial #1}} 21 | \newcommand{\diffn}[2] {\frac{\partial^{#2}}{\, \partial {#1}^{#2}}} 22 | \newcommand{\tuple}[1] {\langle #1 \rangle} 23 | 24 | %Short hand 25 | \newcommand{\mdp} {\ensuremath{\mathcal{M}}} 26 | \newcommand{\states} {\mathcal{S}} 27 | \newcommand{\actions} {\mathcal{A}} 28 | \newcommand{\transitions} {\mathcal{P}} 29 | \newcommand{\rewards} {\mathcal{R}} 30 | \newcommand{\graph} {\mathcal{G}} 31 | \newcommand{\policy} {\pi} 32 | \newcommand{\initset} {\mathcal{I}} 33 | \newcommand{\stopcond} {\beta} 34 | \newcommand{\option} {\tuple{ \initset,\policy,\stopcond} } 35 | \newcommand{\options} {\mathcal{O}} 36 | 37 | %Math Operators 38 | \DeclareMathOperator {\ball} {B} 39 | \DeclareMathOperator {\ballf} {B^{f}} 40 | \DeclareMathOperator {\sball} {b} 41 | \DeclareMathOperator {\sballf} {b^{f}} 42 | 43 | %Short hand 44 | \newcommand{\arbcnst} {\tilde{c}} 45 | \newcommand{\greedyalgo} {\ensuremath{\mathcal{GA}~}} 46 | -------------------------------------------------------------------------------- /doc/iisc-ravindran/src/graph-proof.tex: -------------------------------------------------------------------------------- 1 | 
\documentclass{article} 2 | \usepackage{tikz} 3 | \usetikzlibrary{external} 4 | \usetikzlibrary{arrows} 5 | %\tikzexternalize % activate! 6 | 7 | \begin{document} 8 | \begin{tikzpicture}[] 9 | % Grid 10 | \draw[clip] (-1,-1) rectangle (11,11); 11 | \draw[step=1,color=lightgray] (0,0) grid (10,10); 12 | \foreach \xpos in {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10} 13 | { 14 | \foreach \ypos in {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10} 15 | { 16 | \draw [color=lightgray,fill=lightgray,opacity=0.7] (\xpos,\ypos) circle (0.1); 17 | }; 18 | }; 19 | 20 | % Neighbourhoods 21 | \fill [fill=gray, opacity=0.2] (4,4) circle [radius=1]; 22 | % Size 2 23 | \fill [fill=gray, opacity=0.2] (4,4) circle [radius=2]; 24 | % Size 4 25 | \fill [fill=gray, opacity=0.2] (4,4) circle [radius=4]; 26 | % Size 8 27 | \fill [fill=gray, opacity=0.2] (4,4) circle [radius=8]; 28 | 29 | % Goal node 30 | \node (goal) [circle,draw,color=black,fill=green] at (4,4) {G}; 31 | %\node (v1) (4,4) 32 | 33 | % Sizes 34 | \node at (4,3.2) {$\Theta((2^1)^2)$ nodes}; 35 | \node at (4,5.5) {$\Theta((2^2)^2)$ nodes}; 36 | \node at (4,7) {$\Theta((2^3)^2)$ nodes}; 37 | \node at (4,9) {$\Theta((2^j)^r)$ nodes}; 38 | 39 | % Hops 40 | \node (v0) at (10,7) {}; 41 | \node (v1) at (7,3) {}; 42 | \node (v2) at (5,4) {}; 43 | \draw [overlay,-latex,very thick,opacity=0.7] (v0) 44 | to [bend left] node[above left, opacity=1.0] 45 | {\Large $Pr = \frac{(2^j)^r \times (2^{j+2})^{-r}}{\Theta(\log n)} $} (v1); 46 | \draw [overlay,-latex,very thick,opacity=0.7] (v1) to [bend right] (v2); 47 | 48 | 49 | 50 | \end{tikzpicture} 51 | \end{document} 52 | -------------------------------------------------------------------------------- /doc/iisc-ravindran/src/graph-proof-3.tex: -------------------------------------------------------------------------------- 1 | \documentclass{article} 2 | \usepackage{tikz} 3 | \usetikzlibrary{external} 4 | \usetikzlibrary{arrows} 5 | %\tikzexternalize % activate! 
6 | 7 | \begin{document} 8 | \begin{tikzpicture}[] 9 | % Grid 10 | \draw[clip] (-1,-1) rectangle (11,11); 11 | \draw[step=1,color=lightgray] (0,0) grid (10,10); 12 | \foreach \xpos in {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10} 13 | { 14 | \foreach \ypos in {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10} 15 | { 16 | \draw [color=lightgray,fill=lightgray,opacity=0.7] (\xpos,\ypos) circle (0.1); 17 | }; 18 | }; 19 | 20 | % Neighbourhoods 21 | \fill [fill=gray, opacity=0.2] (4,4) circle [radius=1]; 22 | % Size 2 23 | \fill [fill=gray, opacity=0.2] (4,4) circle [radius=2]; 24 | % Size 4 25 | \fill [fill=gray, opacity=0.2] (4,4) circle [radius=4]; 26 | % Size 8 27 | \fill [fill=gray, opacity=0.2] (4,4) circle [radius=8]; 28 | 29 | % Goal node 30 | \node (goal) [circle,draw,color=black,fill=green] at (4,4) {G}; 31 | %\node (v1) (4,4) 32 | 33 | % Sizes 34 | \node at (4,3.2) {$\Theta((2^1)^2)$ nodes}; 35 | \node at (4,5.5) {$\Theta((2^2)^2)$ nodes}; 36 | \node at (4,7) {$\Theta((2^3)^2)$ nodes}; 37 | \node at (4,9) {$\Theta((2^j)^r)$ nodes}; 38 | 39 | % Hops 40 | \node (v0) at (10,7) {}; 41 | \node (v1) at (7,3) {}; 42 | \node (v2) at (5,4) {}; 43 | \draw [overlay,-latex,very thick,opacity=0.7] (v0) 44 | to [bend left] node[above left, opacity=1.0] 45 | {\Large $Pr = \frac{(2^j)^r \times (2^{j+2})^{-r}}{\Theta(\log n)} $} (v1); 46 | \draw [overlay,-latex,very thick,opacity=0.7] (v1) to [bend right] (v2); 47 | 48 | 49 | 50 | \end{tikzpicture} 51 | \end{document} 52 | -------------------------------------------------------------------------------- /src/Runner.py: -------------------------------------------------------------------------------- 1 | """ 2 | RL Framework 3 | Authors: Arun Chaganty 4 | Responsible for running agents and interacting with the Environment 5 | """ 6 | 7 | from Agent import * 8 | from Environment import * 9 | import Agents 10 | import Environments 11 | 12 | def load_env( env_type ): 13 | """Try to construct an environment""" 14 | mod = __import__("Environments.%s"%(env_type), fromlist=[Environments]) 15 | assert( hasattr(mod, env_type) ) 16 | envClass = getattr( mod, env_type ) 17 | return envClass 18 | 19 | def load_agent( agent_type ): 20 | """Try to construct an agent""" 21 | 22 | mod = __import__("Agents.%s"%(agent_type), fromlist=[Agents]) 23 | assert( hasattr(mod, agent_type) ) 24 | agentClass = getattr( mod, agent_type ) 25 | return agentClass 26 | 27 | def run(env, agent, episodes): 28 | """ Simulate some episodes of running """ 29 | 30 | state, reward, episode_ended = env.start(), 0, True 31 | 32 | episodic_return, episodic_epochs = [], [] 33 | ret, epochs = 0, 0 34 | 35 | episode = 0 36 | while episode < episodes: 37 | action = agent.act(state, reward, episode_ended) 38 | state, reward, episode_ended = env.react(action) 39 | 40 | # Add rewards to ret 41 | if isinstance( action, Option ): 42 | # If this was an option, then multiple rewards would have been 43 | # returned. 
44 | ret += sum( reward ) 45 | epochs += len( state ) - 1 46 | else: 47 | ret += reward 48 | epochs += 1 49 | 50 | if episode_ended: 51 | episodic_return.append( ret ) 52 | episodic_epochs.append( epochs ) 53 | epochs = 0 54 | episode += 1 55 | 56 | 57 | # Return the per-episode returns and epoch counts 58 | return episodic_return, episodic_epochs 59 | 60 | -------------------------------------------------------------------------------- /scripts/plot: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # vim:ft=python 3 | 4 | # Plots a file given title, axes labels and files 5 | 6 | import matplotlib.pyplot as plt 7 | import numpy as np 8 | 9 | import operator 10 | import sys 11 | import pdb 12 | 13 | 14 | def plotValues(title, xlabel, ylabel, d_titles, valuesSet, xIdx=0): 15 | 16 | plt.title( title ) 17 | plt.xlabel( xlabel ) 18 | plt.ylabel( ylabel ) 19 | 20 | for label, values in zip(d_titles, valuesSet): 21 | if values.shape[0] == 1: 22 | values = np.matrix(zip(np.arange(values.shape[1]), values.A[0])) 23 | 24 | x = values.T[0].T 25 | y = values.T[1:].T 26 | plt.plot(x,y, label=label) 27 | plt.legend() 28 | 29 | def main(*args): 30 | # Get file 31 | try: 32 | title = args[0] 33 | xlabel = args[1] 34 | ylabel = args[2] 35 | d_titles = args[3].split(',') 36 | files = args[4].split(',') 37 | if len(args) > 5: 38 | out = args[5] 39 | else: 40 | out = sys.stdout 41 | except IndexError: 42 | print "Usage: %s <title> <xlabel> <ylabel> <data-title,data-title> <data-file,data-file> [<out-file>]"%(sys.argv[0]) 43 | sys.exit(1) 44 | 45 | data = [] 46 | for f in files: 47 | lines = open(f).readlines() 48 | values = map(lambda line: map(float, line.split()), lines) 49 | data.append(np.matrix(values)) 50 | 51 | plotValues(title, xlabel, ylabel, d_titles, data) 52 | 53 | # Write the plot to the output (stdout by default) 54 | #plt.show() 55 | plt.savefig(out) 56 | 57 | if __name__ == "__main__": 58 | #try: 59 | main(*sys.argv[1:]) 60 | #except StandardError as e: 61 | # print "Usage %s <title> <xlabel> <ylabel> <data-file>"%(sys.argv[0]) 62 | # sys.exit(1) 63 | 64 | -------------------------------------------------------------------------------- /doc/project-report/report.tex: -------------------------------------------------------------------------------- 1 | \documentclass[a4paper]{llncs} 2 | %\usepackage{fullpage} 3 | %\usepackage{palatino} 4 | 5 | \usepackage{makeidx} 6 | \usepackage{amsmath} 7 | \usepackage[retainorgcmds]{IEEEtrantools} 8 | \usepackage{thumbpdf} 9 | \usepackage{multicol} 10 | \usepackage{graphicx} 11 | \usepackage{listings} 12 | \usepackage{algorithm} 13 | \usepackage{algorithmic} 14 | \usepackage{tikz} 15 | \usepackage{subfigure} 16 | 17 | \usepackage[ 18 | pagebackref, 19 | pdfpagelabels, 20 | extension=pdf, 21 | ]{hyperref} 22 | \hypersetup{ 23 | pdftitle = {Is Learning Small World?}, 24 | pdfsubject = {Is Learning Small World?}, 25 | pdfauthor = {Arun Tejasvi Chaganty, Prateek Gaur}, 26 | pdfkeywords = {}, 27 | pdfcreator = {pdflatex}, 28 | pdfproducer = {LaTeX with hyperref and thumbpdf}, 29 | pdfstartpage = {1}, 30 | pdfpagemode = UseThumbs, 31 | colorlinks = true, 32 | linkcolor = red, 33 | anchorcolor = red, 34 | citecolor = blue, 35 | filecolor = red, 36 | urlcolor = red 37 | } 38 | 39 | \input{macros} 40 | 41 | \title{Is Learning Small World?} 42 | \author{ Arun Tejasvi Chaganty \inst{1} \and Prateek Gaur \inst{1} } 43 | \institute{ Department of Computer Science and Engineering, \\ 44 | IIT Madras, Chennai, India - 600036 } 45 | 46 | \pagestyle{headings} % switches on printing of running heads 47 | 48 |
\frontmatter 49 | \mainmatter 50 | 51 | \begin{document} 52 | 53 | \maketitle 54 | %\pagebreak 55 | 56 | % Outline 57 | \input{abstract} 58 | \input{intro} 59 | \input{theory} 60 | \input{experiments} 61 | \input{conclusions} 62 | 63 | \bibliographystyle{alpha} 64 | \bibliography{library}{} 65 | 66 | %\newpage 67 | %\appendix 68 | 69 | %\include{code} 70 | 71 | \end{document} 72 | 73 | -------------------------------------------------------------------------------- /doc/ewrl-abstract/report.tex: -------------------------------------------------------------------------------- 1 | \documentclass[a4paper]{llncs} 2 | \usepackage[margin=1.5cm]{geometry} 3 | %\usepackage{fullpage}[1in] 4 | %\usepackage{palatino} 5 | 6 | \usepackage{makeidx} 7 | \usepackage{amsmath} 8 | \usepackage[retainorgcmds]{IEEEtrantools} 9 | \usepackage{thumbpdf} 10 | \usepackage{multicol} 11 | \usepackage{graphicx} 12 | \usepackage{listings} 13 | \usepackage{algorithm} 14 | \usepackage{algorithmic} 15 | \usepackage{tikz} 16 | \usepackage{subfigure} 17 | 18 | \usepackage[ 19 | pagebackref, 20 | pdfpagelabels, 21 | extension=pdf, 22 | ]{hyperref} 23 | \hypersetup{ 24 | pdftitle = {Learning in a Small World}, 25 | pdfsubject = {Learning in a Small World}, 26 | pdfauthor = {Arun Tejasvi Chaganty, Prateek Gaur, Balaraman Ravindran }, 27 | pdfkeywords = {}, 28 | pdfcreator = {pdflatex}, 29 | pdfproducer = {LaTeX with hyperref and thumbpdf}, 30 | pdfstartpage = {1}, 31 | pdfpagemode = UseThumbs, 32 | colorlinks = true, 33 | linkcolor = red, 34 | anchorcolor = red, 35 | citecolor = blue, 36 | filecolor = red, 37 | urlcolor = red 38 | } 39 | 40 | \input{macros} 41 | 42 | \title{Learning in a Small World} 43 | \author{ Arun Tejasvi Chaganty \and Prateek Gaur \and Balaraman Ravindran \inst{1} } 44 | \institute{ Department of Computer Science and Engineering, \\ 45 | IIT Madras, Chennai, India - 600036 } 46 | 47 | \pagestyle{headings} % switches on printing of running heads 48 | 49 | \frontmatter 50 | \mainmatter 51 | 52 | \begin{document} 53 | 54 | \maketitle 55 | %\pagebreak 56 | 57 | % Outline 58 | \input{intro} 59 | \input{theory} 60 | \input{experiments} 61 | 62 | \bibliographystyle{amsalpha} 63 | \bibliography{ewrl}{} 64 | 65 | %\newpage 66 | %\appendix 67 | 68 | %\include{code} 69 | 70 | \end{document} 71 | 72 | -------------------------------------------------------------------------------- /scripts/avg: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # vim:ft=python 3 | 4 | # Average over files specified in argument 5 | # Assumes first 'n' (default=1) columns are keys 6 | 7 | import operator 8 | import sys 9 | 10 | def tuple_add(t1, t2): 11 | return tuple(map(lambda x: reduce(operator.add, x), zip(t1,t2))) 12 | 13 | def read_from_files(keyCount, files): 14 | # Store for the files 15 | table = {} 16 | 17 | for f in files: 18 | if f == "-": 19 | f = sys.stdin 20 | else: 21 | f = open( f ) 22 | for line in f.readlines(): 23 | values = line.split() 24 | key = tuple(values[:keyCount]) 25 | value = tuple(values[keyCount:]) 26 | # Use last index for count 27 | key = tuple(key) 28 | value = tuple(map(float, value)) + (1,) 29 | if table.has_key(key): 30 | table[key] = tuple_add(table[key], value) 31 | else: 32 | table[key] = value 33 | f.close() 34 | 35 | # Average all fields 36 | for k, v in table.items(): 37 | table[k] = tuple(map(lambda x: x/v[-1], v[:-1])) + (v[-1],) 38 | 39 | return table 40 | 41 | def main(*args): 42 | keyCount = 1 43 | files = [] 44 | if len(args) 
>= 2 and args[0] == "-k": 45 | keyCount = args[1] 46 | files = args[2:] 47 | elif len(args) >= 1 and sys.argv[0] != "-k": 48 | files = args[0:] 49 | else: 50 | print "Usage %s <list-of-files>"%(sys.argv[0]) 51 | print "Usage %s -k <n> <list-of-files>"%(sys.argv[0]) 52 | sys.exit(1) 53 | 54 | values = read_from_files(keyCount, files) 55 | values = [k+v for (k,v) in values.items()] 56 | values.sort(key=lambda t:t[0]) 57 | for v in values: 58 | print ' '.join(map(str,v)) 59 | 60 | if __name__ == "__main__": 61 | import sys 62 | main(*sys.argv[1:]) 63 | -------------------------------------------------------------------------------- /doc/proposal/report.tex: -------------------------------------------------------------------------------- 1 | \documentclass[a4paper,twocolumn]{article} 2 | \usepackage{fullpage} 3 | \usepackage{palatino} 4 | 5 | \usepackage{amsmath} 6 | \usepackage{amsthm} 7 | \usepackage[retainorgcmds]{IEEEtrantools} 8 | \usepackage{thumbpdf} 9 | \usepackage{multicol} 10 | \usepackage{graphicx} 11 | \usepackage{listings} 12 | \usepackage{algorithm} 13 | \usepackage{algorithmic} 14 | 15 | \usepackage[ 16 | pagebackref, 17 | pdfpagelabels, 18 | extension=pdf, 19 | ]{hyperref} 20 | \hypersetup{ 21 | pdftitle = {Exploring the Small-World Effect in Reinforcement Learning}, 22 | pdfsubject = {Exploring the Small-World Effect in Reinforcement Learning}, 23 | pdfauthor = {Arun Tejasvi Chaganty <arunchaganty@gmail.com>, Prateek Gaur <prtkgaur@gmail.com>}, 24 | pdfkeywords = {}, 25 | pdfcreator = {pdflatex}, 26 | pdfproducer = {LaTeX with hyperref and thumbpdf}, 27 | pdfstartpage = {1}, 28 | pdfpagemode = UseThumbs, 29 | colorlinks = true, 30 | linkcolor = red, 31 | anchorcolor = red, 32 | citecolor = blue, 33 | filecolor = red, 34 | urlcolor = red 35 | } 36 | 37 | \newcommand{\ud}{\, \mathrm{d}} 38 | \newcommand{\diff}[1] {\frac{\partial}{\, \partial #1}} 39 | \newcommand{\diffn}[2] {\frac{\partial^{#2}}{\, \partial {#1}^{#2}}} 40 | 41 | \newcommand{\secref}[1] {\hyperref[#1]{Section~\ref*{#1}}} 42 | \newcommand{\algoref}[1] {\hyperref[#1]{Algorithm~\ref*{#1}}} 43 | \renewcommand{\algorithmiccomment}[1]{\textit{// #1}} 44 | 45 | \title{Exploring the Small-World Effect in Reinforcement Learning} 46 | \author{Arun Tejasvi Chaganty (CS07B023) \\ Prateek Gaur (CS07B030)} 47 | \begin{document} 48 | 49 | \maketitle 50 | %\pagebreak 51 | 52 | \input{intro} 53 | 54 | \bibliography{library}{} 55 | \bibliographystyle{alpha} 56 | 57 | %\newpage 58 | %\appendix 59 | 60 | %\include{code} 61 | 62 | \end{document} 63 | 64 | -------------------------------------------------------------------------------- /doc/ewrl-abstract/theory.tex: -------------------------------------------------------------------------------- 1 | %\section{Approach} 2 | \label{sec:approach} 3 | 4 | % Explain small world 5 | As motivation, we look at the Kleinberg's analysis of the ``small 6 | world phenomenon'' in social networks, defined to be exhibited when 7 | individuals operating under a decentralised algorithm can transmit a 8 | message from source to destination using a {\em short} path using only 9 | local information such as the locations of their immediate 10 | acquaintances. 
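The decentralised algorithm Kleinberg analyses in the paragraph above is simply greedy forwarding: each person hands the message to whichever acquaintance (lattice neighbour or long-range contact) is closest to the target in lattice distance. The toy sketch below is purely illustrative and not part of this repository; the lattice, the single long-range contact and the distance function are made up for the example.

def greedy_route(start, target, long_range_contact, dist):
    """Greedy decentralised routing: at every step, forward to the
    neighbour (grid or long-range) that is closest to the target."""
    u, hops = start, 0
    while u != target:
        x, y = u
        neighbours = [(x + 1, y), (x - 1, y), (x, y + 1), (x, y - 1)]
        if u in long_range_contact:
            neighbours.append(long_range_contact[u])
        u = min(neighbours, key=lambda v: dist(v, target))
        hops += 1
    return hops

manhattan = lambda u, v: abs(u[0] - v[0]) + abs(u[1] - v[1])
# One long-range contact, (4,0) -> (7,4), cuts the greedy route in half:
print(greedy_route((0, 0), (7, 5), {(4, 0): (7, 4)}, manhattan))  # 6 hops
print(greedy_route((0, 0), (7, 5), {}, manhattan))                # 12 hops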
11 | 12 | % Statement of Kleinberg's results 13 | Consider a $k$-dimensional lattice of $n$ people \footnote{ 14 | Kleinberg's proofs were limited to the $2$-dimensional case, but were 15 | extended to the $k$-dimensional case by Martel and Nguyen 16 | \cite{Martel2004} }, wherein each person is connected to one 17 | non-neighbour according to the distribution $P_{r}( u, v ) 18 | \propto \| u-v \|^{-r}$, where $\|u-v\|$ is the graph distance between 19 | nodes $u$ and $v$, and $r$ is a parameter. Kleinberg proves (a) that 20 | when $r=0$, i.e. extra connections are uniformly distributed, any 21 | decentralized algorithm will have an expected delivery time 22 | exponential in $\tilde{d}$ (the shortest path length between $u$ and 23 | $v$), and (b) when $r=k$, an algorithm can be constructed whose 24 | expected delivery time is only polynomial of small degree in 25 | $\tilde{d}$. 26 | 27 | Similarly, we define an MDP with options to exhibit the small world 28 | property when an agent can efficiently reach a state of {\em maximal 29 | value} using only its local information. We construct a set of 30 | `small-world options' which connect states in the state-interaction 31 | graph according to $P_r$. By relating the distance between two 32 | states in the state space to the difference in value of the two 33 | states, we are able to prove that for a particular exponent ($r$), the 34 | expected number of {\em decisions} an agent will have to make to reach 35 | a globally maximal value state will be poly-logarithmic in 36 | $|\states|$. 37 | -------------------------------------------------------------------------------- /README: -------------------------------------------------------------------------------- 1 | Exploring the Small World Effect in Reinforcement Learning 2 | ---------------------------------------------------------- 3 | 4 | In large domains, RL agents generally require a large number of samples to 5 | learn a good policy. The options framework proposed by Sutton, Precup and Singh 6 | provides extended actions for which a policy is already learnt, reducing the 7 | complexity of the learning task, and generally making the learning task faster. 8 | An open question in the options framework is discovering the options 9 | themselves. There has been substantial work to learn options, mainly focussed 10 | around identifying ``bottleneck'' states, either empirically, or 11 | more recently, using graph theoretic methods like betweenness or 12 | graph partitions. 13 | 14 | We would like to test an alternative hypothesis: we memorise many actions, 15 | not necessarily bottleneck ones, and put them together; based on their 16 | necessity in solving problems, these actions are either reinforced, or gradually 17 | forgotten. The actions could be of varying complexity, and it is intuitive to 18 | expect that we probably learn a great deal more _simple_ actions than 19 | complex ones. In the context of the options framework, the ``complex actions'' 20 | correspond to options. 21 | 22 | Our proposed approach is to use randomly constructed options that create a 23 | 'short-cut' between states, forming a sort of 'small-world' in the domain. This 24 | approach can be viewed as an extension of Kleinberg's popular model in the 25 | Social Network Analysis field, and we would like to note that RL domains are 26 | very grid-like as well.
The analogy is further motivated by observing that the 27 | policy followed by an agent in the MDP framework is like distributed search; we 28 | are interested in moving from our source state to the destination (goal) state 29 | using only information available locally, i.e. the value function. We leave 30 | addressing the dynamics of such random options, i.e. when options are added or 31 | removed, as future work. 32 | -------------------------------------------------------------------------------- /doc/ewrl-abstract/rooms.tex: -------------------------------------------------------------------------------- 1 | \documentclass{article} 2 | \usepackage{tikz} 3 | \usetikzlibrary{external} 4 | \usetikzlibrary{arrows} 5 | \tikzexternalize % activate! 6 | 7 | \begin{document} 8 | \begin{tikzpicture} 9 | % Darken walls 10 | % Boundaries 11 | \draw[fill=lightgray] (0,0) rectangle (1,13); 12 | \draw[fill=lightgray] (0,13) rectangle (13,12); 13 | \draw[fill=lightgray] (13,13) rectangle (12,0); 14 | \draw[fill=lightgray] (13,0) rectangle (0,1); 15 | 16 | % Room Borders 17 | \draw[fill=lightgray] (6,12) rectangle (7,10); 18 | \draw[fill=lightgray] (6,9) rectangle (7,3); 19 | \draw[fill=lightgray] (6,2) rectangle (7,1); 20 | 21 | \draw[fill=lightgray] (1,6) rectangle (2,7); 22 | \draw[fill=lightgray] (4,6) rectangle (6,7); 23 | 24 | \draw[fill=lightgray] (7,5) rectangle (9,6); 25 | \draw[fill=lightgray] (10,5) rectangle (12,6); 26 | 27 | % Grid 28 | \draw[step=1,color=gray] (0,0) grid (13,13); 29 | 30 | % Goal 31 | \draw (1.5, 11.5) node {S}; 32 | \draw (7.5, 2.5) node {G}; 33 | 34 | % Option 1 35 | \draw [o-latex] (2.5,2.5) -- (2.5,1.5) -- (1.5,1.5); 36 | 37 | % Option 2 38 | \draw [o-latex] (2.5,3.5) -- (3.5,3.5) -- (3.5,4.5) -- (3.5,5.5); 39 | 40 | % Option 3 41 | \draw [o-latex] (3.5,5.5) -- (4.5,5.5) -- (5.5,5.5); 42 | 43 | % Option 4 44 | \draw [o-latex] (5.5,5.5) -- (5.5,4.5) -- (5.5,3.5) -- (5.5,2.3) -- (6.5,2.3) -- (7.5,2.3) -- (8.5,2.3) -- (9.7,2.3) -- (9.7,3.5) -- (9.7,4.5) -- (9.7,5.5) -- (9.7,6.5) -- (9.7,7.5) -- (9.7,8.5) -- (9.7,9.5) -- (9.5,9.5) -- (9.5,10.5); 45 | 46 | % Option 5 47 | \draw [o-latex] (11.5,11.5) -- (11.5,10.5) -- (11.5,9.5) -- (11.5,8.5); 48 | 49 | % Option 6 50 | \draw [o-latex] (11.5,8.5) -- (11.5,7.5) -- (10.5,7.5) -- (9.3,7.5) -- (9.3,6.5) -- (9.3,5.5) -- (9.3,4.5) -- (8.5,4.5) -- (7.5,4.5) -- (7.5,3.5) -- (7.5,2.8); 51 | 52 | % Option 7 53 | \draw [o-latex] (1.5,7.5) -- (2.5,7.5) -- (2.5,6.5) -- (2.5,5.5) -- (2.5,4.5) -- (2.5,3.5) -- (2.5,2.7) -- (3.5,2.7) -- (4.5,2.7) -- (5.5,2.7) -- (6.5,2.7) -- (7.5,2.7) -- (8.5,2.7) -- (9.5,2.7); 54 | 55 | % Option 8 56 | \draw [o-latex] (9.5,4.5) -- (9.5,3.5) -- (10.5,3.5); 57 | 58 | \end{tikzpicture} 59 | \end{document} 60 | -------------------------------------------------------------------------------- /doc/project-report/intro.tex: -------------------------------------------------------------------------------- 1 | \section{Introduction} 2 | \label{sec:intro} 3 | 4 | % General Introduction 5 | In large domains, RL agents generally require a large number of samples to learn 6 | a good policy. The options framework proposed by Sutton, Precup and Singh 7 | \cite{SuttonPrecupSingh1998} provides extended actions for which a policy is 8 | already learnt, reducing the complexity of the learning task, and generally 9 | making the learning task faster. An open question in the options framework is 10 | discovering the options themselves. 
There has been substantial work to learn 11 | options, mainly focussed around identifying ``bottleneck'' states, either 12 | empirically as in the work by Stolle \cite{Stolle}, or more recently, using 13 | graph theoretic methods like betweenness \cite{Simsek} or graph partitions 14 | \cite{Simsek2005} explored by Simsek and Barto. 15 | 16 | % Motivation 17 | In this work, we propose a method for creating options motivated from a 18 | cognitive perspective, based on the following hypothesis: we memorise many 19 | actions, not necessarily bottleneck ones, and evolve them. Based on their 20 | necessity in solving problems, these actions are either reinforced, or gradually 21 | forgotten. The actions could be of varying complexity, and it is intuitive to 22 | expect that we probably learn a great deal more {\em simple} actions than 23 | complex ones. In the context of the options framework, these actions correspond to 24 | options, and ``complex actions'' correspond to longer options. 25 | 26 | % Our options 27 | A desirable set of options gives the agent a set of skills which can be put 28 | together to efficiently accomplish almost any task. From the perspective of the 29 | state-space interaction graph, this is similar to the problem of distributed 30 | search studied by Kleinberg \cite{Kleinberg}: adding edges to a graph such that 31 | any node can be efficiently reached. Guided by this intuition, the method we 32 | propose generates options using a generalisation of the inverse-square law, 33 | along the lines of the small-world graph generation model proposed by Kleinberg. 34 | 35 | % Summary of the results 36 | Our results show that agents trained using our `small-world' options indeed 37 | perform well, and converge to optimal performance quickly and with little 38 | variance.
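Concretely, each `small-world' shortcut discussed in this introduction is just an option <I, pi, beta> whose policy walks from a source state towards a chosen target and then terminates. The actual Option class is defined in src/Environment.py, which is not included in this listing, so the sketch below only assumes the interface that IntraOptionQ.py (further down) relies on, namely o.pi[state] being a list of (action, probability) pairs and o.B(state) being a termination probability; everything else is an illustrative assumption.

class ShortcutOption(object):
    """Sketch of a point-to-point shortcut option <I, pi, beta>.
    Mirrors only the interface used by IntraOptionQ.py; the real
    Option class in src/Environment.py may differ."""

    def __init__(self, path_actions, target):
        # @path_actions - dict: state -> action along a path to `target`
        # @target       - state at which the option terminates
        self.I = set(path_actions)  # initiation set: states on the path
        self.pi = dict((s, [(a, 1.0)]) for s, a in path_actions.items())
        self.target = target

    def B(self, state):
        """Terminate with certainty at the target, or if the agent has
        left the memorised path; continue with probability 1 otherwise."""
        return 1.0 if (state == self.target or state not in self.pi) else 0.0

Drawing the (source, target) pairs for such options from $P_r$, rather than from bottleneck statistics, is the `small-world' construction evaluated in the experiments.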
39 | 40 | -------------------------------------------------------------------------------- /src/Environments/TaxiOptions.py: -------------------------------------------------------------------------------- 1 | """ 2 | TaxiOptions Environment 3 | """ 4 | 5 | import numpy as np 6 | import networkx as nx 7 | import pdb 8 | 9 | from Environment import * 10 | import OptionGenerator 11 | from Taxi import Taxi 12 | 13 | class TaxiOptions( OptionEnvironment ): 14 | 15 | @staticmethod 16 | def create( spec, scheme = 'none', count = 20, *args ): 17 | """ 18 | @spec - Specification (size, endpoints, barriers); either exactly 19 | specified in a file, or with numeric values in a list 20 | @option_scheme - none|manual|optimal|small-world|random|ozgur's betweenness|ozgur's randomness|end 21 | @n_actions - Number of steps that need to taken 22 | comment : optimal(shortest path to destination)??|random|ozgur's betweenness|ozgur's randomness 23 | """ 24 | 25 | env = Taxi.create( spec ) 26 | 27 | # Percentage 28 | if isinstance(count,str): 29 | count = int(count[:-1]) 30 | count = count*env.S/100 31 | 32 | # Add options for all the optimal states 33 | O = [] 34 | if scheme == "none": 35 | pass 36 | elif scheme == "random-node": 37 | O = OptionGenerator.optimal_options_from_random_nodes( env, count, *args ) 38 | elif scheme == "random-path": 39 | O = OptionGenerator.optimal_options_from_random_paths( env, count, *args ) 40 | elif scheme == "betweenness": 41 | O = OptionGenerator.optimal_options_from_betweenness( env, count, *args ) 42 | elif scheme == "small-world": 43 | O = OptionGenerator.optimal_options_from_small_world( env, count, *args ) 44 | elif scheme == "betweenness+small-world": 45 | O = OptionEnvironment.optimal_options_from_betweenness( env, count ) 46 | count_ = count - len( O ) 47 | O += OptionEnvironment.optimal_options_from_small_world( env, count_, *args ) 48 | elif scheme == "load": 49 | O = OptionGenerator.options_from_file( *args )[:count] 50 | else: 51 | raise NotImplemented() 52 | 53 | return OptionEnvironment( TaxiOptions, env.S, env.A, env.P, env.R, env.R_bias, env.start_set, env.end_set, O ) 54 | @staticmethod 55 | def reset_rewards( env, *args ): 56 | return env 57 | 58 | -------------------------------------------------------------------------------- /doc/aamas/macros.tex: -------------------------------------------------------------------------------- 1 | % Section References 2 | \newcommand{\secref}[1] {\hyperref[#1]{Section~\ref*{#1}}} 3 | \newcommand{\appendixref}[1] {\hyperref[#1]{Appendix~\ref*{#1}}} 4 | \newcommand{\exref}[1] {\hyperref[#1]{Example~\ref*{#1}}} 5 | \newcommand{\eqnref}[1] {Equation \eqref{#1}} 6 | \newcommand{\figref}[1] {\hyperref[#1]{Figure~\ref*{#1}}} 7 | \newcommand{\thmref}[1] {\hyperref[#1]{Theorem~\ref*{#1}}} 8 | \newcommand{\lmref}[1] {\hyperref[#1]{Lemma~\ref*{#1}}} 9 | \newcommand{\algoref}[1] {\hyperref[#1]{Algorithm~\ref*{#1}}} 10 | \renewcommand{\algorithmiccomment}[1]{\textit{// #1}} 11 | %\theoremstyle{plain} \newtheorem{thm}{Theorem} 12 | 13 | %Math Operators 14 | \DeclareMathOperator {\argmax} {argmax} 15 | \DeclareMathOperator {\argmin} {argmin} 16 | \DeclareMathOperator {\sgn} {sgn} 17 | \DeclareMathOperator {\trace} {tr} 18 | \DeclareMathOperator{\E} {E} 19 | \DeclareMathOperator{\Var} {Var} 20 | 21 | \renewcommand{\Re} {\mathbb{R}} 22 | 23 | \newcommand{\ud}{\, \mathrm{d}} 24 | \newcommand{\diff}[1] {\frac{\partial}{\, \partial #1}} 25 | \newcommand{\diffn}[2] {\frac{\partial^{#2}}{\, \partial {#1}^{#2}}} 26 | \newcommand{\tuple}[1] 
{\langle #1 \rangle} 27 | 28 | %Short hand 29 | \newcommand{\states} {S} 30 | \newcommand{\States} {\mathcal{S}} 31 | \newcommand{\transitions} {P} 32 | \newcommand{\Transitions} {\mathcal{P}} 33 | \newcommand{\actions} {A} 34 | \newcommand{\Actions} {\mathcal{A}} 35 | \newcommand{\rewards} {R} 36 | \newcommand{\Rewards} {\mathcal{R}} 37 | \newcommand{\graph} {\mathcal{G}} 38 | \newcommand{\mdp} {M} 39 | \newcommand{\Mdp} {\mathcal{M}} 40 | \newcommand{\policy} {\pi} 41 | \newcommand{\initset} {\mathcal{I}} 42 | \newcommand{\stopcond} {\beta} 43 | \newcommand{\option} {\tuple{ \initset,\policy,\stopcond} } 44 | \newcommand{\options} {\mathcal{O}} 45 | 46 | %Math Operators 47 | \DeclareMathOperator {\Qf} {Q} 48 | \DeclareMathOperator {\Vf} {V} 49 | \newcommand{\epsilonm} {\bar{\epsilon}} 50 | 51 | 52 | %Math Operators 53 | \DeclareMathOperator {\ball} {B} 54 | \DeclareMathOperator {\ballf} {B^{f}} 55 | \DeclareMathOperator {\sball} {b} 56 | \DeclareMathOperator {\sballf} {b^{f}} 57 | 58 | %Short hand 59 | \newcommand{\arbcnst} {\tilde{c}} 60 | \newcommand{\greedyalgo} {\ensuremath{\mathcal{GA}~}} 61 | \newcommand{\egreedyalgo} {\ensuremath{\mathcal{GA}_{\epsilon}~}} 62 | 63 | \newcommand{\klein} {\mathcal{K}} 64 | 65 | -------------------------------------------------------------------------------- /doc/ewrl-poster/tangocolors.sty: -------------------------------------------------------------------------------- 1 | % Defines the tango palette for use with LaTeX. 2 | % 3 | % Copyright 2006 by Patrick Pletscher <pat _at_ pletscher.org> 4 | % 5 | % This program can be redistributed and/or modified under the terms 6 | % of the GNU Public License, version 2. 7 | 8 | % butter (yellowish) 9 | \definecolor{tabutter}{rgb}{0.98824, 0.91373, 0.30980} % #fce94f 10 | \definecolor{ta2butter}{rgb}{0.92941, 0.83137, 0} % #edd400 11 | \definecolor{ta3butter}{rgb}{0.76863, 0.62745, 0} % #c4a000 12 | 13 | % orange 14 | \definecolor{taorange}{rgb}{0.98824, 0.68627, 0.24314} % #fcaf3e 15 | \definecolor{ta2orange}{rgb}{0.96078, 0.47451, 0} % #f57900 16 | \definecolor{ta3orange}{rgb}{0.80784, 0.36078, 0} % #ce5c00 17 | 18 | % chocolate (brownish) 19 | \definecolor{tachocolate}{rgb}{0.91373, 0.72549, 0.43137} % #e9b96e 20 | \definecolor{ta2chocolate}{rgb}{0.75686, 0.49020, 0.066667} % #c17d11 21 | \definecolor{ta3chocolate}{rgb}{0.56078, 0.34902, 0.0078431} % #8f5902 22 | 23 | % chameleon (greenish) 24 | \definecolor{tachameleon}{rgb}{0.54118, 0.88627, 0.20392} % #8ae234 25 | \definecolor{ta2chameleon}{rgb}{0.45098, 0.82353, 0.086275} % #73d216 26 | \definecolor{ta3chameleon}{rgb}{0.30588, 0.60392, 0.023529} % #4e9a06 27 | 28 | % sky blue 29 | \definecolor{taskyblue}{rgb}{0.44706, 0.56078, 0.81176} % #728fcf 30 | \definecolor{ta2skyblue}{rgb}{0.20392, 0.39608, 0.64314} % #3465a4 31 | \definecolor{ta3skyblue}{rgb}{0.12549, 0.29020, 0.52941} % #204a87 32 | 33 | % plum (violettish) 34 | \definecolor{taplum}{rgb}{0.67843, 0.49804, 0.65882} % #ad7fa8 35 | \definecolor{ta2plum}{rgb}{0.45882, 0.31373, 0.48235} % #75507b 36 | \definecolor{ta3plum}{rgb}{0.36078, 0.20784, 0.4} % #5c3566 37 | 38 | % scarlet red 39 | \definecolor{tascarletred}{rgb}{0.93725, 0.16078, 0.16078} % #ef2929 40 | \definecolor{ta2scarletred}{rgb}{0.8, 0, 0} % #cc0000 41 | \definecolor{ta3scarletred}{rgb}{0.64314, 0, 0} % #a40000 42 | 43 | % aluminium 44 | \definecolor{taaluminium}{rgb}{0.93333, 0.93333, 0.92549} % #eeeeec 45 | \definecolor{ta2aluminium}{rgb}{0.82745, 0.84314, 0.81176} % #d3d7cf 46 | \definecolor{ta3aluminium}{rgb}{0.72941, 0.74118, 
0.71373} % #babdb6 47 | 48 | % gray 49 | \definecolor{tagray}{rgb}{0.53333, 0.54118, 0.52157} % #888a85 50 | \definecolor{ta2gray}{rgb}{0.33333, 0.34118, 0.32549} % #555753 51 | \definecolor{ta3gray}{rgb}{0.18039, 0.20392, 0.21176} % #2e3436 52 | -------------------------------------------------------------------------------- /src/Agents/IntraOptionQ.py: -------------------------------------------------------------------------------- 1 | """ 2 | Implements the Intra Option Q-Learning Algorithm 3 | """ 4 | 5 | from Agent import * 6 | from Environment import * 7 | 8 | import pdb 9 | import numpy as np 10 | 11 | class IntraOptionQ(OptionValueAgent): 12 | """ 13 | Implements the Intra Option Q-Learning Algorithm 14 | """ 15 | 16 | def update_Q(self, state, action, state_, action_, reward): 17 | """Update the Q function 18 | @state - old state (sequence) 19 | @action - old action 20 | @state_ - current state 21 | @action_ - current action 22 | @reward - reward (sequence) 23 | """ 24 | 25 | if not state: 26 | return 27 | 28 | def do_update( st, a, st_, a_, r ): 29 | if st_: 30 | # Find the highest value primitive action 31 | q_ = max( ( q_ for (a_, q_) in self.Q[st_] if not isinstance( a_, Option ) ) ) 32 | else: 33 | # Happens only at end of episodes 34 | q_ = 0 35 | 36 | # Q-update of primitive action 37 | q = self.get_value( st, a ) 38 | q += self.alpha * (r + self.gamma * q_ - q) 39 | self.set_value( st, a, q ) 40 | 41 | # Update all options that have this action in their policy as 42 | # well 43 | for (o, q) in self.Q[ st ]: 44 | if isinstance( o, Option ) and any( ( a_ for (a_,pr) in o.pi[ st ] if a_ == a) ): 45 | q = self.get_value( st, o ) 46 | if st_: 47 | q_ = ( 1 - o.B( st ) ) * self.get_value( st_, o ) + o.B( st ) * max( ( q_ for (a_, q_) in self.Q[st_] ) ) 48 | else: 49 | q_ = 0 50 | q += self.alpha * (r + self.gamma * q_ - q) 51 | self.set_value( st, o, q ) 52 | 53 | if isinstance( action, Option ): 54 | # Traverse the state sequence 55 | for i in xrange( len( reward ) ): 56 | # Find all the updatable options and actions 57 | st, a = state_[ i ] 58 | st_, a_ = state_[i+1] 59 | r = reward[ i ] 60 | do_update( st, a, st_, a_, r ) 61 | else: 62 | do_update( state, action, state_, action_, reward ) 63 | 64 | -------------------------------------------------------------------------------- /doc/aamas/rooms.tex: -------------------------------------------------------------------------------- 1 | \documentclass{article} 2 | \usepackage{tikz} 3 | \usetikzlibrary{external} 4 | \usetikzlibrary{arrows} 5 | \tikzexternalize % activate! 
6 | 7 | \begin{document} 8 | \begin{tikzpicture}[] 9 | % Darken walls 10 | % Boundaries 11 | \draw[fill=lightgray] (0,0) rectangle (1,13); 12 | \draw[fill=lightgray] (0,13) rectangle (13,12); 13 | \draw[fill=lightgray] (13,13) rectangle (12,0); 14 | \draw[fill=lightgray] (13,0) rectangle (0,1); 15 | 16 | % Room Borders 17 | \draw[fill=lightgray] (6,12) rectangle (7,10); 18 | \draw[fill=lightgray] (6,9) rectangle (7,3); 19 | \draw[fill=lightgray] (6,2) rectangle (7,1); 20 | 21 | \draw[fill=lightgray] (1,6) rectangle (2,7); 22 | \draw[fill=lightgray] (4,6) rectangle (6,7); 23 | 24 | \draw[fill=lightgray] (7,5) rectangle (9,6); 25 | \draw[fill=lightgray] (10,5) rectangle (12,6); 26 | 27 | % Grid 28 | \draw[step=1,color=gray] (0,0) grid (13,13); 29 | 30 | % Goal 31 | % \draw (1.5, 11.5) node { \Large{\bf S} }; 32 | \draw (7.5, 2.5) node { \Large{\bf G} }; 33 | 34 | % Option 1 35 | \draw [o-latex, line width=2pt] (2.5,2.5) -- (2.5,1.5) -- (1.5,1.5); 36 | 37 | % Option 2 38 | \draw [o-latex, line width=2pt] (2.5,3.5) -- (3.5,3.5) -- (3.5,4.5) -- (3.5,5.5); 39 | 40 | % Option 3 41 | \draw [o-latex, line width=2pt] (3.5,5.5) -- (4.5,5.5) -- (5.5,5.5); 42 | 43 | % Option 4 44 | \draw [o-latex, line width=2pt] (5.5,5.5) -- (5.5,4.5) -- (5.5,3.5) -- (5.5,2.3) -- (6.5,2.3) -- (7.5,2.3) -- (8.5,2.3) -- (9.7,2.3) -- (9.7,3.5) -- (9.7,4.5) -- (9.7,5.5) -- (9.7,6.5) -- (9.7,7.5) -- (9.7,8.5) -- (9.7,9.5) -- (9.5,9.5) -- (9.5,10.5); 45 | 46 | % Option 5 47 | \draw [o-latex, line width=2pt] (11.5,11.5) -- (11.5,10.5) -- (11.5,9.5) -- (11.5,8.5); 48 | 49 | % Option 6 50 | \draw [o-latex, line width=2pt] (11.5,8.5) -- (11.5,7.5) -- (10.5,7.5) -- (9.3,7.5) -- (9.3,6.5) -- (9.3,5.5) -- (9.3,4.5) -- (8.5,4.5) -- (7.5,4.5) -- (7.5,3.5) -- (7.5,2.8); 51 | 52 | % Option 7 53 | \draw [o-latex, line width=2pt] (1.5,7.5) -- (2.5,7.5) -- (2.5,6.5) -- (2.5,5.5) -- (2.5,4.5) -- (2.5,3.5) -- (2.5,2.7) -- (3.5,2.7) -- (4.5,2.7) -- (5.5,2.7) -- (6.5,2.7) -- (7.5,2.7) -- (8.5,2.7) -- (9.5,2.7); 54 | 55 | % Option 8 56 | \draw [o-latex, line width=2pt] (9.5,4.5) -- (9.5,3.5) -- (10.5,3.5); 57 | 58 | \end{tikzpicture} 59 | \end{document} 60 | -------------------------------------------------------------------------------- /doc/ewrl-poster/rooms.tex: -------------------------------------------------------------------------------- 1 | \documentclass{article} 2 | \usepackage{tikz} 3 | \usetikzlibrary{external} 4 | \usetikzlibrary{arrows} 5 | \tikzexternalize % activate! 
6 | 7 | \begin{document} 8 | \begin{tikzpicture}[] 9 | % Darken walls 10 | % Boundaries 11 | \draw[fill=lightgray] (0,0) rectangle (1,13); 12 | \draw[fill=lightgray] (0,13) rectangle (13,12); 13 | \draw[fill=lightgray] (13,13) rectangle (12,0); 14 | \draw[fill=lightgray] (13,0) rectangle (0,1); 15 | 16 | % Room Borders 17 | \draw[fill=lightgray] (6,12) rectangle (7,10); 18 | \draw[fill=lightgray] (6,9) rectangle (7,3); 19 | \draw[fill=lightgray] (6,2) rectangle (7,1); 20 | 21 | \draw[fill=lightgray] (1,6) rectangle (2,7); 22 | \draw[fill=lightgray] (4,6) rectangle (6,7); 23 | 24 | \draw[fill=lightgray] (7,5) rectangle (9,6); 25 | \draw[fill=lightgray] (10,5) rectangle (12,6); 26 | 27 | % Grid 28 | \draw[step=1,color=gray] (0,0) grid (13,13); 29 | 30 | % Goal 31 | \draw (1.5, 11.5) node { \Large{\bf S} }; 32 | \draw (7.5, 2.5) node { \Large{\bf G} }; 33 | 34 | % Option 1 35 | \draw [o-latex, line width=2pt] (2.5,2.5) -- (2.5,1.5) -- (1.5,1.5); 36 | 37 | % Option 2 38 | \draw [o-latex, line width=2pt] (2.5,3.5) -- (3.5,3.5) -- (3.5,4.5) -- (3.5,5.5); 39 | 40 | % Option 3 41 | \draw [o-latex, line width=2pt] (3.5,5.5) -- (4.5,5.5) -- (5.5,5.5); 42 | 43 | % Option 4 44 | \draw [o-latex, line width=2pt] (5.5,5.5) -- (5.5,4.5) -- (5.5,3.5) -- (5.5,2.3) -- (6.5,2.3) -- (7.5,2.3) -- (8.5,2.3) -- (9.7,2.3) -- (9.7,3.5) -- (9.7,4.5) -- (9.7,5.5) -- (9.7,6.5) -- (9.7,7.5) -- (9.7,8.5) -- (9.7,9.5) -- (9.5,9.5) -- (9.5,10.5); 45 | 46 | % Option 5 47 | \draw [o-latex, line width=2pt] (11.5,11.5) -- (11.5,10.5) -- (11.5,9.5) -- (11.5,8.5); 48 | 49 | % Option 6 50 | \draw [o-latex, line width=2pt] (11.5,8.5) -- (11.5,7.5) -- (10.5,7.5) -- (9.3,7.5) -- (9.3,6.5) -- (9.3,5.5) -- (9.3,4.5) -- (8.5,4.5) -- (7.5,4.5) -- (7.5,3.5) -- (7.5,2.8); 51 | 52 | % Option 7 53 | \draw [o-latex, line width=2pt] (1.5,7.5) -- (2.5,7.5) -- (2.5,6.5) -- (2.5,5.5) -- (2.5,4.5) -- (2.5,3.5) -- (2.5,2.7) -- (3.5,2.7) -- (4.5,2.7) -- (5.5,2.7) -- (6.5,2.7) -- (7.5,2.7) -- (8.5,2.7) -- (9.5,2.7); 54 | 55 | % Option 8 56 | \draw [o-latex, line width=2pt] (9.5,4.5) -- (9.5,3.5) -- (10.5,3.5); 57 | 58 | \end{tikzpicture} 59 | \end{document} 60 | -------------------------------------------------------------------------------- /doc/iisc-ravindran/src/rooms.tex: -------------------------------------------------------------------------------- 1 | \documentclass{article} 2 | \usepackage{tikz} 3 | \usetikzlibrary{external} 4 | \usetikzlibrary{arrows} 5 | %\tikzexternalize % activate! 
6 | 7 | \begin{document} 8 | \begin{tikzpicture}[] 9 | % Darken walls 10 | % Boundaries 11 | \draw[fill=lightgray] (0,0) rectangle (1,13); 12 | \draw[fill=lightgray] (0,13) rectangle (13,12); 13 | \draw[fill=lightgray] (13,13) rectangle (12,0); 14 | \draw[fill=lightgray] (13,0) rectangle (0,1); 15 | 16 | % Room Borders 17 | \draw[fill=lightgray] (6,12) rectangle (7,10); 18 | \draw[fill=lightgray] (6,9) rectangle (7,3); 19 | \draw[fill=lightgray] (6,2) rectangle (7,1); 20 | 21 | \draw[fill=lightgray] (1,6) rectangle (2,7); 22 | \draw[fill=lightgray] (4,6) rectangle (6,7); 23 | 24 | \draw[fill=lightgray] (7,5) rectangle (9,6); 25 | \draw[fill=lightgray] (10,5) rectangle (12,6); 26 | 27 | % Grid 28 | \draw[step=1,color=gray] (0,0) grid (13,13); 29 | 30 | % Goal 31 | % \draw (1.5, 11.5) node { \Large{\bf S} }; 32 | \draw (7.5, 2.5) node { \Large{\bf G} }; 33 | 34 | % Option 1 35 | %\draw [o-latex, line width=2pt] (2.5,2.5) -- (2.5,1.5) -- (1.5,1.5); 36 | 37 | % Option 2 38 | %\draw [o-latex, line width=2pt] (2.5,3.5) -- (3.5,3.5) -- (3.5,4.5) -- (3.5,5.5); 39 | 40 | % Option 3 41 | %\draw [o-latex, line width=2pt] (3.5,5.5) -- (4.5,5.5) -- (5.5,5.5); 42 | 43 | % Option 4 44 | %\draw [o-latex, line width=2pt] (5.5,5.5) -- (5.5,4.5) -- (5.5,3.5) -- (5.5,2.3) -- (6.5,2.3) -- (7.5,2.3) -- (8.5,2.3) -- (9.7,2.3) -- (9.7,3.5) -- (9.7,4.5) -- (9.7,5.5) -- (9.7,6.5) -- (9.7,7.5) -- (9.7,8.5) -- (9.7,9.5) -- (9.5,9.5) -- (9.5,10.5); 45 | 46 | % Option 5 47 | %\draw [o-latex, line width=2pt] (11.5,11.5) -- (11.5,10.5) -- (11.5,9.5) -- (11.5,8.5); 48 | 49 | % Option 6 50 | %\draw [o-latex, line width=2pt] (11.5,8.5) -- (11.5,7.5) -- (10.5,7.5) -- (9.3,7.5) -- (9.3,6.5) -- (9.3,5.5) -- (9.3,4.5) -- (8.5,4.5) -- (7.5,4.5) -- (7.5,3.5) -- (7.5,2.8); 51 | 52 | % Option 7 53 | %\draw [o-latex, line width=2pt] (1.5,7.5) -- (2.5,7.5) -- (2.5,6.5) -- (2.5,5.5) -- (2.5,4.5) -- (2.5,3.5) -- (2.5,2.7) -- (3.5,2.7) -- (4.5,2.7) -- (5.5,2.7) -- (6.5,2.7) -- (7.5,2.7) -- (8.5,2.7) -- (9.5,2.7); 54 | 55 | % Option 8 56 | %\draw [o-latex, line width=2pt] (9.5,4.5) -- (9.5,3.5) -- (10.5,3.5); 57 | 58 | \end{tikzpicture} 59 | \end{document} 60 | -------------------------------------------------------------------------------- /doc/iisc-ravindran/src/rooms-sw-options.tex: -------------------------------------------------------------------------------- 1 | \documentclass{article} 2 | \usepackage{tikz} 3 | \usetikzlibrary{external} 4 | \usetikzlibrary{arrows} 5 | %\tikzexternalize % activate! 
6 | 7 | \begin{document} 8 | \begin{tikzpicture}[] 9 | % Darken walls 10 | % Boundaries 11 | \draw[fill=lightgray] (0,0) rectangle (1,13); 12 | \draw[fill=lightgray] (0,13) rectangle (13,12); 13 | \draw[fill=lightgray] (13,13) rectangle (12,0); 14 | \draw[fill=lightgray] (13,0) rectangle (0,1); 15 | 16 | % Room Borders 17 | \draw[fill=lightgray] (6,12) rectangle (7,10); 18 | \draw[fill=lightgray] (6,9) rectangle (7,3); 19 | \draw[fill=lightgray] (6,2) rectangle (7,1); 20 | 21 | \draw[fill=lightgray] (1,6) rectangle (2,7); 22 | \draw[fill=lightgray] (4,6) rectangle (6,7); 23 | 24 | \draw[fill=lightgray] (7,5) rectangle (9,6); 25 | \draw[fill=lightgray] (10,5) rectangle (12,6); 26 | 27 | % Grid 28 | \draw[step=1,color=gray] (0,0) grid (13,13); 29 | 30 | % Goal 31 | % \draw (1.5, 11.5) node { \Large{\bf S} }; 32 | \draw (7.5, 2.5) node { \Large{\bf G} }; 33 | 34 | % Option 1 35 | \draw [o-latex, line width=2pt] (2.5,2.5) -- (2.5,1.5) -- (1.5,1.5); 36 | 37 | % Option 2 38 | \draw [o-latex, line width=2pt] (2.5,3.5) -- (3.5,3.5) -- (3.5,4.5) -- (3.5,5.5); 39 | 40 | % Option 3 41 | \draw [o-latex, line width=2pt] (3.5,5.5) -- (4.5,5.5) -- (5.5,5.5); 42 | 43 | % Option 4 44 | \draw [o-latex, line width=2pt] (5.5,5.5) -- (5.5,4.5) -- (5.5,3.5) -- (5.5,2.3) -- (6.5,2.3) -- (7.5,2.3) -- (8.5,2.3) -- (9.7,2.3) -- (9.7,3.5) -- (9.7,4.5) -- (9.7,5.5) -- (9.7,6.5) -- (9.7,7.5) -- (9.7,8.5) -- (9.7,9.5) -- (9.5,9.5) -- (9.5,10.5); 45 | 46 | % Option 5 47 | \draw [o-latex, line width=2pt] (11.5,11.5) -- (11.5,10.5) -- (11.5,9.5) -- (11.5,8.5); 48 | 49 | % Option 6 50 | \draw [o-latex, line width=2pt] (11.5,8.5) -- (11.5,7.5) -- (10.5,7.5) -- (9.3,7.5) -- (9.3,6.5) -- (9.3,5.5) -- (9.3,4.5) -- (8.5,4.5) -- (7.5,4.5) -- (7.5,3.5) -- (7.5,2.8); 51 | 52 | % Option 7 53 | \draw [o-latex, line width=2pt] (1.5,7.5) -- (2.5,7.5) -- (2.5,6.5) -- (2.5,5.5) -- (2.5,4.5) -- (2.5,3.5) -- (2.5,2.7) -- (3.5,2.7) -- (4.5,2.7) -- (5.5,2.7) -- (6.5,2.7) -- (7.5,2.7) -- (8.5,2.7) -- (9.5,2.7); 54 | 55 | % Option 8 56 | \draw [o-latex, line width=2pt] (9.5,4.5) -- (9.5,3.5) -- (10.5,3.5); 57 | 58 | \end{tikzpicture} 59 | \end{document} 60 | -------------------------------------------------------------------------------- /src/Environments/RoomsOptions.py: -------------------------------------------------------------------------------- 1 | """ 2 | RoomsOptions Environment 3 | """ 4 | 5 | import numpy as np 6 | import networkx as nx 7 | import pdb 8 | 9 | from Environment import * 10 | import OptionGenerator 11 | from Rooms import Rooms 12 | 13 | class RoomsOptions( ): 14 | 15 | @staticmethod 16 | def create( spec, K=1, scheme = 'none', count = 20, *args ): 17 | """ 18 | @spec - Specification (size, endpoints, barriers); either exactly 19 | specified in a file, or with numeric values in a list 20 | @option_scheme - none|manual|optimal|small-world|random|ozgur's betweenness|ozgur's randomness|end 21 | @n_actions - Number of steps that need to be taken 22 | comment : optimal(shortest path to destination)??|random|ozgur's betweenness|ozgur's randomness 23 | """ 24 | 25 | env = Rooms.create( spec, K ) 26 | 27 | # Percentage 28 | if isinstance(count,str): 29 | count = int(count[:-1]) 30 | count = count*env.S/100 31 | 32 | # Add options for all the optimal states 33 | O = [] 34 | if scheme == "none": 35 | pass 36 | elif scheme == "random-node": 37 | O = OptionGenerator.optimal_options_from_random_nodes( env, count, *args ) 38 | elif scheme == "random-path": 39 | O = OptionGenerator.optimal_options_from_random_paths( env, count, *args )
40 | elif scheme == "betweenness": 41 | O = OptionGenerator.optimal_options_from_betweenness( env, count, *args ) 42 | elif scheme == "small-world": 43 | O = OptionGenerator.optimal_options_from_small_world( env, count, *args ) 44 | elif scheme == "betweenness+small-world": 45 | O = OptionEnvironment.optimal_options_from_betweenness( env, count ) 46 | count_ = count - len( O ) 47 | O += OptionEnvironment.optimal_options_from_small_world( env, count_, *args ) 48 | elif scheme == "load": 49 | O = OptionGenerator.options_from_file( count, *args ) 50 | else: 51 | raise NotImplemented() 52 | 53 | return OptionEnvironment( RoomsOptions, env.S, env.A, env.P, env.R, env.R_bias, env.start_set, env.end_set, O ) 54 | 55 | @staticmethod 56 | def reset_rewards( env, spec, K=1, *args ): 57 | O = env.O 58 | env = Rooms.reset_rewards( env, spec, K ) 59 | return OptionEnvironment( RoomsOptions, env.S, env.A, env.P, env.R, env.R_bias, env.start_set, env.end_set, O ) 60 | 61 | -------------------------------------------------------------------------------- /doc/aamas/abstract-submission.txt: -------------------------------------------------------------------------------- 1 | Understanding how we are able to perform a diverse set of complex 2 | tasks has been a central question for the Artificial Intelligence 3 | community. We hypothesise that the key to this ability lies in finding 4 | a set of composable subtasks that "easily" span the set of all tasks. 5 | Drawing parallels from Kleinberg's work on the small-world phenomenon 6 | in social networks, we model our hypothesis using the options 7 | framework from reinforcement learning, and prove that given 8 | well-distributed subtasks, an agent can perform any task using only a 9 | logarithmic combination of subtasks and primitive actions. We support 10 | our hypothesis with experimental results. 11 | 12 | The options framework provides extended actions with predefined 13 | policies as an abstraction for subtasks. There has been substantial 14 | work in learning options, mainly focussed around identifying 15 | 'bottlenecks', regions that the agent tends to visit frequently, 16 | either empirically, or, more recently, using graph theoretic methods 17 | like betweenness centrality or graph partitions, with the intuition 18 | that they will help the agent move between strongly connected 19 | components, and thus help in effective exploration. This does not 20 | meet our criteria of composability (tasks solved as series of 21 | subtasks) and universality (any state should be efficiently 22 | reachable). 23 | 24 | As motivation, we look at the Kleinberg's analysis of the "small world 25 | phenomenon" in social networks, defined to be exhibited when 26 | individuals operating under a decentralised algorithm can transmit a 27 | message from source to destination using a short path using only local 28 | information such as the locations of their immediate acquaintances. 29 | Kleinberg showed that in a lattice with additional edges distributed 30 | according to the inverse power law, an agent could indeed do so in 31 | time logarithmic in the size of the network. 32 | 33 | Similarly, we define an MDP with options to exhibit the small world 34 | property when an agent can efficiently reach a state of maximal value 35 | using only its local information. We construct a set of 'small-world 36 | options' which connect states in the state-interaction graph according 37 | to the inverse square law. 
By relating distance of two states in the 38 | state space, and the difference in value of the two states, we are 39 | able to prove that for a particular exponent(r), the expected number 40 | of decisions an agent will have to make to reach a globally maximal 41 | value state will be logarithmic in |S|. 42 | -------------------------------------------------------------------------------- /domains/rooms-scale/medium1.tsv: -------------------------------------------------------------------------------- 1 | F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F 2 | F F F F F F F F F F F F F F F F F F F F F F F 3 | F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F 4 | F F F F F F F F F F F F F F F F F F F F F F F F 5 | F F F F F F F F F F F F F F F F F F F F F F F F F F 6 | F F F F F F F F F F F F F F F F F F F F F F F F F F 7 | F F F F F F F F F F F F F F F F F F F F F F F F F F F 8 | F F F F F F F F F F F F F F F F F F F 9 | F F F F F F F F F F F F F F F F F F F F F F F F F F F F 10 | F F F F F F F F F F F F F F F F F F 11 | F F F F F F F F F F F F F F F F F F F F F F F F F F F F F 12 | F F F F F F F F F F F F F 13 | F F F F F F F F F F F F F F F F F F F F F F F F 14 | F F F F F F F F F F F F F F F F F F F F F F F F 15 | F F F F F F F F F F F F F F F F F F F F F F F F 16 | F F F F F F F F F F F F F F F 17 | F F F F F F F F F F F F F F F F F F F F F F F F F F F F 18 | F F F F F F F F F F F F F F F F F F F F F 19 | F F F F F F F F F F F F F F F F F F F F F F F F F F F 20 | F F F F F F F F F F F F F F F F F F F 21 | F F F F F F F F F F F F F F F F F F F F F F F F F F F F F 22 | F F F F F F F F F F F F F F F F F F F F F F F F F F F 23 | F F F F F F F F F F F F F F F F F F F F F F F F F F F F 24 | F F F F F F F F F F F F F F F F F F F F F F 25 | F F F F F F F F F F F F F F F F F F F F F F F F F F F F F 26 | F F F F F F F F F F F F F F F F F F F F F F F F F F 27 | F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F 28 | F F F F F F F F F F F 29 | F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F 30 | F F F F F F F F F F F 31 | F F F F F F F F F F F F F F F F F F F F F F F F F F F F 32 | F F F F F F F F F F F F F F F F F F F F F F F F 33 | F F F F F F F F F F F F F F F F F F F F F F F F F F F 34 | F F F F F F F F F F F 35 | F F F F F F F F F F F F F F F F F F F F F F F F F F F F 36 | F F F F F F F F F F F F F F F F F F F 37 | F F F F F F F F F F F F F F F F F F F F F F F F F F F F F 38 | F F F F F F F F F F F F F F F F F F F F F F F 39 | F F F F F F F F F F F F F F F F F F F F F F F F F F F F 40 | F F F F F F F F F F F F F F F F F F F F 41 | F F F F F F F F F F F F F F F F F F F F F F F F F F F F F 42 | F F F F F F F F F F F F 43 | F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F F 44 | -------------------------------------------------------------------------------- /doc/aamas/texify: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # Equation Beautifier 3 | # Arun Chaganty <arunchaganty@gmail.com> 4 | # Replaces [],() with \left( \right), etc. 
in math modes 5 | # 6 | 7 | import re 8 | import sys 9 | 10 | mode = { 11 | #"single" : False, # $ 12 | "double" : False, # $$ 13 | "eqnarray" : False, 14 | "IEEE" : False, 15 | } 16 | 17 | rgx = { 18 | #"single" : re.compile(r"\$[^$]"), 19 | "double" : re.compile(r"\$\$"), 20 | "eqnarray" : re.compile(r"\\(begin|end){eqnarray\*?}"), 21 | "IEEE" : re.compile(r"\\(begin|end){IEEEeqnarray\*?}"), 22 | "input" : re.compile(r"\\input{([^}]*)}") 23 | } 24 | 25 | replacements = { 26 | "(": r"\left(", 27 | "[": r"\left[", 28 | ")": r"\right)", 29 | "]": r"\right]", 30 | } 31 | 32 | def transform( in_str ): 33 | for k,v in replacements.items(): 34 | in_str = in_str.replace(k,v) 35 | return in_str 36 | 37 | def printChunk( in_str, mode, start, end ): 38 | chunk = in_str[start:end] 39 | if mode: 40 | # Apply transformations 41 | chunk = transform( chunk ) 42 | 43 | sys.stdout.write( "%s"%( chunk ) ) 44 | 45 | def parseTeX( in_str ): 46 | """Parse TeX""" 47 | global mode 48 | 49 | start = 0 50 | # Check for math mode markers - else just print the string 51 | # BUG: Only one marker per line 52 | for marker in rgx.keys(): 53 | if rgx[marker].findall( in_str ): 54 | if marker == "input": 55 | fs = rgx[marker].findall( in_str ) 56 | for f in fs: 57 | f = f.strip() + ".tex" 58 | sys.stderr.write( "Parsing dependent file %s\n"%( f ) ) 59 | contents = open( f ).read() 60 | parseTeX( contents ) 61 | break 62 | else: 63 | for m in rgx[marker].finditer( in_str ): 64 | # Keep printing and toggling. 65 | printChunk( in_str, mode[marker], start, m.end()-1 ) 66 | mode[marker] = not mode[marker] 67 | start = m.end()-1 68 | printChunk( in_str, mode[marker], start, len(in_str)) 69 | break 70 | else: 71 | for marker in mode.keys(): 72 | if mode[marker]: 73 | printChunk( in_str, mode[marker], start, len(in_str)) 74 | break 75 | else: 76 | sys.stdout.write( "%s"%(in_str) ) 77 | 78 | if __name__ == "__main__": 79 | 80 | try: 81 | while True: 82 | in_str = raw_input() 83 | parseTeX(in_str) 84 | sys.stdout.write("\n") 85 | except EOFError: 86 | pass 87 | 88 | -------------------------------------------------------------------------------- /doc/ewrl-poster/texify: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # Equation Beautifier 3 | # Arun Chaganty <arunchaganty@gmail.com> 4 | # Replaces [],() with \left( \right), etc. 
in math modes 5 | # 6 | 7 | import re 8 | import sys 9 | 10 | mode = { 11 | #"single" : False, # $ 12 | "double" : False, # $$ 13 | "eqnarray" : False, 14 | "IEEE" : False, 15 | } 16 | 17 | rgx = { 18 | #"single" : re.compile(r"\$[^$]"), 19 | "double" : re.compile(r"\$\$"), 20 | "eqnarray" : re.compile(r"\\(begin|end){eqnarray\*?}"), 21 | "IEEE" : re.compile(r"\\(begin|end){IEEEeqnarray\*?}"), 22 | "input" : re.compile(r"\\input{([^}]*)}") 23 | } 24 | 25 | replacements = { 26 | "(": r"\left(", 27 | "[": r"\left[", 28 | ")": r"\right)", 29 | "]": r"\right]", 30 | } 31 | 32 | def transform( in_str ): 33 | for k,v in replacements.items(): 34 | in_str = in_str.replace(k,v) 35 | return in_str 36 | 37 | def printChunk( in_str, mode, start, end ): 38 | chunk = in_str[start:end] 39 | if mode: 40 | # Apply transformations 41 | chunk = transform( chunk ) 42 | 43 | sys.stdout.write( "%s"%( chunk ) ) 44 | 45 | def parseTeX( in_str ): 46 | """Parse TeX""" 47 | global mode 48 | 49 | start = 0 50 | # Check for math mode markers - else just print the string 51 | # BUG: Only one marker per line 52 | for marker in rgx.keys(): 53 | if rgx[marker].findall( in_str ): 54 | if marker == "input": 55 | fs = rgx[marker].findall( in_str ) 56 | for f in fs: 57 | f = f.strip() + ".tex" 58 | sys.stderr.write( "Parsing dependent file %s\n"%( f ) ) 59 | contents = open( f ).read() 60 | parseTeX( contents ) 61 | break 62 | else: 63 | for m in rgx[marker].finditer( in_str ): 64 | # Keep printing and toggling. 65 | printChunk( in_str, mode[marker], start, m.end()-1 ) 66 | mode[marker] = not mode[marker] 67 | start = m.end()-1 68 | printChunk( in_str, mode[marker], start, len(in_str)) 69 | break 70 | else: 71 | for marker in mode.keys(): 72 | if mode[marker]: 73 | printChunk( in_str, mode[marker], start, len(in_str)) 74 | break 75 | else: 76 | sys.stdout.write( "%s"%(in_str) ) 77 | 78 | if __name__ == "__main__": 79 | 80 | try: 81 | while True: 82 | in_str = raw_input() 83 | parseTeX(in_str) 84 | sys.stdout.write("\n") 85 | except EOFError: 86 | pass 87 | 88 | -------------------------------------------------------------------------------- /doc/ewrl-abstract/texify: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # Equation Beautifier 3 | # Arun Chaganty <arunchaganty@gmail.com> 4 | # Replaces [],() with \left( \right), etc. 
in math modes 5 | # 6 | 7 | import re 8 | import sys 9 | 10 | mode = { 11 | #"single" : False, # $ 12 | "double" : False, # $$ 13 | "eqnarray" : False, 14 | "IEEE" : False, 15 | } 16 | 17 | rgx = { 18 | #"single" : re.compile(r"\$[^$]"), 19 | "double" : re.compile(r"\$\$"), 20 | "eqnarray" : re.compile(r"\\(begin|end){eqnarray\*?}"), 21 | "IEEE" : re.compile(r"\\(begin|end){IEEEeqnarray\*?}"), 22 | "input" : re.compile(r"\\input{([^}]*)}") 23 | } 24 | 25 | replacements = { 26 | "(": r"\left(", 27 | "[": r"\left[", 28 | ")": r"\right)", 29 | "]": r"\right]", 30 | } 31 | 32 | def transform( in_str ): 33 | for k,v in replacements.items(): 34 | in_str = in_str.replace(k,v) 35 | return in_str 36 | 37 | def printChunk( in_str, mode, start, end ): 38 | chunk = in_str[start:end] 39 | if mode: 40 | # Apply transformations 41 | chunk = transform( chunk ) 42 | 43 | sys.stdout.write( "%s"%( chunk ) ) 44 | 45 | def parseTeX( in_str ): 46 | """Parse TeX""" 47 | global mode 48 | 49 | start = 0 50 | # Check for math mode markers - else just print the string 51 | # BUG: Only one marker per line 52 | for marker in rgx.keys(): 53 | if rgx[marker].findall( in_str ): 54 | if marker == "input": 55 | fs = rgx[marker].findall( in_str ) 56 | for f in fs: 57 | f = f.strip() + ".tex" 58 | sys.stderr.write( "Parsing dependent file %s\n"%( f ) ) 59 | contents = open( f ).read() 60 | parseTeX( contents ) 61 | break 62 | else: 63 | for m in rgx[marker].finditer( in_str ): 64 | # Keep printing and toggling. 65 | printChunk( in_str, mode[marker], start, m.end()-1 ) 66 | mode[marker] = not mode[marker] 67 | start = m.end()-1 68 | printChunk( in_str, mode[marker], start, len(in_str)) 69 | break 70 | else: 71 | for marker in mode.keys(): 72 | if mode[marker]: 73 | printChunk( in_str, mode[marker], start, len(in_str)) 74 | break 75 | else: 76 | sys.stdout.write( "%s"%(in_str) ) 77 | 78 | if __name__ == "__main__": 79 | 80 | try: 81 | while True: 82 | in_str = raw_input() 83 | parseTeX(in_str) 84 | sys.stdout.write("\n") 85 | except EOFError: 86 | pass 87 | 88 | -------------------------------------------------------------------------------- /doc/project-report/texify: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # Equation Beautifier 3 | # Arun Chaganty <arunchaganty@gmail.com> 4 | # Replaces [],() with \left( \right), etc. 
in math modes 5 | # 6 | 7 | import re 8 | import sys 9 | 10 | mode = { 11 | #"single" : False, # $ 12 | "double" : False, # $$ 13 | "eqnarray" : False, 14 | "IEEE" : False, 15 | } 16 | 17 | rgx = { 18 | #"single" : re.compile(r"\$[^$]"), 19 | "double" : re.compile(r"\$\$"), 20 | "eqnarray" : re.compile(r"\\(begin|end){eqnarray\*?}"), 21 | "IEEE" : re.compile(r"\\(begin|end){IEEEeqnarray\*?}"), 22 | "input" : re.compile(r"\\input{([^}]*)}") 23 | } 24 | 25 | replacements = { 26 | "(": r"\left(", 27 | "[": r"\left[", 28 | ")": r"\right)", 29 | "]": r"\right]", 30 | } 31 | 32 | def transform( in_str ): 33 | for k,v in replacements.items(): 34 | in_str = in_str.replace(k,v) 35 | return in_str 36 | 37 | def printChunk( in_str, mode, start, end ): 38 | chunk = in_str[start:end] 39 | if mode: 40 | # Apply transformations 41 | chunk = transform( chunk ) 42 | 43 | sys.stdout.write( "%s"%( chunk ) ) 44 | 45 | def parseTeX( in_str ): 46 | """Parse TeX""" 47 | global mode 48 | 49 | start = 0 50 | # Check for math mode markers - else just print the string 51 | # BUG: Only one marker per line 52 | for marker in rgx.keys(): 53 | if rgx[marker].findall( in_str ): 54 | if marker == "input": 55 | fs = rgx[marker].findall( in_str ) 56 | for f in fs: 57 | f = f.strip() + ".tex" 58 | sys.stderr.write( "Parsing dependent file %s\n"%( f ) ) 59 | contents = open( f ).read() 60 | parseTeX( contents ) 61 | break 62 | else: 63 | for m in rgx[marker].finditer( in_str ): 64 | # Keep printing and toggling. 65 | printChunk( in_str, mode[marker], start, m.end()-1 ) 66 | mode[marker] = not mode[marker] 67 | start = m.end()-1 68 | printChunk( in_str, mode[marker], start, len(in_str)) 69 | break 70 | else: 71 | for marker in mode.keys(): 72 | if mode[marker]: 73 | printChunk( in_str, mode[marker], start, len(in_str)) 74 | break 75 | else: 76 | sys.stdout.write( "%s"%(in_str) ) 77 | 78 | if __name__ == "__main__": 79 | 80 | try: 81 | while True: 82 | in_str = raw_input() 83 | parseTeX(in_str) 84 | sys.stdout.write("\n") 85 | except EOFError: 86 | pass 87 | 88 | -------------------------------------------------------------------------------- /src/Environments/ArbitraryNavigationOptions.py: -------------------------------------------------------------------------------- 1 | """ 2 | ArbitraryNavigationOptions Environment 3 | """ 4 | 5 | import numpy as np 6 | import networkx as nx 7 | import pdb 8 | 9 | from Environment import * 10 | import OptionGenerator 11 | from ArbitraryNavigation import ArbitraryNavigation 12 | 13 | class ArbitraryNavigationOptions( ): 14 | 15 | @staticmethod 16 | def create( height, width, scheme = 'none', count = 20, *args ): 17 | """ 18 | @spec - Specification (size, endpoints, barriers); either exactly 19 | specified in a file, or with numeric values in a list 20 | @option_scheme - none|manual|optimal|small-world|random|ozgur's betweenness|ozgur's randomness|end 21 | @n_actions - Number of steps that need to taken 22 | comment : optimal(shortest path to destination)??|random|ozgur's betweenness|ozgur's randomness 23 | """ 24 | 25 | env = ArbitraryNavigation.create( height, width ) 26 | g = env.to_graph() 27 | gr = g.reverse() 28 | 29 | # Percentage 30 | if isinstance(count,str): 31 | count = int(count[:-1]) 32 | count = count*env.S/100 33 | 34 | # Add options for all the optimal states 35 | O = [] 36 | if scheme == "none": 37 | pass 38 | elif scheme == "random-node": 39 | O = OptionGenerator.optimal_options_from_random_nodes( env, count, *args ) 40 | elif scheme == "random-path": 41 | O = 
OptionGenerator.optimal_options_from_random_paths( env, count, *args ) 42 | elif scheme == "betweenness": 43 | O = OptionGenerator.optimal_options_from_betweenness( env, count, *args ) 44 | elif scheme == "small-world": 45 | O = OptionGenerator.optimal_options_from_small_world( env, count, *args ) 46 | elif scheme == "betweenness+small-world": 47 | O = OptionEnvironment.optimal_options_from_betweenness( env, count ) 48 | count_ = count - len( O ) 49 | O += OptionEnvironment.optimal_options_from_small_world( env, count_, *args ) 50 | elif scheme == "load": 51 | O = OptionGenerator.options_from_file( *args )[:count] 52 | else: 53 | raise NotImplemented() 54 | 55 | return OptionEnvironment( ArbitraryNavigationOptions, env.S, env.A, env.P, env.R, env.R_bias, env.start_set, env.end_set, O ) 56 | 57 | @staticmethod 58 | def reset_rewards( env, road_map ): 59 | O = env.O 60 | env = ArbitraryNavigation.reset_rewards( env, road_map ) 61 | return OptionEnvironment( ArbitraryNavigationOptions, env.S, env.A, env.P, env.R, env.R_bias, env.start_set, env.end_set, O ) 62 | 63 | -------------------------------------------------------------------------------- /doc/ewrl-abstract/comments.txt: -------------------------------------------------------------------------------- 1 | *1) Balaraman Ravindran, you have it flipped around. 2 | 3 | *2) McGovern, Barto is more of a standard reference on Bottleneck states. 4 | 5 | *3) Graph partitioning based methods, if you had to include one reference, I would suggest Shie Mannor's work. 6 | 7 | -4) Ozgur looks at her betweenness work as also having been inspired by SNA. :) 8 | 9 | -5) Kleinberg talks about distributed navigation property. Does he mention that the ability to navigate the network is what constitutes "small-world"-ness? 10 | 11 | *6) I don't understand Fig 1(b). Is that cumulative return? If so, either you have to mention that explicitly. or redraw it so that it is instantaneous (i.e., only the current epoch) return. The latter is the usual convention when not analyzing regret. :). Any RL person looking at this graph is going to assume that the methods initially unlearn and then start learning. :). 12 | 13 | ?7) Another issue with the graph. Does this imply that even on the rooms world task, betweenness does worse than "no options". Any explanation for that? Does this have arbitrary goal states anywhere in the rooms and not just at the doorways? Even then Doina showed that having good options (i.e., doorways) beats primitive actions. Does that imply that the betweenness method doesn't find the doorways? 14 | 15 | * 8) "Reaching bottlenecks alone is insufficient, especially when high value states do not lie on or near bottlenecks, as in arbitrary navigation tasks." Now that is a little tricky. What is your definition of a bottle neck? Also, how do bottleneck detection methods operate? For e.g., in McGovern's work, she only looks at bottleneck states on successful trajectories, that are not there in unsuccessful ones. But this limits the usefulness of the option only to those tasks from which the options were derived. So a sentence or two on how we interpret bottlenecks and usefulness might be in order here. 16 | 17 | ?9) I ask for my own intuition. What determines the dimension of the lattice? The max. no. of neighbors that a node has? Have you looked at the playroom domain? From the IMRL paper (Satinder, Barto, Chatnanez(?)). What would be dimension of the lattice there? How about the game world that Pradyot uses for his experiments? 
How about the C-space of a robot arm? If it is a 5-DoF robot, would it be a 5-d lattice? 18 | 19 | *10) State of maximal value: Is that locally maximal or globally maximal? In the true optimal value fn. or the current value fn. estimate? I know the answer, since I asked you. Maybe a line to that effect in the paper? 20 | 21 | 11) The document is very well written. I would be hard pressed to leave out anything, if you ask me. :). So fitting in the rest might be a problem. One possibility is to get rid of some of the white space below the title, etc. Also, given the relaxed nature of the reviewing, etc., I am sure they will not mind a little over run, like you have now. :). 22 | -------------------------------------------------------------------------------- /doc/aamas/algo.tex: -------------------------------------------------------------------------------- 1 | \section{Options from Experience} 2 | \label{sec:algo} 3 | 4 | In \secref{sec:theory}, we remarked that we need to generate $O(|S|)$ 5 | options. Given the scale of this number, we require an algorithm that 6 | generates options efficiently within a budget of training epochs in order 7 | for small world options to be practical. Drawing insight from the proof 8 | of \thmref{thm:small-world}, we note that the objective of small world options is to 9 | bring the agent into an exponentially smaller neighbourhood of the 10 | maximal value state. This suggests that cheaply generated options may 11 | still be acceptable. 12 | 13 | The algorithm (\algoref{algo:small-world-experience}) we propose takes 14 | a given MDP $\mdp$ and trains an agent to learn $T$ different tasks 15 | (i.e. different $\rewards$) on it, evenly dividing the epoch budget 16 | amongst them. With each learned task, we will certainly have a good policy 17 | for path options from any state to the state of maximal value, $M_v$. 18 | However, we observe that we will also have a good policy for a path option 19 | from $u$ to $v$ if the path is `along the gradient' of $Q$, i.e. when 20 | $V(u) < V(v) < V(M_v)$. Observing that $V(s) \approx 21 | Q(s,\pi(s))$ for the greedy policy $\pi$, we detail the algorithm to construct options from the 22 | $Q$-value function in \algoref{algo:qoptions}. We use this algorithm to 23 | construct many options from a single task solution.
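For concreteness, the following minimal Python sketch mirrors the QOptions procedure of \algoref{algo:qoptions} below, which remains the authoritative description. The dictionary representation of $Q$, the \texttt{sample\_target} callable standing in for sampling $s'$ from $P_r$, and the option-as-tuple layout are illustrative assumptions rather than the interfaces of the accompanying implementation.

\begin{verbatim}
import random

def q_options(Q, sample_target, n, rng=random):
    """Options from a learned Q-value function (cf. QOptions).

    Q             -- dict: state -> dict: action -> estimated value
    sample_target -- callable: state -> candidate subgoal (stands in for P_r)
    n             -- number of options to return
    """
    # Greedy policy and the state values it implies.
    pi = {s: max(acts, key=acts.get) for s, acts in Q.items()}
    V = {s: Q[s][pi[s]] for s in Q}

    options = []
    for s in Q:
        t = sample_target(s)
        if V[t] > V[s]:
            # Initiate at s, follow the greedy policy, and terminate at t
            # or at any state already more valuable than t.
            beta = {t} | {u for u in Q if V[u] > V[t]}
            options.append(({s}, pi, beta))

    return rng.sample(options, min(n, len(options)))
\end{verbatim}

In practice, \texttt{sample\_target} would draw $s'$ at lattice distance $d$ from $s$ with probability proportional to $d^{-r}$, matching the $P_r$ sampling step of \algoref{algo:qoptions}.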
24 | 25 | \begin{algorithm}[H] 26 | \caption{Small World Options from Experience} 27 | \label{algo:small-world-experience} 28 | \begin{algorithmic}[1] 29 | \REQUIRE $\mdp$, $\Rewards$, $r$, $n$, epochs, $T$ 30 | \STATE $O \gets \emptyset$ 31 | \FOR{ $i= 0 \to T$ } 32 | \STATE $\rewards \sim \Rewards$ 33 | \STATE $Q \gets $ Solve $\mdp$ with $\rewards$ using 34 | $\frac{\textrm{epochs}}{T}$ epochs 35 | \STATE $O' \gets $ QOptions( $Q$, $r$, 36 | $\frac{n}{T}$ ) 37 | \STATE $O \gets O \cup O'$ 38 | \ENDFOR 39 | \RETURN A random subset of $n$ options from $O$ 40 | \end{algorithmic} 41 | \end{algorithm} 42 | \begin{algorithm}[H] 43 | \caption{{\bf QOptions}: Options from a $Q$-Value Function} 44 | \label{algo:qoptions} 45 | \begin{algorithmic}[1] 46 | \REQUIRE $Q$, $r$, $n$ 47 | \STATE $O \gets \emptyset$ 48 | \STATE $\pi \gets $ greedy policy from $Q$ 49 | \FORALL{ $s$ in $\states$ } 50 | \STATE Choose an $s'$ according to $P_r$ 51 | \IF{ $Q(s', \pi(s')) > Q(s, \pi(s))$ } 52 | \STATE $O \gets O \cup \tuple{\{s\}, \pi, \{s'\} \cup \{t \mid Q(s',\pi(s')) < Q(t, \pi(t))\} }$ 53 | \ENDIF 54 | \ENDFOR{ $s$ in $\states$ } 55 | \RETURN A random subset of $n$ options from $O$ 56 | \end{algorithmic} 57 | \end{algorithm} 58 | 59 | We note here except for sampling $s'$ from $P_r$, we do not require any 60 | knowledge of the MDP, nor do we need to construct a local model of the 61 | same. $s'$ can approximately be sampled using $\frac{E[l]}{\log(|S|)}$ 62 | in place of $P_r$. 63 | 64 | -------------------------------------------------------------------------------- /doc/aamas/paper.tex: -------------------------------------------------------------------------------- 1 | \documentclass{aamas2012} 2 | % Setting letter size with pdflatex 3 | \pdfpagewidth=8.5truein 4 | \pdfpageheight=11truein 5 | 6 | \usepackage{float} 7 | \usepackage{makeidx} 8 | \usepackage{amsmath} 9 | \usepackage{amsfonts} 10 | \usepackage[retainorgcmds]{IEEEtrantools} 11 | \usepackage{thumbpdf} 12 | \usepackage{multicol} 13 | \usepackage{graphicx} 14 | \usepackage{listings} 15 | \usepackage{algorithm} 16 | \usepackage{algorithmic} 17 | \usepackage{tikz} 18 | \usepackage{subfigure} 19 | 20 | \usepackage{hyperref} 21 | \hypersetup{ 22 | pdftitle = {Learning in a Small World}, 23 | pdfauthor = {Arun Tejasvi Chaganty, Prateek Gaur, Balaraman Ravindran}, 24 | colorlinks = true, 25 | linkcolor = red, 26 | urlcolor = red, 27 | citecolor = blue, 28 | } 29 | 30 | \input{macros} 31 | \newtheorem{theorem}{Theorem} 32 | \newtheorem{lemma}{Lemma} 33 | \newdef{definition}{Definition} 34 | \newdef{example}{Example} 35 | \newcommand{\draft}[1]{\textbf{TODO: #1}} 36 | 37 | \title{Learning in a Small World} 38 | 39 | % Author information 40 | \numberofauthors{3} 41 | \author{ 42 | \alignauthor 43 | Paper 280 44 | % Commented until we get accepted 45 | % Arun Tejasvi Chaganty\\ 46 | % \affaddr{Deptt. of Computer Science and Engineering,}\\ 47 | % \affaddr{IIT Madras}\\ 48 | % \affaddr{Chennai, India - 600036}\\ 49 | % \email{arunc@cse.iitm.ac.in} 50 | % \alignauthor 51 | % Prateek Gaur\\ 52 | % \affaddr{Deptt. of Computer Science and Engineering,}\\ 53 | % \affaddr{IIT Madras}\\ 54 | % \affaddr{Chennai, India - 600036}\\ 55 | % \email{prtkgaur@cse.iitm.ac.in} 56 | % \alignauthor 57 | % Balaraman Ravindran\\ 58 | % \affaddr{Deptt. 
of Computer Science and Engineering,}\\ 59 | % \affaddr{IIT Madras}\\ 60 | % \affaddr{Chennai, India - 600036}\\ 61 | % \email{ravi@cse.iitm.ac.in} 62 | } 63 | 64 | \begin{document} 65 | 66 | \maketitle 67 | %\pagebreak 68 | 69 | % Outline 70 | \input{abstract} 71 | 72 | \category{I.2.6}{Artificial Intelligence}{Learning} 73 | \category{I.2.8}{Artificial Intelligence}{Problem Solving, Control Methods and Search} 74 | \terms{Algorithms, Theory, Experimentation} 75 | \keywords{reinforcement learning, options framework, social network analysis, small world phenomenon} 76 | 77 | % Introduction: Motivate the problem (using lifelong learning, 78 | % transfer) 79 | % Emphasise on not blowing up state space, summarise results 80 | \input{intro} 81 | % Background: Define MDPs, Options 82 | \input{background} 83 | % Define the algorithm. Prove the interesting result 84 | \input{theory} 85 | % Describe the algorithms to: a) generate small world options b) 86 | % extract from learning episodes 87 | \input{algo} 88 | % Experiment section 89 | \input{experiments} 90 | % Discuss conclusions 91 | \input{conclusions} 92 | 93 | \bibliographystyle{abbrv} 94 | \bibliography{ref}{} 95 | 96 | \balancecolumns 97 | 98 | \appendix 99 | \input{small-world-theory} 100 | \balancecolumns 101 | 102 | \end{document} 103 | 104 | -------------------------------------------------------------------------------- /doc/proposal/intro.tex: -------------------------------------------------------------------------------- 1 | \section{Introduction} 2 | \label{sec:intro} 3 | 4 | % Slow learning in RL 5 | In large domains, RL agents generally require a large number of samples to learn 6 | a good policy. The options framework proposed by Sutton, Precup and Singh 7 | \cite{SuttonPrecupSingh1998} provides extended actions for which a policy is 8 | already learnt, reducing the complexity of the learning task and generally 9 | making learning faster. An open question in the options framework is 10 | discovering the options themselves. There has been substantial work on learning 11 | options, mainly focussed on identifying ``bottleneck'' states, either 12 | empirically as in the work by Stolle \cite{Stolle}, or more recently, using 13 | graph theoretic methods like betweenness \cite{Simsek} or graph partitions 14 | \cite{Simsek2005} explored by Simsek and Barto. 15 | 16 | We would like to test an alternative hypothesis: we memorise many actions, 17 | not necessarily bottleneck ones, and put them together; based on their 18 | necessity in solving problems, these actions are either reinforced or gradually 19 | forgotten. The actions could be of varying complexity, and it is intuitive to 20 | expect that we probably learn far more {\em simple} actions than 21 | complex ones. In the context of the options framework, the ``complex actions'' 22 | correspond to options. 23 | 24 | Our proposed approach is to use randomly constructed options that create a 25 | `short-cut' between states, forming a sort of `small-world' in the domain. This 26 | approach can be viewed as an extension of Kleinberg's popular model 27 | \cite{Kleinberg} in the Social Network Analysis field, and we would like to note 28 | that RL domains are very grid-like as well. The analogy is further motivated by 29 | observing that the policy followed by an agent in the MDP framework is like 30 | distributed search; we are interested in moving from our source state to the 31 | destination (goal) state using only information available locally, i.e.
the 32 | value function. We leave addressing the dynamics of such random options, i.e. 33 | when options are added or removed, as future work. 34 | 35 | As part of our work, we would like to address the following interesting questions, 36 | \begin{enumerate} 37 | \item 38 | What is the ``distance'' analogue in the RL domain? How can the dimension 39 | be characterised? 40 | \item 41 | How many such random edges need to be added in order to learn new problems 42 | quickly? How does shaping the number of ``long short-cuts'' affect 43 | performance? Can we use similar results as proposed in Kleinberg's original 44 | work? 45 | \item 46 | Can we apply this technique to create sub-goals in ``local'' state transition 47 | graphs as in \cite{Simsek2005}? 48 | \end{enumerate} 49 | 50 | Finally, we in order to evaluate the performance of our approach, we will 51 | present the learning performance of agents trained on the Tic-Tac-Toe, Taxi and 52 | Playroom domains, using the intra-option and macro-Q learning algorithms. We 53 | will compare our performance with the agent without options, as well as an 54 | agent trained using the options learnt using betweeness measures. 55 | 56 | -------------------------------------------------------------------------------- /src/make_options.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | RL Framework 4 | Authors: Arun Chaganty 5 | Make small world options 6 | 7 | Take an environment, and an agent. Simulate the agent on various 8 | reward instances of the environment, and use the generated policies to 9 | construct options. 10 | """ 11 | 12 | import re 13 | import numpy as np 14 | import pickle 15 | 16 | from Agent import * 17 | from Environment import * 18 | import OptionGenerator 19 | import Runner 20 | 21 | def main( epoch_budget, count, gen_type, gen_args, agent_type, agent_args, env_type, env_args, file_prefix ): 22 | """ 23 | @arg epochs: Maximum number of epochs to use 24 | @arg count: Number of options to learn 25 | @arg agent_type: String name of agent 26 | @arg agent_args: Arguments to the agent constructor 27 | @arg env_type: String name of environment 28 | @arg env_args: Arguments to the environment constructor 29 | """ 30 | 31 | env = env_type.create( *env_args ) 32 | 33 | if gen_type == "betweenness": 34 | options = OptionGenerator.learn_options_from_betweenness( epoch_budget, count, env, env_args, agent_type, agent_args ) 35 | elif gen_type == "optimal-betweenness": 36 | options = OptionGenerator.optimal_options_from_betweenness( env, count ) 37 | elif gen_type == "small-world": 38 | options = OptionGenerator.learn_options_from_small_world( epoch_budget, count, env, env_args, agent_type, agent_args, *gen_args ) 39 | elif gen_type == "optimal-small-world": 40 | options = OptionGenerator.optimal_options_from_small_world( env, count, *gen_args ) 41 | else: 42 | raise NotImplemented() 43 | 44 | # Save options 45 | f = open("%s.options"%( file_prefix ), "w") 46 | pickle.dump( options, f ) 47 | f.close() 48 | 49 | def print_help(args): 50 | """Print help""" 51 | print "Usage: %s <epoch_budget> <count> <gen:args> <agent:args> <environment:args>" % (args[0]) 52 | 53 | def convert(arg): 54 | """Convert string arguments to numbers if possible""" 55 | if arg.isdigit(): 56 | return int(arg) 57 | elif re.match("[0-9]*\.[0-9]+", arg): 58 | return float(arg) 59 | elif re.match("[0-9]*e[0-9]+", arg): 60 | return int(float(arg)) 61 | else: 62 | return arg 63 | 64 | if __name__ == 
"__main__": 65 | import sys 66 | def main_wrapper(): 67 | """Wrapper around the main call - converts input arguments""" 68 | if "-h" in sys.argv[1:]: 69 | print_help(sys.argv) 70 | sys.exit( 0 ) 71 | elif len(sys.argv) <> 7: 72 | print "Invalid number of arguments" 73 | print_help(sys.argv) 74 | sys.exit( 1 ) 75 | else: 76 | epoch_budget = convert( sys.argv[1] ) 77 | count = convert( sys.argv[2] ) 78 | 79 | gen_str = sys.argv[3].split(":") 80 | gen_args = map( convert, gen_str[1:] ) 81 | gen_type = gen_str[0] 82 | 83 | agent_str = sys.argv[4].split(":") 84 | agent_args = map( convert, agent_str[1:] ) 85 | agent_type = Runner.load_agent( agent_str[0] ) 86 | 87 | env_str = sys.argv[5].split(":") 88 | env_args = map( convert, env_str[1:] ) 89 | env_type = Runner.load_env( env_str[0] ) 90 | 91 | file_prefix = sys.argv[ 6 ] 92 | 93 | main( epoch_budget, count, gen_type, gen_args, agent_type, agent_args, env_type, env_args, file_prefix ) 94 | 95 | main_wrapper() 96 | 97 | -------------------------------------------------------------------------------- /doc/ewrl-poster/poster.tex: -------------------------------------------------------------------------------- 1 | % LaTeX Poster vim:ts=2 sw=2 2 | % 3 | \documentclass[final,hyperref={pdfpagelabels=false}]{beamer} 4 | \mode<presentation>{\usetheme{I6pd2}} 5 | 6 | \usepackage{grffile} 7 | \usepackage[english]{babel} 8 | \usepackage[latin1]{inputenc} 9 | \usepackage{amsmath,amsthm, amssymb, latexsym} 10 | \usepackage{algorithm,algorithmic} 11 | 12 | \boldmath 13 | \usepackage{graphicx} 14 | \usepackage[orientation=portrait,size=a2,scale=1.4]{beamerposter} 15 | % change list indention level 16 | % \setdefaultleftmargin{3em}{}{}{}{}{} 17 | %\usepackage{snapshot} % will write a .dep file with all dependencies, allows for easy bundling 18 | 19 | \usepackage{array,booktabs,tabularx} 20 | \newcolumntype{Z}{>{\centering\arraybackslash}X} % centered tabularx columns 21 | \newcommand{\pphantom}{\textcolor{ta3aluminium}} % phantom introduces a vertical space in p formatted table columns??!! 22 | 23 | %\listfiles 24 | 25 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 26 | \graphicspath{{figures/}} 27 | \input{macros} 28 | 29 | \title{Learning in a Small World} 30 | \author{ Arun Tejasvi Chaganty \and Prateek Gaur \and Balaraman Ravindran \inst{1} } 31 | \institute[Indian Institute of Technology Madras]{ Reconfigurable and 32 | Intelligent Systems Engineering Lab, \\ Deptt of Computer Science and 33 | Engineering, \\ IIT Madras, Chennai, India - 600036 } 34 | %\date[Sep. 8th, 2009]{Sep. 8th, 2009} 35 | 36 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 37 | % You wlll have to manually set the height of the page 38 | % 105cm is good for A0. 
52 for A2, 22 for A3 39 | \newlength{\columnheight} 40 | \setlength{\columnheight}{50cm} 41 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 42 | \begin{document} 43 | 44 | % Everything is contained in this frame 45 | \begin{frame} 46 | % Standard two column layout 47 | \begin{columns} 48 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 49 | % Column 1 50 | \begin{column}{.49\textwidth} 51 | \begin{beamercolorbox}[center,wd=\textwidth]{postercolumn} 52 | \begin{minipage}[T]{.95\textwidth} % tweaks the width, makes a new \textwidth 53 | \parbox[t][\columnheight]{\textwidth}{ % must be some better way to set the the height, width and textwidth simultaneously 54 | % Since all columns are the same length, it is all nice and tidy. You have to get the height empirically 55 | % ---------------------------------------------------------% 56 | % fill each column with content 57 | \input{column1} 58 | } 59 | \end{minipage} 60 | \end{beamercolorbox} 61 | \end{column} 62 | 63 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 64 | % Column 2 65 | \begin{column}{.49\textwidth} 66 | \begin{beamercolorbox}[center,wd=\textwidth]{postercolumn} 67 | \begin{minipage}[T]{.95\textwidth} % tweaks the width, makes a new \textwidth 68 | \parbox[t][\columnheight]{\textwidth}{ % must be some better way to set the the height, width and textwidth simultaneously 69 | % Since all columns are the same length, it is all nice and tidy. You have to get the height empirically 70 | % ---------------------------------------------------------% 71 | % fill each column with content 72 | \input{column2} 73 | } 74 | % ---------------------------------------------------------% 75 | % end the column 76 | \end{minipage} 77 | \end{beamercolorbox} 78 | \end{column} 79 | % ---------------------------------------------------------% 80 | % end the column 81 | \end{columns} 82 | \vskip2ex 83 | \end{frame} 84 | 85 | \end{document} 86 | 87 | -------------------------------------------------------------------------------- /src/main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | RL Framework 4 | Authors: Arun Chaganty 5 | Entry point 6 | """ 7 | 8 | import re 9 | import numpy as np 10 | 11 | from Agent import * 12 | from Environment import * 13 | import Runner 14 | from ProgressBar import ProgressBar 15 | 16 | def main( iterations, ensembles, episodes, agent_type, agent_args, env_type, env_args, file_prefix ): 17 | """RL Testbed. 
18 | @arg iterations: Number of environments to average over 19 | @arg ensembles: Number of bots to average over 20 | @arg episodes: Number of episodes to run for 21 | @arg agent_type: String name of agent 22 | @arg agent_args: Arguments to the agent constructor 23 | @arg env_type: String name of environment 24 | @arg env_args: Arguments to the environment constructor 25 | """ 26 | # Load agent and environment 27 | 28 | progress = ProgressBar( 0, ensembles*iterations, mode='fixed' ) 29 | # Needed to prevent glitches 30 | oldprog = str(progress) 31 | 32 | # Counters 33 | ret = np.zeros( episodes, dtype=float ) 34 | min_, max_ = np.inf * np.ones( episodes, dtype=float) , -np.inf * np.ones( episodes, dtype=float) 35 | var = np.zeros( episodes, dtype=float ) 36 | 37 | env = env_type.create( *env_args ) 38 | for i in xrange( 1, iterations+1 ): 39 | env = env.domain.reset_rewards( env, *env_args ) 40 | 41 | ret_ = np.zeros( episodes, dtype=float ) 42 | # Initialise environment and agent 43 | for j in xrange( 1, ensembles+1 ): 44 | agent = agent_type( env.Q, *agent_args ) 45 | ret__ = Runner.run( env, agent, episodes ) 46 | ret__ = np.cumsum( ret__ ) 47 | # Add to ret_ 48 | ret_ += (ret__ - ret_) / j 49 | 50 | # print progress 51 | progress.increment_amount() 52 | if oldprog != str(progress): 53 | print progress, "\r", 54 | sys.stdout.flush() 55 | oldprog=str(progress) 56 | 57 | ret += (ret_ - ret) / i 58 | min_ = np.min( np.vstack( ( min_, ret_ ) ), axis=0 ) 59 | max_ = np.max( np.vstack( ( max_, ret_ ) ), axis=0 ) 60 | 61 | var_ = np.power( ret_, 2 ) 62 | var += (var_ - var) / i 63 | print "\n" 64 | 65 | var = np.sqrt( var - np.power( ret, 2 ) ) 66 | 67 | f = open("%s-return.dat"%( file_prefix ), "w") 68 | # Print ret 69 | for i in xrange( len( ret ) ): 70 | f.write( "%d %f %f %f %f\n"%( i+1, ret[ i ], min_[i], max_[i], var[ i ] ) ) 71 | f.close() 72 | 73 | def print_help(args): 74 | """Print help""" 75 | print "Usage: %s <episodes> <episodes> <agent:args> <environment:args>" % (args[0]) 76 | 77 | def convert(arg): 78 | """Convert string arguments to numbers if possible""" 79 | if arg.isdigit(): 80 | return int(arg) 81 | elif re.match("[0-9]*\.[0-9]+", arg): 82 | return float(arg) 83 | elif re.match("[0-9]*e[0-9]+", arg): 84 | return int(float(arg)) 85 | else: 86 | return arg 87 | 88 | if __name__ == "__main__": 89 | import sys 90 | def main_wrapper(): 91 | """Wrapper around the main call - converts input arguments""" 92 | if "-h" in sys.argv[1:]: 93 | print_help(sys.argv) 94 | sys.exit( 0 ) 95 | elif len(sys.argv) <> 7: 96 | print "Invalid number of arguments" 97 | print_help(sys.argv) 98 | sys.exit( 1 ) 99 | else: 100 | iterations = convert( sys.argv[1] ) 101 | ensembles = convert( sys.argv[2] ) 102 | episodes = convert( sys.argv[3] ) 103 | 104 | agent_str = sys.argv[4].split(":") 105 | agent_args = map( convert, agent_str[1:] ) 106 | agent_type = Runner.load_agent( agent_str[0] ) 107 | 108 | env_str = sys.argv[5].split(":") 109 | env_args = map( convert, env_str[1:] ) 110 | env_type = Runner.load_env( env_str[0] ) 111 | 112 | file_prefix = sys.argv[ 6 ] 113 | 114 | main( iterations, ensembles, episodes, agent_type, agent_args, env_type, env_args, file_prefix ) 115 | 116 | main_wrapper() 117 | 118 | -------------------------------------------------------------------------------- /doc/aamas/small-world-theory.tex: -------------------------------------------------------------------------------- 1 | \section{Small Worlds} 2 | \label{sec:small-world-theory} 3 | 4 | % Introduction and motivation 
for the proof 5 | In this section we will tackle the proof of the main theorem in 6 | \secref{sec:theory}, 7 | 8 | \begin{theorem} 9 | % 10 | Let $f : V \to \Re$ be a function embedded on the graph $\graph(V,E)$, 11 | such that, $\kappa_1 \|u-v\| - c_1 \le \|f(u) - f(v)\| \le \kappa_2 12 | \|u - v\| - c_2$, where $0 \le \kappa_1 \le \kappa_2$, and $0 \le c_2 13 | \le \frac{c_1}{2}$. Let $M_f$ be the global maxima of $f$. Let 14 | \egreedyalgo be an $\epsilon$-greedy algorithm with respect to $f$, 15 | i.e. an algorithm which chooses with probability $1-\epsilon$ to 16 | transit to the neighbouring state closest to $M_f$, i.e. $N(u) 17 | = \argmin_v \|f(v) - f(M_f)\|$. 18 | 19 | If $\graph(V,E)$ is $r$-dimensional lattice, and contains a long 20 | distance edge distributed according to $P_r: p(u,v) \propto 21 | \|u-v\|^{-r}$, then \egreedyalgo takes $O( (\log |V|)^2 )$ steps to 22 | reach $M_f$. 23 | \end{theorem} 24 | \begin{proof} 25 | 26 | This result is a simple extension of Kleinberg's result in 27 | \cite{Kleinberg2000}, and follows the proof presented there, albeit with 28 | the somewhat cleaner notation and formalism of \cite{Martel2004}. We 29 | begin by defining the necessary formalism to present the proof. 30 | 31 | \begin{definition} 32 | Let us define $\ball_l(u)$ to be the set of nodes contained within 33 | a ``ball'' of radius $l$ centered at $u$, i.e. $\ball_l(u) = \{ v \mid 34 | \|u - v\| < l \}$, and $\sball_l(u)$ to be the set of nodes on its 35 | surface, i.e. $\sball_l(u) = \{ v \mid \|u - v\| = l \}$. 36 | 37 | Given a function $f:V \to \Re$ embedded on $\graph(V,E)$, we analogously 38 | define $\ballf_l(u) = \{ v \mid \|f(u) - f(v)\| < l \}$. For notational 39 | convenience, we take $\ballf_l$ to be $\ballf_l(M_f)$. 40 | \end{definition} 41 | 42 | \begin{lemma} 43 | The inverse normalised coefficient for $p(u,v)$ is $c_u = \Theta( 44 | \log n )$, and $p(u,v) = \|u - v\|^{-r} \Theta(\log n)^{-1}$. 45 | \end{lemma} 46 | \begin{proof} 47 | \begin{eqnarray*} 48 | c_u &=& \sum_{v \ne u} \|u - v\|^{-r} \\ 49 | &=& \sum_{j=1}^{r(n-1)} \sball_j(u) j^{-r}. 50 | \end{eqnarray*} 51 | It can easily be shown that the $\sball_l(u) = \Theta( l^{k-1} )$. 52 | Thus, $c_u$ reduces to a harmonic sum, and hence is equal to 53 | $\Theta( \log n )$. The second part of the lemma follows as $p(u,v) 54 | = \frac{ \|u - v\|^{-r} }{c_u}$. 55 | \end{proof} 56 | 57 | We are now ready to prove that \egreedyalgo takes $O( (\log |V|)^2 )$ 58 | decisions. Let a node $u$ be in phase $j$ when $u \in \ballf_{2^{j+1}} 59 | \setminus \ballf_{2^{j}}$. The probability that phase $j$ will end this 60 | step is equal to the probability that $N(u) \in \ballf_{2^{j}}$. 61 | 62 | The size of $\ballf_{2^{j}}$ is at least $|\ball_{\frac{ 63 | 2^{j}+c_2}{\kappa_2}}| = \Theta(\frac{2^{j}+c_2}{\kappa_2})$. The 64 | distance between $u$ and a node in $\ballf_{2^{j}}$ is at most 65 | $\frac{2^{j+1} + c_1}{ \kappa_1 } + \frac{2^{j} + c_2}{\kappa_2} 66 | < 2(\frac{2^{j+1} + c_2}{\kappa_2})$. The probability of a link between 67 | these two nodes is at least $(\frac{2^{j+2} + 2 c_1}{\kappa_1})^{-r} 68 | \Theta(\log n)^{-1} $. 
Thus, 69 | 70 | \begin{eqnarray*} 71 | P(u, \ballf_{2^{j}} ) &\ge& \frac{(1-\epsilon)}{\Theta( \log n )} (\frac{2^{j}+c_2}{\kappa_2})^{r} \times (\frac{2^{j+2} + 2 c_1}{\kappa_1})^{-r} \\ 72 | &\ge& \frac{(1-\epsilon)}{\Theta( \log n )} \times (\frac{\kappa_1}{4\kappa_2} )^{r} \times ( \frac{ 1 + \frac{c_2}{2^{j}} }{ 1 + \frac{c_1}{2 \times 2^{j}} })^{r}\\ 73 | &\ge& \frac{(1-\epsilon)}{\Theta( \log n )} \times (\frac{\kappa_1}{4\kappa_2} )^{r} \times ( \frac{ 1 + c_2 }{ 1 + \frac{c_1}{2} })^{r} .\\ 74 | \end{eqnarray*} 75 | 76 | Let number of decisions required to leave phase $j$ be $X_j$. Then, 77 | \begin{eqnarray*} 78 | \E[X_j] &\le& \sum_{i=0}^{\infty} (1 - P(u, \ballf_{2^{j}} ))^i \\ 79 | &\le& \frac{1}{P(u, \ballf_{2^{j}} )} \\ 80 | &\le& \Theta( \log n ) \frac{1}{(1-\epsilon)} (\frac{4 \kappa_2}{\kappa_1})^{r} ( \frac{ 1 + \frac{c_1}{2} }{ 1 + c_2 })^{r}\\ 81 | &\le& \Theta( \log n ). 82 | \end{eqnarray*} 83 | Thus, it takes at most $O(\log n)$ decisions to leave phase $j$. By construction, there are at most $\log n$ 84 | phases, and thus at most $O((\log n)^2)$ decisions. 85 | \end{proof} 86 | 87 | -------------------------------------------------------------------------------- /doc/ewrl-poster/ewrl.bib: -------------------------------------------------------------------------------- 1 | @inproceedings{Simsek, 2 | author = {\c{S}im\c{s}ek, \"{O}zg\"{u}r and Barto, Andrew G}, 3 | booktitle = {NIPS}, 4 | file = {:home/teju/mendeley-papers/Şimşek, Barto/NIPS/Şimşek, Barto - 2008 - Skill characterization based on betweenness.pdf:pdf}, 5 | pages = {1--8}, 6 | title = {{Skill characterization based on betweenness}}, 7 | year = {2008} 8 | } 9 | 10 | @inproceedings{Martel2004, 11 | author = {Martel, Chip and Nguyen, Van}, 12 | booktitle = {PODC}, 13 | file = {:home/teju/mendeley-papers/Martel, Nguyen/PODC/Martel, Nguyen - 2004 - Analyzing Kleinberg ’s (and other) Small-world Models.pdf:pdf}, 14 | isbn = {1581138024}, 15 | keywords = {diameter,random graphs,routing,small-world network}, 16 | title = {{Analyzing Kleinberg's (and other) Small-world Models}}, 17 | volume = {2}, 18 | year = {2004} 19 | } 20 | 21 | @article{Kleinberg, 22 | author = {Kleinberg, Jon}, 23 | file = {:home/teju/mendeley-papers/Kleinberg/Unknown/Kleinberg - Unknown - The Small-World Phenomenon An Algorithmic Perspective.pdf:pdf}, 24 | pages = {1--14}, 25 | title = {{The Small-World Phenomenon : An Algorithmic Perspective}} 26 | } 27 | 28 | @article{Stolle, 29 | abstract = {Temporally extended actions (e.g., macro actions) have proven very useful in speeding up learning, ensuring robustness and building prior knowledge into AI systems. The options framework (Precup, 2000; Sutton, Precup \& Singh, 1999) provides a natural way of incorporating such actions into reinforcement learning systems, but leaves open the issue of how good options might be identi- fied. In this paper, we empirically explore a simple approach to creating options. The underlying assumption is that the agent will be asked to perform different goal-achievement tasks in an environment that is otherwise the same over time. Our approach is based on the intuition that “bottleneck” states, i.e. states that are frequently visited on system trajectories, could prove to be useful subgoals (e.g. McGovern \& Barto, 2001; Iba, 1989). We present empirical studies of this approach in two gridworld navigation tasks. One of the environments we explored contains bottleneck states, and the algo- rithm indeed finds these states, as expected. 
The second environment is an empty gridworld with no obstacles. Although the environment does not contain bottle- neck states, our approach still finds useful options, which essentially allow the agent to travel around the environment more quickly}, 30 | author = {Stolle, Martin and Precup, Doina}, 31 | file = {:home/teju/mendeley-papers/Stolle, Precup/Artificial Intelligence/Stolle, Precup - Unknown - Learning Options in Reinforcement Learning.pdf:pdf}, 32 | journal = {Artificial Intelligence}, 33 | title = {{Learning Options in Reinforcement Learning}} 34 | } 35 | 36 | @article{SuttonPrecupSingh1998, 37 | author = {Sutton, Richard S and Precup, Doina and Singh, Satinder}, 38 | file = {:home/teju/mendeley-papers/Sutton, Precup, Singh/Artificial Intelligence/Sutton, Precup, Singh - 1998 - Between MDPs and Semi-MDPs Learning , Planning , and Representing Knowledge at Multiple Temporal Scales at Multiple Temporal Scales.pdf:pdf}, 39 | journal = {Artificial Intelligence}, 40 | title = {{Between MDPs and Semi-MDPs : Learning , Planning , and Representing Knowledge at Multiple Temporal Scales at Multiple Temporal Scales}}, 41 | year = {1998} 42 | } 43 | 44 | @article{Simsek2005, 45 | address = {New York, New York, USA}, 46 | author = {\c{S}im\c{s}ek, \"{O}zg\"{u}r and Wolfe, Alicia P. and Barto, Andrew G.}, 47 | doi = {10.1145/1102351.1102454}, 48 | file = {:home/teju/mendeley-papers/Şimşek, Wolfe, Barto/Proceedings of the 22nd international conference on Machine learning - ICML '05/Şimşek, Wolfe, Barto - 2005 - Identifying useful subgoals in reinforcement learning by local graph partitioning.pdf:pdf}, 49 | isbn = {1595931805}, 50 | journal = {Proceedings of the 22nd international conference on Machine learning - ICML '05}, 51 | pages = {816--823}, 52 | publisher = {ACM Press}, 53 | title = {{Identifying useful subgoals in reinforcement learning by local graph partitioning}}, 54 | url = {http://portal.acm.org/citation.cfm?doid=1102351.1102454}, 55 | year = {2005} 56 | } 57 | 58 | @inproceedings{Menache, 59 | author = {Menache, Ishai and Mannor, Shie and Shimkin, Nahum}, 60 | booktitle = {ECML}, 61 | file = {:home/teju/mendeley-papers/Menache, Mannor, Shimkin/ECML/Menache, Mannor, Shimkin - 2002 - Q-Cut - Dynamic Discovery of Sub-Goals in Reinforcement Learning.pdf:pdf}, 62 | title = {{Q-Cut - Dynamic Discovery of Sub-Goals in Reinforcement Learning}}, 63 | year = {2002} 64 | } 65 | -------------------------------------------------------------------------------- /src/ProgressBar.py: -------------------------------------------------------------------------------- 1 | # A Python Library to create a Progress Bar. 2 | # Copyright (C) 2008 BJ Dierkes <wdierkes@5dollarwhitebox.org> 3 | # 4 | # This program is free software: you can redistribute it and/or modify 5 | # it under the terms of the GNU General Public License as published by 6 | # the Free Software Foundation, either version 3 of the License, or 7 | # (at your option) any later version. 8 | # 9 | # This program is distributed in the hope that it will be useful, 10 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | # GNU General Public License for more details. 13 | # 14 | # You should have received a copy of the GNU General Public License 15 | # along with this program. If not, see <http://www.gnu.org/licenses/>. 
16 | # 17 | # 18 | # This class is an improvement from the original found at: 19 | # 20 | # http://code.activestate.com/recipes/168639/ 21 | # 22 | 23 | import sys,os 24 | 25 | class ProgressBar: 26 | def __init__(self, min_value = 0, max_value = 100, width=77,**kwargs): 27 | self.char = kwargs.get('char', '#') 28 | self.mode = kwargs.get('mode', 'dynamic') # fixed or dynamic 29 | if not self.mode in ['fixed', 'dynamic']: 30 | self.mode = 'fixed' 31 | 32 | self.bar = '' 33 | self.min = min_value 34 | self.max = max_value 35 | self.span = max_value - min_value 36 | self.width = width 37 | self.amount = 0 # When amount == max, we are 100% done 38 | self.update_amount(0) 39 | 40 | 41 | def increment_amount(self, add_amount = 1): 42 | """ 43 | Increment self.amount by 'add_ammount' or default to incrementing 44 | by 1, and then rebuild the bar string. 45 | """ 46 | new_amount = self.amount + add_amount 47 | if new_amount < self.min: new_amount = self.min 48 | if new_amount > self.max: new_amount = self.max 49 | self.amount = new_amount 50 | self.build_bar() 51 | 52 | 53 | def update_amount(self, new_amount = None): 54 | """ 55 | Update self.amount with 'new_amount', and then rebuild the bar 56 | string. 57 | """ 58 | if not new_amount: new_amount = self.amount 59 | if new_amount < self.min: new_amount = self.min 60 | if new_amount > self.max: new_amount = self.max 61 | self.amount = new_amount 62 | self.build_bar() 63 | 64 | 65 | def build_bar(self): 66 | """ 67 | Figure new percent complete, and rebuild the bar string base on 68 | self.amount. 69 | """ 70 | diff = float(self.amount - self.min) 71 | percent_done = int(round((diff / float(self.span)) * 100.0)) 72 | 73 | # figure the proper number of 'character' make up the bar 74 | all_full = self.width - 2 75 | num_hashes = int(round((percent_done * all_full) / 100)) 76 | 77 | if self.mode == 'dynamic': 78 | # build a progress bar with self.char (to create a dynamic bar 79 | # where the percent string moves along with the bar progress. 
80 | self.bar = self.char * num_hashes 81 | else: 82 | # build a progress bar with self.char and spaces (to create a 83 | # fixe bar (the percent string doesn't move) 84 | self.bar = self.char * num_hashes + ' ' * (all_full-num_hashes) 85 | 86 | percent_str = str(percent_done) + "%" 87 | self.bar = '[ ' + self.bar + ' ] ' + percent_str 88 | 89 | 90 | def __str__(self): 91 | return str(self.bar) 92 | 93 | 94 | def main(): 95 | print 96 | limit = 1000000 97 | 98 | print 'Example 1: Fixed Bar' 99 | prog = ProgressBar(0, limit, 77, mode='fixed') 100 | oldprog = str(prog) 101 | for i in xrange(limit+1): 102 | prog.update_amount(i) 103 | if oldprog != str(prog): 104 | print prog, "\r", 105 | sys.stdout.flush() 106 | oldprog=str(prog) 107 | 108 | print '\n\n' 109 | 110 | print 'Example 2: Dynamic Bar' 111 | prog = ProgressBar(0, limit, 77, mode='dynamic', char='-') 112 | oldprog = str(prog) 113 | for i in xrange(limit+1): 114 | prog.increment_amount() 115 | if oldprog != str(prog): 116 | print prog, "\r", 117 | sys.stdout.flush() 118 | oldprog=str(prog) 119 | 120 | print '\n\n' 121 | 122 | 123 | if __name__ == '__main__': 124 | main() 125 | -------------------------------------------------------------------------------- /src/Agent.py: -------------------------------------------------------------------------------- 1 | """ 2 | RL Framework 3 | Authors: Arun Chaganty 4 | Agent Base Classes; Represented by an MDP 5 | """ 6 | 7 | import numpy as np 8 | import random 9 | 10 | from Environment import * 11 | 12 | class Agent: 13 | Q = [] 14 | def __init__(self, Q): 15 | self.Q = Q 16 | 17 | def act( self, state, reward, episode_ended ): 18 | raise NotImplemented() 19 | 20 | def greedy_policy( self ): 21 | S = len( self.Q ) 22 | 23 | pi = {} 24 | for s in xrange( S ): 25 | # Choose greedy action 26 | actions = self.Q[ s ] 27 | max_value = max( ( pr for (a,pr) in actions ) ) 28 | a = random.choice( tuple( a for (a,pr) in actions if pr == max_value ) ) 29 | pi[s] = ((a,1.0),) 30 | return pi 31 | 32 | class ValueAgent( Agent ): 33 | old_state = None 34 | old_action = None 35 | e = 0.01 36 | alpha = 0.1 37 | gamma = 0.9 38 | rate = 0.99 39 | 40 | def __init__(self, Q, e = 0.01, alpha = 0.1, gamma = 0.9, rate = 0.99 ): 41 | Agent.__init__( self, Q ) 42 | 43 | self.Q = [] 44 | for A in Q: 45 | self.Q.append( [ (a,0) for a in A ] ) 46 | 47 | self.e = e 48 | self.alpha = alpha 49 | self.gamma = gamma 50 | self.rate = rate 51 | self.old_state = None 52 | self.old_action = None 53 | 54 | def act( self, state, reward, episode_ended ): 55 | 56 | # Explore 57 | actions = self.Q[ state ] 58 | if random.random() < self.e: 59 | action = random.choice( tuple( a for (a,pr) in actions ) ) 60 | # Exploit 61 | else: 62 | max_value = max( ( pr for (a,pr) in actions ) ) 63 | action = random.choice( tuple( a for (a,pr) in actions if pr == max_value ) ) 64 | 65 | # Update actions 66 | if episode_ended: 67 | self.update_Q( self.old_state, self.old_action, None, None, reward ) 68 | self.e = self.e * (1 - self.rate) 69 | else: 70 | self.update_Q( self.old_state, self.old_action, state, action, reward ) 71 | 72 | self.old_state = state 73 | self.old_action = action 74 | 75 | return action 76 | 77 | def update_Q( self, state, action, state_, action_, reward ): 78 | raise NotImplemented() 79 | 80 | 81 | def get_idx( self, state, action ): 82 | actions = self.Q[state] 83 | for i in xrange( len( actions ) ): 84 | if action == actions[ i ][ 0 ]: 85 | return i 86 | raise ValueError() 87 | 88 | def get_value( self, state, action ): 89 | 
try: 90 | a, q = self.Q[ state ][ self.get_idx( state, action ) ] 91 | except ValueError: 92 | q = 0 93 | return q 94 | 95 | def set_value( self, state, action, value ): 96 | self.Q[ state ][ self.get_idx( state, action ) ] = (action, value) 97 | 98 | class OptionValueAgent( ValueAgent ): 99 | 100 | def act( self, state, reward, episode_ended ): 101 | 102 | # Revisit the history and update using it as well 103 | if isinstance( self.old_action, Option ): 104 | state_ = state[-1][0] 105 | else: 106 | state_ = state 107 | 108 | # Explore 109 | actions = self.Q[ state_ ] 110 | if random.random() < self.e: 111 | action = random.choice( tuple( a for (a,pr) in actions ) ) 112 | # Exploit 113 | else: 114 | max_value = max( ( pr for (a,pr) in actions ) ) 115 | action = random.choice( tuple( a for (a,pr) in actions if pr == max_value ) ) 116 | 117 | # Update actions 118 | if episode_ended: 119 | # In the case of options, send along the old state list 120 | if isinstance( self.old_action, Option ): 121 | # Replace the last state with a None one (end of epsiode) 122 | state = state[:-1] + [(None, None),] 123 | self.update_Q( self.old_state, self.old_action, state, None, reward ) 124 | else: 125 | self.update_Q(self.old_state, self.old_action, None, None, reward) 126 | self.e = self.e * (1 - self.rate) 127 | else: 128 | self.update_Q( self.old_state, self.old_action, state, action, reward ) 129 | 130 | self.old_state = state_ 131 | self.old_action = action 132 | 133 | return action 134 | 135 | def update_Q( self, state, action, state_, action_, reward ): 136 | raise NotImplemented() 137 | 138 | -------------------------------------------------------------------------------- /doc/project-report/experiments.tex: -------------------------------------------------------------------------------- 1 | \section{Empirical Performance} 2 | \label{sec:experiments} 3 | % Experimental results 4 | 5 | % Things to be tested 6 | We evaluated the options defined using our method on the Taxi domain described 7 | in \secref{sec:approach}. We compared the performance of Macro Q learning and 8 | Intra-option Q-learning agents using the following option schemes, 9 | \begin{itemize} 10 | \item \textbf{Small World} Options were generated randomly connecting two nodes of 11 | the domain using an inverse square law, as described in 12 | \secref{sec:approach}, with $r = 2$. 13 | \item \textbf{Betweenness} Options were generated to take any node to a local maxima 14 | of the betweenness function. 15 | \item \textbf{Random} Options were generated by randomly connecting two nodes in the 16 | domain. 17 | \item \textbf{Manual} Options were manually defined to take the taxi to one of the 18 | four pads by the shortest path. 19 | \item \textbf{None} No options were used. 20 | \end{itemize} 21 | 22 | The Intra-option Q-learning algorithm requires that the option policies 23 | themselves be Markov, require that every state that the option can visit be part 24 | of $\initset$. For the experiments using the Intra-option learning algorithm, we 25 | modified the ``Small World'' options slightly such that all nodes along the path 26 | were in the initiation set of the option as well. 27 | 28 | % Experimental Parameters 29 | To compare the different approaches, we measuring the ratio of how long the taxi 30 | takes to complete the task to the shortest time possible to complete the task, 31 | and have termed this measure "optimality". Each experiment was averaged over 32 | $200$ runs, and run for $1500$ episodes. 
We set $\alpha$ to $0.8$ for all the 33 | experiments. We ran the experiments for two values of $\gamma$, $0.90$ and 34 | $0.99$, with comparable performance. We present the results only for $\gamma$ set to 35 | $0.99$. 36 | 37 | We plotted the optimality measure against the number of episodes, 38 | and have also zoomed into the later episodes to take a closer look at the 39 | converged behaviour. 40 | 41 | % Plots 42 | \begin{figure}[ht] 43 | \centering 44 | \subfigure[]{ 45 | \includegraphics[width=4in]{figures/MacroQ-0_99-taxi1} 46 | } 47 | \subfigure[]{ 48 | \includegraphics[width=4in]{figures/MacroQ-0_99e-taxi1} 49 | } 50 | \caption{Macro Q-learning using 20 options} 51 | \label{fig:MacroQ-0.99} 52 | \end{figure} 53 | 54 | \begin{figure}[ht] 55 | \centering 56 | \subfigure[]{ 57 | \includegraphics[width=4in]{figures/IntraQm-0_99-taxi1} 58 | } 59 | \subfigure[]{ 60 | \includegraphics[width=4in]{figures/IntraQm-0_99e-taxi1} 61 | } 62 | \caption{Intra-option Q-learning using 20 options} 63 | \label{fig:IntraQ-0.99} 64 | \end{figure} 65 | 66 | \begin{figure}[ht] 67 | \centering 68 | \subfigure[]{ 69 | \includegraphics[width=4in]{figures/IntraQm-0_99-50-taxi1} 70 | } 71 | \subfigure[]{ 72 | \includegraphics[width=4in]{figures/IntraQm-0_99e-50-taxi1} 73 | } 74 | \caption{Intra-option Q-learning using 50 options} 75 | \label{fig:IntraQ-0.99-50} 76 | \end{figure} 77 | 78 | % Brief on results 79 | We note that the agents using Small World options converge quickly to the 80 | optimal value (i.e. 1), and have small variance. As expected, the performance 81 | using Intra-option Q-learning (\autoref{fig:IntraQ-0.99}) is significantly 82 | better than that using Macro Q-learning (\autoref{fig:MacroQ-0.99}). The performance of 83 | small world options does not differ significantly between using 20 options 84 | (\autoref{fig:IntraQ-0.99}) and 50 options (\autoref{fig:IntraQ-0.99-50}). 85 | 86 | We find it surprising that betweenness performs worse than the remaining 87 | schemes. Our options differ from the random options defined in \cite{Simsek} as 88 | we select random paths instead of random nodes to which all other nodes are 89 | connected. As a result, for many nodes there are few, if any, options. As adding 90 | options also increases the number of actions that an agent can choose from, this 91 | might have led to the better performance of Random and Small World, which are 92 | both path-based options.
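To make the ``Small World'' scheme described at the start of this section concrete, the sketch below shows one way to draw a subgoal for a state from the inverse power law over lattice distance with $r = 2$. The generator actually used in the experiments lives in the \texttt{OptionGenerator} module of the accompanying code (\texttt{optimal\_options\_from\_small\_world}); the Manhattan-distance metric and the function name here are illustrative assumptions.

\begin{verbatim}
import random

def sample_small_world_subgoal(s, states, r=2.0, rng=random):
    """Pick a subgoal for state s with probability proportional to
    (lattice distance)**(-r); r = 2 gives the inverse square law used
    for the Small World options."""
    # Manhattan distance on the grid (an illustrative choice of metric).
    def dist(u, v):
        return abs(u[0] - v[0]) + abs(u[1] - v[1])

    candidates = [t for t in states if t != s]
    weights = [dist(s, t) ** (-r) for t in candidates]

    # Inverse-transform sampling over the normalised weights.
    u, acc = rng.random() * sum(weights), 0.0
    for t, w in zip(candidates, weights):
        acc += w
        if u <= acc:
            return t
    return candidates[-1]
\end{verbatim}

An option would then initiate at $s$, follow a shortest-path policy to the sampled subgoal, and terminate there; on the $5 \times 5$ Taxi grid, for example, \texttt{states} would simply be the list of all $25$ cell coordinates.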
93 | 94 | % \begin{table}[ht] 95 | % \centering 96 | % \begin{tabular}{ r | r } 97 | % & \\ \hline 98 | % & \\ 99 | % \end{tabular} 100 | % \caption{ } 101 | % \label{tbl:rtt-summary} 102 | % \end{table} 103 | 104 | % \begin{figure}[s] 105 | % \centering 106 | % \includegraphics[width=5in]{filename} 107 | % \caption{ } 108 | % \label{fig:high-variance-rtt} 109 | % \end{figure} 110 | 111 | -------------------------------------------------------------------------------- /doc/ewrl-abstract/ewrl.bib: -------------------------------------------------------------------------------- 1 | @inproceedings{Simsek, 2 | author = {\c{S}im\c{s}ek, \"{O}zg\"{u}r and Barto, Andrew G}, 3 | booktitle = {NIPS}, 4 | file = {:home/teju/mendeley-papers/Şimşek, Barto/NIPS/Şimşek, Barto - 2008 - Skill characterization based on betweenness.pdf:pdf}, 5 | pages = {1--8}, 6 | title = {{Skill characterization based on betweenness}}, 7 | year = {2008} 8 | } 9 | 10 | @inproceedings{Martel2004, 11 | author = {Martel, Chip and Nguyen, Van}, 12 | booktitle = {PODC}, 13 | file = {:home/teju/mendeley-papers/Martel, Nguyen/PODC/Martel, Nguyen - 2004 - Analyzing Kleinberg ’s (and other) Small-world Models.pdf:pdf}, 14 | isbn = {1581138024}, 15 | keywords = {diameter,random graphs,routing,small-world network}, 16 | title = {{Analyzing Kleinberg's (and other) Small-world Models}}, 17 | volume = {2}, 18 | year = {2004} 19 | } 20 | 21 | @article{Kleinberg, 22 | author = {Kleinberg, Jon}, 23 | file = {:home/teju/mendeley-papers/Kleinberg/ACM Theory of Computing/Kleinberg - 2000 - The Small-World Phenomenon An Algorithmic Perspective.pdf:pdf}, 24 | journal = {ACM Theory of Computing}, 25 | pages = {163--170}, 26 | title = {{The Small-World Phenomenon : An Algorithmic Perspective}}, 27 | volume = {32}, 28 | year = {2000} 29 | } 30 | 31 | @article{Stolle, 32 | abstract = {Temporally extended actions (e.g., macro actions) have proven very useful in speeding up learning, ensuring robustness and building prior knowledge into AI systems. The options framework (Precup, 2000; Sutton, Precup \& Singh, 1999) provides a natural way of incorporating such actions into reinforcement learning systems, but leaves open the issue of how good options might be identi- fied. In this paper, we empirically explore a simple approach to creating options. The underlying assumption is that the agent will be asked to perform different goal-achievement tasks in an environment that is otherwise the same over time. Our approach is based on the intuition that “bottleneck” states, i.e. states that are frequently visited on system trajectories, could prove to be useful subgoals (e.g. McGovern \& Barto, 2001; Iba, 1989). We present empirical studies of this approach in two gridworld navigation tasks. One of the environments we explored contains bottleneck states, and the algo- rithm indeed finds these states, as expected. The second environment is an empty gridworld with no obstacles. 
Although the environment does not contain bottle- neck states, our approach still finds useful options, which essentially allow the agent to travel around the environment more quickly}, 33 | author = {Stolle, Martin and Precup, Doina}, 34 | file = {:home/teju/mendeley-papers/Stolle, Precup/Artificial Intelligence/Stolle, Precup - Unknown - Learning Options in Reinforcement Learning.pdf:pdf}, 35 | journal = {Artificial Intelligence}, 36 | title = {{Learning Options in Reinforcement Learning}} 37 | } 38 | 39 | @article{SuttonPrecupSingh1998, 40 | author = {Sutton, Richard S and Precup, Doina and Singh, Satinder}, 41 | file = {:home/teju/mendeley-papers/Sutton, Precup, Singh/Artificial Intelligence/Sutton, Precup, Singh - 1998 - Between MDPs and Semi-MDPs Learning , Planning , and Representing Knowledge at Multiple Temporal Scales at Multiple Temporal Scales.pdf:pdf}, 42 | journal = {Artificial Intelligence}, 43 | title = {{Between MDPs and Semi-MDPs : Learning , Planning , and Representing Knowledge at Multiple Temporal Scales at Multiple Temporal Scales}}, 44 | year = {1998} 45 | } 46 | 47 | @article{Simsek2005, 48 | address = {New York, New York, USA}, 49 | author = {\c{S}im\c{s}ek, \"{O}zg\"{u}r and Wolfe, Alicia P. and Barto, Andrew G.}, 50 | doi = {10.1145/1102351.1102454}, 51 | file = {:home/teju/mendeley-papers/Şimşek, Wolfe, Barto/Proceedings of the 22nd international conference on Machine learning - ICML '05/Şimşek, Wolfe, Barto - 2005 - Identifying useful subgoals in reinforcement learning by local graph partitioning.pdf:pdf}, 52 | isbn = {1595931805}, 53 | journal = {Proceedings of the 22nd international conference on Machine learning - ICML '05}, 54 | pages = {816--823}, 55 | publisher = {ACM Press}, 56 | title = {{Identifying useful subgoals in reinforcement learning by local graph partitioning}}, 57 | url = {http://portal.acm.org/citation.cfm?doid=1102351.1102454}, 58 | year = {2005} 59 | } 60 | 61 | @inproceedings{Menache, 62 | author = {Menache, Ishai and Mannor, Shie and Shimkin, Nahum}, 63 | booktitle = {ECML}, 64 | file = {:home/teju/mendeley-papers/Menache, Mannor, Shimkin/ECML/Menache, Mannor, Shimkin - 2002 - Q-Cut - Dynamic Discovery of Sub-Goals in Reinforcement Learning.pdf:pdf}, 65 | title = {{Q-Cut - Dynamic Discovery of Sub-Goals in Reinforcement Learning}}, 66 | year = {2002} 67 | } 68 | 69 | @inproceedings{McGovern2001, 70 | author = {McGovern, Amy and Barto, Andrew G}, 71 | booktitle = {ICML}, 72 | file = {:home/teju/mendeley-papers/McGovern, Barto/ICML/McGovern, Barto - 2001 - Automatic Discovery of Subgoals in Reinforcement Learning using Diverse Density.pdf:pdf}, 73 | pages = {1--8}, 74 | title = {{Automatic Discovery of Subgoals in Reinforcement Learning using Diverse Density}}, 75 | year = {2001} 76 | } 77 | -------------------------------------------------------------------------------- /doc/ewrl-poster/column1.tex: -------------------------------------------------------------------------------- 1 | \begin{block}{Introduction} 2 | \begin{itemize} 3 | \item How do we perform such a diverse set of complex tasks? 4 | \begin{itemize} 5 | \item Given an MDP with options, $\mdp 6 | \tuple{\states,\options,\transitions,\cdot}$, can we 7 | quickly learn any task (i.e. different $\rewards$)? 8 | \end{itemize} 9 | \item Most literature focuses on finding options to reach `bottlenecks', 10 | which are common subgoals across tasks. The objective of these 11 | options is to aid in early exploration. 12 | \begin{itemize} 13 | \item A. McGovern and A. G. 
Barto, ``Automatic Discovery of 14 | Subgoals in Reinforcement Learning using Diverse Density,'' 15 | in ICML, 2001 16 | \item I. Menache, S. Mannor, and N. Shimkin, ``Q-Cut - Dynamic 17 | Discovery of Sub-Goals in Reinforcement Learning,'' in ECML, 18 | 2002. 19 | \item \"{O}. \c{S}im\c{s}ek and A. G. Barto, ``Skill 20 | characterization based on betweenness,'' in NIPS, 2008 21 | \end{itemize} 22 | \item Our1 Hypothesis: The key is in finding a set of composable subtasks 23 | spanning the space of tasks. 24 | \end{itemize} 25 | \end{block} 26 | 27 | \vfill 28 | \begin{block}{Motivation: The Small World Phenomenon} 29 | \begin{itemize} 30 | \item Kleinberg: ``Individuals using local information are collectively 31 | very effective at actually constructing short paths between two 32 | points in a social network.'' \\ 33 | {\small J. Kleinberg, ``The Small-World 34 | Phenomenon: An Algorithmic Perspective'' in ACM Theory of Computing, 35 | 2000} 36 | \item Kleinberg constructed a family of networks for which the expected 37 | time to deliver a message from any source to any destination was 38 | $( \log |\mbox{size of network}| )^2$, using the inverse power law distribution. 39 | \begin{itemize} 40 | \item Structural properties of the network are important 41 | \end{itemize} 42 | \item Can we do the same for learning? 43 | \begin{itemize} 44 | \item A {\bf small-world RL domain} has the property that an 45 | agent using local information (e.g. the value function) can 46 | effectively reach a state of {\em maximal value}. 47 | \end{itemize} 48 | \end{itemize} 49 | \end{block} 50 | 51 | \vfill 52 | \begin{block}{Generating Options according to $P_r$} 53 | \begin{columns} 54 | \begin{column}{.39\textwidth} 55 | \begin{figure}[h] 56 | \centering 57 | \includegraphics[height=3in]{figures/rooms-options} 58 | \label{fig:rooms-options} 59 | \caption{Some $P_2$ Options} 60 | \end{figure} 61 | \end{column} 62 | \begin{column}{.49\textwidth} 63 | \begin{itemize} 64 | \item Consider the state-interaction graph of $\mdp$. 65 | \item For each state $s \in \states$, select a single $s'$ 66 | reachable from $s$ according to the inverse power law 67 | distribution $P_r : p(s,s') \propto \|s - s'\|^{-r}$. 68 | \item For each $(s,s')$ pair, construct an option $o:\option$ 69 | with $\initset = \{s\}$, $\stopcond = \{s'\}$, and $\pi = 70 | \mbox{optimal policy to reach $s'$}$. 71 | \end{itemize} 72 | \end{column} 73 | \end{columns} 74 | 75 | {\em Note: This construction adds just one additional action for each state, 76 | and thus does not blow up the agent's search space.} 77 | 78 | \end{block} 79 | 80 | \vfill 81 | \begin{block}{Theorem: $O( (\log n)^2 )$ Decisions} 82 | Assume $\mdp$ to have states arranged in a $k$-dimensional lattice, with 83 | noisy (with parameter $\epsilon$) primitive navigation actions $\actions$, 84 | and rewards distributed between $[0,1]$. 85 | 86 | Using only the value of neighboring states, an agent with options $\options$ 87 | generated by $P_k$, can reach a state of maximal value in $O( ( \log 88 | |\states| )^2)$ decisions. 89 | 90 | \begin{itemize} 91 | \item We relate the value of two states $u$ and $v$, and their lattice distance, 92 | $$ \log \frac{ V(v) }{ V(u) } \approx \log( \sqrt{ \frac{1 - \epsilon}{ \epsilon } } ) \|u - v\| + c,$$ 93 | where $c \in [0,\frac{1}{1-\gamma}]$. 
94 | \item Following Kleinberg's analysis, we show that using the optimal
95 | value function, the agent makes $O(\log |\states|)$ decisions to get
96 | exponentially closer to the maximal value state.
97 | \end{itemize}
98 | 
99 | \end{block}
100 | 
-------------------------------------------------------------------------------- /src/Environments/ArbitraryNavigation.py: --------------------------------------------------------------------------------
1 | """
2 | RL Framework
3 | Authors: Arun Chaganty, Prateek Gaur
4 | Arbitrary Navigation Environment
5 | """
6 | 
7 | import numpy as np
8 | from Environment import *
9 | import functools
10 | import pdb
11 | 
12 | class ArbitraryNavigation():
13 |     """
14 |     Arbitrary Navigation Environment
15 |     Expects size of area to be given
16 |     """
17 | 
18 |     MOVE_UP = 0
19 |     MOVE_DOWN = 1
20 |     MOVE_LEFT = 2
21 |     MOVE_RIGHT = 3
22 | 
23 |     ACCURACY = 0.80
24 | 
25 |     REWARD_BIAS = -1
26 |     REWARD_FAILURE = -20 - REWARD_BIAS
27 |     REWARD_SUCCESS = 50 - REWARD_BIAS
28 |     REWARD_CHECKPOINT = 0 # - REWARD_BIAS
29 | 
30 |     @staticmethod
31 |     def state_idx( size, y, x ):
32 |         """Compute the index of the state"""
33 | 
34 |         st, offset = x, size[1]
35 |         st, offset = st + offset * y, offset * size[0]
36 | 
37 |         return st
38 | 
39 |     @staticmethod
40 |     def idx_state( size, st ):
41 |         """Compute the (y, x) state for the index (inverse of state_idx)"""
42 |         x, st = st % size[1], st / size[1]
43 |         y, st = st % size[0], st / size[0]
44 | 
45 |         return y, x
46 | 
47 |     @staticmethod
48 |     def get_random_goal( size ):
49 |         loc = np.random.randint( 0, size[0] ), np.random.randint( 0, size[1] )
50 |         return loc
51 | 
52 |     @staticmethod
53 |     def make_mdp( size ):
54 |         state_idx = functools.partial( ArbitraryNavigation.state_idx, size )
55 | 
56 |         goal = ArbitraryNavigation.get_random_goal( size )
57 | 
58 |         S = size[ 0 ] * size[ 1 ]
59 |         A = 4 # up down left right
60 |         P = [ [ [] for i in xrange( S ) ] for j in xrange( A ) ]
61 |         R = {}
62 |         R_bias = ArbitraryNavigation.REWARD_BIAS
63 | 
64 |         # Populate the P table
65 |         ACCURACY = ArbitraryNavigation.ACCURACY
66 |         RESIDUE = (1.0 - ACCURACY)/3
67 |         for y in xrange( size[ 0 ] ):
68 |             for x in xrange( size[ 1 ] ):
69 |                 s = state_idx( y, x )
70 | 
71 |                 if y > 0:
72 |                     up_state = y-1, x
73 |                 else:
74 |                     up_state = y, x
75 |                 if y + 1 < size[ 0 ]:
76 |                     down_state = y+1, x
77 |                 else:
78 |                     down_state = y, x
79 |                 if x > 0:
80 |                     left_state = y, x-1
81 |                 else:
82 |                     left_state = y, x
83 |                 if x + 1 < size[ 1 ]:
84 |                     right_state = y, x+1
85 |                 else:
86 |                     right_state = y, x
87 | 
88 |                 P[ ArbitraryNavigation.MOVE_UP ][ s ] = [
89 |                     ( state_idx( *up_state ), ACCURACY ),
90 |                     ( state_idx( *down_state ), RESIDUE ),
91 |                     ( state_idx( *left_state ), RESIDUE ),
92 |                     ( state_idx( *right_state ), RESIDUE ), ]
93 |                 P[ ArbitraryNavigation.MOVE_DOWN ][ s ] = [
94 |                     ( state_idx( *up_state ), RESIDUE ),
95 |                     ( state_idx( *down_state ), ACCURACY ),
96 |                     ( state_idx( *left_state ), RESIDUE ),
97 |                     ( state_idx( *right_state ), RESIDUE ), ]
98 |                 P[ ArbitraryNavigation.MOVE_LEFT ][ s ] = [
99 |                     ( state_idx( *up_state ), RESIDUE ),
100 |                     ( state_idx( *down_state ), RESIDUE ),
101 |                     ( state_idx( *left_state ), ACCURACY ),
102 |                     ( state_idx( *right_state ), RESIDUE ), ]
103 |                 P[ ArbitraryNavigation.MOVE_RIGHT ][ s ] = [
104 |                     ( state_idx( *up_state ), RESIDUE ),
105 |                     ( state_idx( *down_state ), RESIDUE ),
106 |                     ( state_idx( *left_state ), RESIDUE ),
107 |                     ( state_idx( *right_state ), ACCURACY ), ]
108 | 
109 |         # Add rewards to all states that transit into the goal state
110 |         s = state_idx( *goal )
111 | 
        for s_ in xrange( S ):
112 |             R[ (s_,s) ] = ArbitraryNavigation.REWARD_SUCCESS - ArbitraryNavigation.REWARD_BIAS
113 | 
114 |         start_set = None
115 |         end_set = [ s ]
116 | 
117 |         return S, A, P, R, R_bias, start_set, end_set
118 | 
119 |     @staticmethod
120 |     def create( height, width ):
121 |         """Create a place from @spec"""
122 |         return Environment( ArbitraryNavigation, *ArbitraryNavigation.make_mdp( (height, width) ) )
123 | 
124 |     @staticmethod
125 |     def reset_rewards( env, height, width ):
126 |         size = (height, width)
127 |         state_idx = functools.partial( ArbitraryNavigation.state_idx, size )
128 |         goal = ArbitraryNavigation.get_random_goal( size )
129 | 
130 |         # Reset the rewards
131 |         R = {}
132 |         # Add rewards to all states that transit into the goal state
133 |         s = state_idx( *goal )
134 |         for s_ in xrange( env.S ):
135 |             R[ (s_,s) ] = ArbitraryNavigation.REWARD_SUCCESS - ArbitraryNavigation.REWARD_BIAS
136 | 
137 |         start_set = None
138 |         end_set = [ s ]
139 | 
140 |         return Environment( ArbitraryNavigation, env.S, env.A, env.P, R, env.R_bias, start_set, end_set )
141 | 
142 | 
-------------------------------------------------------------------------------- /doc/aamas/background.tex: --------------------------------------------------------------------------------
1 | \section{Background}
2 | \label{sec:background}
3 | 
4 | % MDPs
5 | In reinforcement learning, the standard representation of an environment
6 | and task instance is a Markov decision process (MDP). An MDP can be
7 | represented as the tuple, \\ $\tuple{ \states, \actions, \transitions,
8 | \rewards, \gamma }$, where $\states$ and $\actions$ are finite sets of
9 | states and actions, $\transitions: \states \times \actions \times
10 | \states \to [0,1]$ describes the dynamics of the world through
11 | state-action transition probabilities, $\rewards: \states \times \actions
12 | \to \Re$ describes the task at hand by ascribing rewards to state-action
13 | pairs, and $\gamma \in [0,1]$ is a discount factor that weighs the
14 | value of future rewards.
15 | 
16 | In this setting, an agent in a state $s \in \states$ chooses an action
17 | $a \in \actions$, and moves to a state $s'$ with probability
18 | $\transitions(s,a,s')$, receiving a reward $\rewards(s,a)$. The
19 | objective of the agent is to find a policy $\pi: \states \times \actions
20 | \to [0,1]$, i.e. a decision procedure for selecting actions, that
21 | maximises the reward it accumulates in the long run, $R = \sum_{i}
22 | \gamma^i r_i$. $R$ is also called the return.
23 | 
24 | We define the value function $V: \states \to \Re$ to be the expected
25 | return from $s$, and $Q: \states \times \actions \to \Re$ to be the
26 | expected return from $s$, after taking the action $a$. The optimal value
27 | function must satisfy the Bellman optimality equation,
28 | \begin{eqnarray*}
29 | V(s) &=& \max_{a} \rewards(s,a) + \gamma \sum_{s' \in \states} \transitions(s,a,s') V(s') \\
30 | Q(s,a) &=& \rewards(s,a) + \gamma \sum_{s' \in \states} \transitions(s,a,s') \max_{a'} Q(s',a').
31 | \end{eqnarray*}
32 | 
33 | Given an optimal $Q$, an agent can construct an optimal policy,
34 | $\pi(s,a^*) = 1$ when $a^* = \argmax_{a} Q(s,a)$, and $0$ otherwise. In
35 | principle, if the agent knew the MDP, it could construct the optimal
36 | value function, and from it an optimal policy. However, in the usual
37 | setting, the agent is only aware of the state-action space, $\states$
38 | and $\actions$, and must learn $Q$ through exploration.
The Q-learning
39 | algorithm learns $Q$ with a simple update for every step the agent
40 | takes,
41 | \begin{eqnarray*}
42 | Q(s,a) &=& Q(s,a) + \alpha [ r + \gamma \max_{a'} Q(s',a') - Q(s,a) ],
43 | \end{eqnarray*}
44 | \noindent
45 | where $\alpha \in [0,1]$ is a parameter that controls the learning rate.
46 | It has been shown that the Q-learning algorithm converges to the optimal
47 | value function in the limit under fairly permissive assumptions.
48 | 
49 | % Options
50 | The options framework provides a temporal abstraction for subtasks. An
51 | option $\option$ is described by an initiation set $\initset \subset
52 | \states$, a policy $\pi$, and a terminating condition $\beta$. An agent
53 | can exercise an option in any state $s \in \initset$, following which,
54 | it will follow the policy $\pi$ described by the option, until the
55 | terminating condition $\beta(s)$ is satisfied. The terminating condition
56 | $\beta$ can be stochastic.
57 | 
58 | Several learning algorithms have been proposed for agents using options
59 | \cite{SuttonPrecupSingh1999,BartoMahadevan2003}. One simple such method that
60 | we will use is MacroQ, a generalisation of the Q-learning algorithm
61 | described above. The MacroQ algorithm updates the value function only
62 | after completion of the option. If the option $o$ was initiated in the
63 | state $s$, and continues for $k$ steps before terminating in $s'$, the
64 | corresponding $Q$ function update will be,
65 | \begin{eqnarray*}
66 | Q(s,o) &=& Q(s,o) + \alpha [ r + \gamma^{k} \max_{o' \in \actions \cup \options} Q(s',o') - Q(s,o) ].
67 | \end{eqnarray*}
68 | 
69 | Different tasks in the same domain can be described by different
70 | $\rewards$. Let $\rewards$ be sampled from the family $\Rewards$. Our
71 | objective then is to find a set of options $O$ that reduces the expected
72 | learning time over $\Rewards$.
73 | 
74 | \begin{example}
75 | \label{example:taxi}
76 | 
77 | \begin{figure}[th]
78 | \centering
79 | \input{taxi}
80 | \caption{The Taxi Domain}
81 | \label{fig:taxi-domain}
82 | \end{figure}
83 | To make the discussion more tangible, let us look at an example, the
84 | Taxi domain, shown in \figref{fig:taxi-domain}. The agent is a taxi
85 | navigating in this road-map. It must pick up a passenger at one of the
86 | 4 pads, R, G, B or Y. Subsequently, it must carry the passenger to a
87 | destination, which is also one of the above four pads. The states of
88 | the taxi would then be a tuple containing the location of the
89 | passenger (in one of the four pads, or within the taxi), the
90 | destination of the passenger, and the location of the taxi in the map.
91 | The actions the taxi can perform are moving up, down, left or right in
92 | the map, as well as picking up or dropping a passenger at a pad.
93 | Typical options for such a domain would be an option that can be
94 | started anywhere, and has a policy that takes the taxi to one of
95 | the pads in the shortest possible manner. Such an option is generic,
96 | and does not depend on where the passenger or destination are. The RL
97 | agent must then learn to choose the right option when picking up the
98 | passenger.
99 | \end{example}
100 | 
101 | 
-------------------------------------------------------------------------------- /doc/aamas/experiments.tex: --------------------------------------------------------------------------------
1 | \section{Experimental Results}
2 | \label{sec:experiments}
3 | % Experimental results
4 | 
5 | We trained MacroQ learning agents on several standard domains, and
6 | measured the cumulative return obtained using the following option
7 | generation schemes:
8 | \begin{itemize}
9 | \item \textbf{None}: No options were used.
10 | \item \textbf{Random}: Options were generated by randomly connecting
11 | two nodes in the domain (this is equivalent to $P_0$).
12 | \item \textbf{Betweenness}: As a representative of bottleneck-based
13 | schemes, options were generated to take any node to a local maximum
14 | of betweenness centrality, as described in \cite{Simsek2008}.
15 | \item \textbf{Small World}: Options were generated by randomly
16 | connecting two nodes of the domain using an inverse square law, as
17 | described in \secref{sec:theory}.
18 | \end{itemize}
19 | 
20 | Each experiment, unless mentioned otherwise, was run for $10$ randomly
21 | generated tasks in the domain; each task ran for $40,000$ epochs, and
22 | was averaged over an ensemble of $20$ agents.
23 | 
24 | \subsection{Optimal Options}
25 | The agents were run on the following three domains using the algorithm
26 | sketched in \secref{sec:theory}:
27 | \begin{itemize}
28 | \item \textbf{Arbitrary Navigation}: The agent must reach an
29 | arbitrary goal state in an obstacle-free $x \times y$ grid-world.
30 | \item \textbf{Rooms}: The agent must navigate a floor plan with
31 | 4 rooms to reach an arbitrary goal state.
32 | \item \textbf{Taxi}: This is the domain described in
33 | \exref{example:taxi}.
34 | \end{itemize}
35 | 
36 | The options generated according to the schemes described above were
37 | given optimal policies.
38 | 
39 | \begin{table}
40 | \centering
41 | \begin{tabular}{ r | r r r }
42 | & Arb. Nav. & Rooms & Taxi \\ \hline
43 | None & -31.82 & -1.27 & -16.90 \\
44 | Random & -31.23 & -10.76 & -18.83 \\
45 | Betw. & -18.28 & -8.94 & {\bf 80.48} \\
46 | Sm-W & {\bf -14.24 [$r=4$]} & {\bf 8.54 [$r=2$]} & 0.66 [$r=0.75$] \\
47 | \end{tabular}
48 | \caption{Cumulative Return}
49 | \label{tbl:optimal-returns}
50 | \end{table}
51 | 
52 | The results of these experiments are summarised in
53 | \autoref{tbl:optimal-returns}. Small world options perform significantly
54 | better than the other schemes in navigation-oriented tasks like Rooms or
55 | Arbitrary Navigation. In the Taxi domain, options generated by the
56 | betweenness scheme outperform the small world options. This is expected
57 | because the goal states in this domain lie at betweenness maxima.
58 | 
59 | \begin{figure}[th]
60 | \centering
61 | \includegraphics[width=2.4in]{figures/rooms-options}
62 | \caption{Rooms: Options learnt}
63 | \label{fig:rooms-options}
64 | \end{figure}
65 | 
66 | \begin{figure}[th]
67 | \centering
68 | \includegraphics[width=3in]{figures/rooms-return-200}
69 | \caption{Rooms: Cumulative Return with 200 options}
70 | \label{fig:rooms-return}
71 | \end{figure}
72 | 
73 | Some of the small world options preferred in the Rooms domain are shown in
74 | \figref{fig:rooms-options}. The graph shows several examples of options
75 | that compose together to arrive near the goal state. We have also
76 | plotted the learning behaviour in \figref{fig:rooms-return}.
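As a concrete illustration of the betweenness baseline above, the sketch below identifies subgoal states as local maxima of betweenness centrality and builds one shortest-path option per subgoal. It is only an editorial sketch, not the paper's implementation: it assumes an undirected (symmetrised) networkx graph of the state space, it represents an option as a plain dict, and the exact criterion used by Şimşek and Barto may differ in detail.

import networkx as nx

def betweenness_subgoals(graph):
    """States whose betweenness centrality is at least that of every neighbour."""
    bc = nx.betweenness_centrality(graph)
    return [v for v in graph.nodes()
            if all(bc[v] >= bc[u] for u in graph.neighbors(v))]

def options_to_subgoals(graph, subgoals):
    """For each subgoal g, an option usable from any state that walks a shortest path to g."""
    options = []
    for g in subgoals:
        # paths[s] is a shortest path from g to s; reversed, it walks s back to g.
        paths = nx.single_source_shortest_path(graph, g)
        policy = {}
        for s, path in paths.items():
            back = list(reversed(path))
            for u, v in zip(back[:-1], back[1:]):
                # Successor state toward g, standing in for a primitive action.
                policy.setdefault(u, v)
        options.append({"init": set(policy), "policy": policy, "terminate": {g}})
    return options

Note the contrast with the path-based sketch given earlier: here every state gains an option toward every subgoal, which is exactly the growth in the decision space that the paper argues against.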
77 | 78 | \begin{figure}[th] 79 | \centering 80 | \includegraphics[width=3in]{figures/rooms-exp} 81 | \label{fig:rooms-exp} 82 | \caption{Rooms: $r$ vs Cumulative Return} 83 | \end{figure} 84 | 85 | \subsection{Sensitivity of $r$} 86 | We do not yet have a clear understanding of how the exponent $r$ should 87 | be chosen. \figref{fig:rooms-exp} plots $r$ versus the cumulative return 88 | on the Rooms domain. The performance of the agent without options after 89 | $20,000$ epochs is also plotted for reference. There is a range of $r$ 90 | ($\approx 0.75$ to $1.5$) with good performance, after which the 91 | performance steadily drops. This behaviour is easily explained; as the 92 | exponent goes up, the small world options generated are very short, and 93 | do not help the agent get nearer to the maximal value state. The optimal 94 | range of $r$ is slightly counter-intuitive because the Rooms domain is 95 | a two dimensional lattice with some edges removed. As a consequence of 96 | the reduced connectivity, and perhaps due to stochastic factors, longer 97 | range options are preferred. 98 | 99 | \begin{figure}[th] 100 | \centering 101 | \includegraphics[width=3in]{figures/rooms-learnt-200} 102 | \label{fig:rooms-learnt} 103 | \caption{Rooms: Options Learnt on a Budget} 104 | \end{figure} 105 | 106 | \subsection{Options Learnt on a Budget} 107 | In \secref{sec:algo}, we describe an algorithm to construct small world 108 | options efficiently when given a limited number of learning epochs. We 109 | compared the performance of these options with betweenness options 110 | learnt similarly, and have plotted our results in 111 | \figref{fig:rooms-learnt}. Despite using many more 112 | options, the small world options thus created significantly outperform 113 | betweenness options learnt with the same budget, and are even comparable 114 | to the optimal betweenness options. 115 | 116 | -------------------------------------------------------------------------------- /doc/project-report/theory.tex: -------------------------------------------------------------------------------- 1 | \section{Approach} 2 | \label{sec:approach} 3 | 4 | % MDPs 5 | Before we describe our approach for generating options, we briefly review Markov 6 | Decision Processes (MDPs) and options framework. An MDP $\tuple{ 7 | \states,\actions,\rewards }$ is a standard representation for a RL problem, 8 | where $\states$ is the set of states in the world, $\actions$ is the set of 9 | actions that take the agent from one state to another with some transition 10 | probability, and $rewards$ is the set of rewards obtained when moving from one 11 | state to another. The objective is to find a decision procedure or policy $\pi$ 12 | that maximises the return, or the rewards accumulated in the long run. 13 | 14 | % Options 15 | An option $\option$ is described by an initiation set $\initset \subset 16 | \states$, a policy $\pi$, and a terminating condition $\beta$. An agent can 17 | exercise an option in any state $s \in \initset$, following which, it will 18 | follow the policy $\pi$ described by the option, until the terminating condition 19 | $\beta(s)$ is satisfied. The terminating condition $\beta$ can be stochastic as 20 | well. 21 | 22 | % Example 23 | 24 | \begin{figure}[h] 25 | \center 26 | \input{taxi} 27 | \caption{The Taxi Domain} 28 | \label{fig:taxi-domain} 29 | \end{figure} 30 | 31 | To make the discussion more tangible, let us look at an example, the Taxi 32 | domain, shown in \autoref{fig:taxi-domain}. 
The agent is a taxi navigating in
33 | this road-map. It must pick up a passenger at one of the 4 pads, A, B, C or D.
34 | Subsequently, it must carry the passenger to a destination, which is also one of
35 | the above four pads. The states of the taxi would then be the location of the
36 | passenger (in one of the four pads, or within the taxi), the destination of the
37 | passenger, and the location of the taxi in the map. The actions the taxi can perform
38 | are moving up, down, left or right in the map, as well as picking up a passenger or
39 | dropping the passenger off at the destination. Typical options for such a domain would be an
40 | option that can be started anywhere, and has a policy that takes the taxi to
41 | one of the pads in the shortest possible manner. Such an option is generic, and
42 | does not depend on where the passenger or destination are. The RL agent must
43 | then learn to choose the right option when picking up the passenger.
44 | 
45 | \begin{figure}[h]
46 | \center
47 | \includegraphics[width=3in]{figures/taxi1}
48 | \caption{State Space Graph for the Taxi Domain}
49 | \label{fig:taxi-graph}
50 | \end{figure}
51 | 
52 | % Graph-based
53 | It is easy to construct a graph $\graph$ out of the state-space described by an
54 | MDP. The states $\states$ become the nodes of the graph, and $\actions$ become
55 | the edges, with the transition probabilities serving as the weights. The edges
56 | are also attributed with the rewards described by $\rewards$. Options can be
57 | viewed as paths along the graph. The Taxi domain just defined translates to
58 | the graph shown in \autoref{fig:taxi-graph}.
59 | 
60 | % Constructing Options
61 | We construct an option `short-circuiting' two states using a policy constructed
62 | from the shortest path on this graph. For every state $x$, we select a state $y$ to
63 | be short-circuited to, using a multinomial distribution with weight inversely
64 | proportional to a power of the distance between them in the state space, i.e. $w(x,y)
65 | \propto d(x,y)^{-r}$.
66 | 
67 | Another example of constructing an option on this graph would be to define a
68 | policy that takes any state to a particular one along the shortest path. This is
69 | the approach adopted by Simsek and Barto in \cite{Simsek}, where local maxima of
70 | the betweenness scores are used to identify bottlenecks, and options are defined to
71 | reach these bottlenecks optimally from any state.
72 | 
73 | % Describe Macro-Q and Intra-Option-Q
74 | Several learning algorithms have been proposed for agents using options
75 | \cite{SuttonPrecupSingh1998,BartoMahadevan}. A simple such method is Macro
76 | Q-learning, a generalisation of the Q-learning method. The MacroQ algorithm
77 | updates the value function only after completion of the option. If the option
78 | $o$ was initiated in the state $s$, and continues for $k$ steps before
79 | terminating in $s'$, the corresponding back-up will be,
80 | 
81 | \begin{IEEEeqnarray*}{rCl}
82 | Q(s,o) &=& Q(s,o) + \alpha [ r + \gamma^{k} \max_{o' \in \options_{s'} \cup \actions_{s'}} Q(s',o') - Q(s,o) ].
83 | \end{IEEEeqnarray*}
84 | 
85 | Another method, Intra-option Q-learning, exploits the experience gathered during
86 | the trajectory, instead of only at the end of it.
In this approach, every step 87 | from $s$ to $s'$ using $a \in \actions$ is used to back up the value function of 88 | every option $o \in \options$ which can be used in $s$, and whose policy has a 89 | non-zero probability of using the action $a$ using the following update, 90 | 91 | \begin{IEEEeqnarray*}{rCl} 92 | Q(s,o) &=& Q(s,o) + \alpha [ r + \gamma Q(s',o) - Q(s,o) ]. 93 | \end{IEEEeqnarray*} 94 | \noindent 95 | The value-function for every action along the trajectory is also updated, using 96 | the usual Q-learning backups. 97 | 98 | % \begin{IEEEeqnarray*}{rCl} 99 | % x &=& y \\ 100 | % &=& z \\ 101 | % \end{IEEEeqnarray*} 102 | 103 | % \begin{figure}[s] 104 | % \centering 105 | % \includegraphics[width=5in]{filename} 106 | % \caption{ } 107 | % \label{fig:high-variance-rtt} 108 | % \end{figure} 109 | -------------------------------------------------------------------------------- /doc/ewrl-poster/column2.tex: -------------------------------------------------------------------------------- 1 | 2 | \begin{block}{Results} 3 | \begin{figure}[h] 4 | \centering 5 | \fbox{\includegraphics[height=4in]{figures/rooms-algos-200}} 6 | \label{fig:rooms-performance} 7 | \caption{Rooms: Cumulative Return (with 200 options)} 8 | \end{figure} 9 | 10 | \vskip-4ex 11 | % Table 12 | \begin{columns} 13 | \begin{column}{.45\textwidth} 14 | \begin{table} 15 | \centering 16 | \small 17 | \begin{tabular}{ l r r } %{@{} p{.25\linewidth} p{.2\linewidth} p{.2\linewidth} p{0.2\linewidth} } 18 | \toprule 19 | Scheme & \multicolumn{2}{c @{}}{Options (40,000 epochs)} \\ 20 | \cmidrule(l){2-3} 21 | & {200} & {400} \\ 22 | \toprule 23 | None & -31.82 & -31.82 \\ 24 | \addlinespace 25 | $P_0$ & -31.23 & -32.90 \\ 26 | \addlinespace 27 | Betw. & -18.28 & -24.38 \\ 28 | \addlinespace 29 | $P_4$ & {\bf -14.24} & {\bf -7.55} \\ 30 | \bottomrule 31 | \end{tabular} 32 | \caption{Arb. Nav.: Cumulative Return} 33 | \end{table} 34 | \end{column} 35 | \begin{column}{.45\textwidth} 36 | \begin{table} 37 | \centering 38 | \small 39 | \begin{tabular}{ l r r } %{@{} p{.25\linewidth} p{.2\linewidth} p{.2\linewidth} p{0.2\linewidth} } 40 | \toprule 41 | Scheme & \multicolumn{2}{c @{}}{Options (40,000 epochs)} \\ 42 | \cmidrule(l){2-3} 43 | & {100} & {200} \\ %& {400} \\ 44 | \toprule 45 | None & -16.90 & -16.90 \\ %& -3043.60 \\ 46 | \addlinespace % 47 | $P_0$ & -17.68 & -18.83 \\ %& -8304.05 \\ 48 | \addlinespace % 49 | Betw. & {\bf 80.59} & {\bf 80.48} \\ %& 14841.15 \\ 50 | \addlinespace % 51 | $P_{0.75}$ & -7.55 & 0.66 \\ %& 22605.01 \\ 52 | % \addlinespace % 53 | % Betw. + \\$P_{0.75}$ & 12.69 & 15.12 \\ %& {\bf 23168.17} \\ 54 | \bottomrule 55 | \end{tabular} 56 | \caption{Taxi: Cumulative Return} 57 | \end{table} 58 | \end{column} 59 | \end{columns} 60 | 61 | % Comments 62 | \begin{itemize} 63 | \item Experiments were run for 40,000 epochs using MacroQ. We compared 64 | options generated using a bottleneck based method (betweenness), 65 | randomly distributed options ($P_0$), and small world options ($P_{r 66 | > 0}$). 67 | \item Bottleneck-based methods have a natural advantage in the Taxi 68 | domain, as goal states coincide with bottleneck states (the {\tt 69 | pick-up} and {\tt put-down} actions). 70 | \item Small-world options do very well on free-navigation tasks (Rooms 71 | or Arbitrary Navigation), even in the presence of bottlenecks 72 | (Rooms). Combining bottleneck-based options and small world 73 | options can outperform both (Rooms). 
74 | \end{itemize} 75 | \end{block} 76 | 77 | \vfill 78 | \begin{block}{Role of the Exponent $r$ (Rooms)} 79 | \begin{columns} 80 | \begin{column}{.49\textwidth} 81 | \begin{figure}[h] 82 | \centering 83 | \fbox{\includegraphics[height=3in]{figures/rooms-exp}} 84 | \label{fig:rooms-exponent} 85 | %\caption{Rooms} 86 | \end{figure} 87 | \end{column} 88 | \begin{column}{.39\textwidth} 89 | \begin{itemize} 90 | \item The basic structure of the the Rooms state spaces is 2D, 91 | yet exponents around 1 perform optimally. This difference 92 | is likely due to obstacles (walls). 93 | \item The existance of a maximal value for the exponent, as well 94 | the behaviour for exponents greater than it, matches what is 95 | seen in the social networks scenario. 96 | \end{itemize} 97 | \end{column} 98 | \end{columns} 99 | \end{block} 100 | 101 | \vfill 102 | \begin{block}{Conclusions} 103 | \begin{itemize} 104 | \item We give an algorithm to generate a random collection of options 105 | $\options$ such that any ``task'' in an MDP can be performed in 106 | $O( ( \log |\states| )^2 )$ decisions. 107 | \item We find that these options significantly outperform 108 | bottleneck-based options and purely random options. 109 | \end{itemize} 110 | \end{block} 111 | 112 | \vfill 113 | \begin{block}{Future Work} 114 | \begin{itemize} 115 | \item By using `cheaply' learnt policies could the total training time 116 | for small world options compare to say that for the case of 117 | betweenness? 118 | \item Given the loose conditions for the theorem to hold, could function 119 | approximators be used inplace of the complete MDP? 120 | \item Could the bounded number of decisions required translate to any 121 | theoretical guarantees on faster convergence? 122 | \end{itemize} 123 | \end{block} 124 | -------------------------------------------------------------------------------- /src/Environment.py: -------------------------------------------------------------------------------- 1 | """ 2 | RL Framework 3 | Authors: Arun Chaganty 4 | Environment Base Class; Represented by an MDP 5 | """ 6 | 7 | import random 8 | import numpy as np 9 | import networkx as nx 10 | import util 11 | import sys 12 | import pdb 13 | 14 | from ProgressBar import ProgressBar 15 | 16 | class Environment: 17 | """Environment represented as an MDP""" 18 | domain = None 19 | S = 0 20 | A = 0 21 | P = [] 22 | R = {} 23 | R_bias = 0 24 | Q = [] 25 | 26 | state = 0 27 | 28 | def __init__( self, domain, S, A, P, R, R_bias, start_set, end_set ): 29 | self.domain = domain 30 | self.S = S 31 | self.A = A 32 | self.P = P 33 | self.R = R 34 | self.R_bias = R_bias 35 | self.start_set = start_set 36 | self.end_set = end_set 37 | 38 | # State action set for MDP 39 | Q = [] 40 | for s in xrange(self.S): 41 | Q.append( tuple( ( a for a in xrange( self.A ) if len( self.P[ a ][ s ] ) > 0 ) ) ) 42 | self.Q = Q 43 | 44 | def start( self ): 45 | """Calls _start - this is to support Options later""" 46 | return self._start() 47 | 48 | def _start(self): 49 | """Initialise the Environment 50 | @returns initial state and valid actions 51 | """ 52 | if self.start_set: 53 | state = random.choice( self.start_set ) 54 | else: 55 | state = np.random.randint( self.S ) 56 | while len( self.Q[ state ] ) == 0: 57 | state = np.random.randint( self.S ) 58 | self.state = state 59 | 60 | return state 61 | 62 | def react(self, action): 63 | return self._react( action ) 64 | 65 | def _react(self, action): 66 | state = util.choose( self.P[ action ][ self.state ] ) 67 | reward = 
self.R.get( (self.state, state), 0 ) + self.R_bias 68 | 69 | # If there is no way to get out of this state, the episode has ended 70 | if self.end_set is not None: 71 | episode_ended = state in self.end_set 72 | else: 73 | episode_ended = len( self.Q[ state ] ) == 0 74 | 75 | if episode_ended: 76 | state = self._start() 77 | self.state = state 78 | 79 | return state, reward, episode_ended 80 | 81 | def to_graph( self ): 82 | """Create a graph from the MDP environment""" 83 | 84 | graph = nx.MultiDiGraph() 85 | # Add all states as nodes 86 | for i in xrange( self.S ): 87 | graph.add_node( i ) 88 | for a in xrange( self.A ): 89 | # Add pr-edges for each action 90 | for i in xrange( self.S ): 91 | for (j,pr) in self.P[ a ][ i ]: 92 | graph.add_edge( i, j, pr = pr, action = a ) 93 | 94 | return graph 95 | 96 | def to_dot( self ): 97 | """Create a graph from the MDP environment""" 98 | 99 | s = "" 100 | s += "# Autogenerated rl-domain graph\n" 101 | s += "digraph{ \n" 102 | 103 | # Add a node for all states 104 | for i in xrange( self.S ): 105 | s += '%d [label=""];\n'%( i ) 106 | # Add pr-edges 107 | for a in xrange( self.A ): 108 | # Add pr-edges for each action 109 | for i in xrange( self.S ): 110 | for (j,pr) in self.P[ a ][ i ]: 111 | s += "%d -> %d;\n"%( i, j ) 112 | s += "}\n" 113 | return s 114 | 115 | class Option: 116 | r"""Encapsulates an option: I, \pi, \beta""" 117 | I = set([]) 118 | pi = {} 119 | B_ = {} 120 | 121 | def __init__( self, I, pi, B ): 122 | self.I = I 123 | self.pi = pi 124 | self.B_ = B 125 | 126 | def __repr__(self): 127 | return "[Option: %s]"%( id( self ) ) 128 | 129 | def can_start( self, state ): 130 | return state in self.I 131 | 132 | def act( self, state ): 133 | action = util.choose( self.pi[ state ] ) 134 | return action 135 | 136 | def B( self, state ): 137 | if state in self.B_: 138 | return self.B_[ state ] 139 | elif state in self.pi and len( self.pi[ state ] ) > 0: 140 | return 0.0 141 | else: 142 | return 1.0 143 | 144 | def should_stop( self, state ): 145 | b = self.B( state ) 146 | if b == 1.0: 147 | return True 148 | elif b == 0.0: 149 | return False 150 | elif np.random.random() < b: 151 | return True 152 | else: 153 | return False 154 | 155 | class OptionEnvironment( Environment ): 156 | """ 157 | Environment that also supports options defines a graph structure 158 | Note: We don't save actions as options from an efficiency standpoint. 
159 | """ 160 | O = [] 161 | 162 | def __init__( self, domain, S, A, P, R, R_bias, start_set, end_set, O ): 163 | Environment.__init__( self, domain, S, A, P, R, R_bias, start_set, end_set ) 164 | self.O = O 165 | 166 | # Update the Q function based on the options we now have 167 | Q = [] 168 | for s in xrange(self.S): 169 | actions = tuple( ( a for a in xrange( self.A ) if len( self.P[ a ][ s ] ) > 0 ) ) 170 | options = tuple( ( o for o in O if s in o.I ) ) 171 | Q.append( actions + options ) 172 | self.Q = Q 173 | 174 | def react( self, action ): 175 | """ 176 | React to action 177 | @returns new state and valid actions, and reward, and if episode has 178 | ended 179 | """ 180 | 181 | if isinstance( action, Option ): 182 | option = action 183 | history = [] 184 | rewards = [] 185 | 186 | # Act according to the option 187 | action = option.act( self.state ) 188 | history.append( ( self.state, action ) ) 189 | 190 | state, reward, episode_ended = self._react( action ) 191 | rewards.append( reward ) 192 | 193 | while not episode_ended and not option.should_stop( state ): 194 | # Use the option policy 195 | action = option.act( state ) 196 | history.append( ( state, action ) ) 197 | 198 | state, reward, episode_ended = self._react( action ) 199 | rewards.append( reward ) 200 | 201 | history.append( (state, None) ) 202 | 203 | return history, tuple(rewards), episode_ended 204 | 205 | else: 206 | return self._react( action ) 207 | 208 | -------------------------------------------------------------------------------- /doc/ewrl-poster/beamerthemeI6pd2.sty: -------------------------------------------------------------------------------- 1 | \ProvidesPackage{beamerthemeI6pd2} % this style was created by Thomas Deselaers an Philippe Dreuw 2 | 3 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 4 | \RequirePackage{tangocolors} 5 | \selectcolormodel{cmyk} 6 | \mode<presentation> 7 | 8 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 9 | 10 | \setbeamercolor{headline}{fg=tabutter,bg=black} 11 | \setbeamercolor{footline}{fg=tabutter, bg=ta3gray} 12 | \setbeamerfont{footline}{size=\large,series=\tt} 13 | \setbeamercolor{separation line}{bg=ta2orange} 14 | \setbeamercolor{title in headline}{fg=tabutter} 15 | \setbeamercolor{author in headline}{fg=ta2orange} 16 | \setbeamercolor{institute in headline}{fg=ta3orange} 17 | 18 | \setbeamercolor{framesubtitle}{fg=ta3orange, bg=ta2gray} 19 | \setbeamercolor{author in head/foot}{fg=ta2orange, bg=black} 20 | \setbeamercolor{title in head/foot}{fg=ta2orange, bg=black} 21 | 22 | \setbeamercolor*{normal text}{fg=tachameleon, bg=ta3gray} 23 | \setbeamercolor*{block body}{bg=ta3aluminium,fg=black} 24 | \setbeamercolor*{block title}{fg=taorange,bg=ta2gray} 25 | \setbeamerfont{block title}{size=\large,series=\bf} 26 | \setbeamercolor{upper separation line head}{fg=ta2orange} 27 | 28 | \setbeamercolor*{example body}{fg=ta3aluminium,bg=black} 29 | \setbeamercolor*{example text}{fg=ta3aluminium,bg=black} 30 | \setbeamercolor*{example title}{bg=taorange,fg=ta2gray} 31 | 32 | %\setbeamercolor{alerted text}{fg=ta3gray} 33 | 34 | %\setbeamercolor{example text}{fg=taorange} 35 | \setbeamercolor{structure}{fg=ta3skyblue} 36 | 37 | \setbeamertemplate{itemize items}[triangle] 38 | \setbeamertemplate{navigation symbols}{} % no navigation on a poster 39 | 40 | \newcommand{\thispdfpagelabel}[1]{} % Hack to prevent errors 41 | 42 | 
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 43 | \setbeamertemplate{block begin}{ 44 | \vskip.75ex 45 | \begin{beamercolorbox}[ht=3.5ex,dp=0.5ex,center,leftskip=-1em,colsep*=.75ex]{block title}% 46 | \usebeamerfont*{block title}% 47 | {\phantom{Gg}\insertblocktitle}% phantom because of baseline problem 48 | \end{beamercolorbox}% 49 | {\ifbeamercolorempty[bg]{block body}{}{\nointerlineskip\vskip-0.5pt}}% 50 | \usebeamerfont{block body}% 51 | \begin{beamercolorbox}[leftskip=1em,colsep*=.75ex,sep=0.5ex,vmode]{block body}% 52 | \ifbeamercolorempty[bg]{block body}{\vskip-.25ex}{\vskip-.75ex}\vbox{}% 53 | } 54 | \setbeamertemplate{block end}{ 55 | \end{beamercolorbox} 56 | } 57 | 58 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 59 | \setbeamertemplate{headline}{ 60 | \leavevmode 61 | 62 | \begin{beamercolorbox}[wd=\paperwidth]{headline} 63 | \begin{columns}[T] 64 | \begin{column}{.02\paperwidth} 65 | \end{column} 66 | \begin{column}{.15\paperwidth} 67 | \vskip6ex 68 | \begin{center} 69 | \includegraphics[height=1.5in]{figures/iit-madras.png} %.95\linewidth 70 | \end{center} 71 | \vskip2ex 72 | \end{column} 73 | \begin{column}{.65\paperwidth} 74 | \vskip4ex 75 | \center 76 | \usebeamercolor{title in headline}{\color{fg}\textbf{\LARGE{\inserttitle}}\\[1ex]} 77 | \usebeamercolor{author in headline}{\color{fg}\large{\insertauthor}\\[1ex]} 78 | \usebeamercolor{institute in headline}{\color{fg}\large{\insertinstitute}\\[1ex]} 79 | \vskip4ex 80 | \end{column} 81 | \begin{column}{.15\paperwidth} 82 | \vskip10ex 83 | \begin{center} 84 | \includegraphics[height=1.0in]{figures/RISE.pdf} %.95\linewidth 85 | \end{center} 86 | \vskip2ex 87 | \end{column} 88 | \begin{column}{.02\paperwidth} 89 | \end{column} 90 | \end{columns} 91 | \vskip2ex 92 | \end{beamercolorbox} 93 | 94 | \begin{beamercolorbox}[wd=\paperwidth]{lower separation line head} 95 | \rule{0pt}{3pt} 96 | \end{beamercolorbox} 97 | } 98 | 99 | % \setbeamertemplate{headline}{ 100 | % \leavevmode 101 | % \begin{beamercolorbox}[sep=0.5cm,wd=.8\paperwidth]{headline} 102 | % \usebeamercolor{title in headline}{\raggedleft\color{fg}\textbf{\LARGE{\inserttitle}}\\[1ex]} 103 | % \usebeamercolor{author in headline}{\raggedleft\color{fg}\large{\insertauthor}\\[1ex]} 104 | % \usebeamercolor{institute in headline}{\raggedleft\color{fg}\large{\insertinstitute}\\[1ex]} 105 | % \end{beamercolorbox}% 106 | % \begin{beamercolorbox}[wd=.2\paperwidth]{logo in headline} 107 | % \centering 108 | % \LARGE{LOGO} 109 | % \end{beamercolorbox} 110 | 111 | 112 | %} 113 | 114 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 115 | \setbeamertemplate{footline}{ 116 | \begin{beamercolorbox}[wd=\paperwidth]{upper separation line foot} 117 | \rule{0pt}{3pt} 118 | \end{beamercolorbox} 119 | 120 | \leavevmode% 121 | \begin{beamercolorbox}[ht=4ex,leftskip=1em,rightskip=1em]{author in head/foot}% 122 | %\texttt{Indian Institute of Technology Madras, Chennai, India} 123 | \texttt{http://rise.cse.iitm.ac.in/wiki/} 124 | \hfill 125 | \hfill{Created with \LaTeX \texttt{beamerposter} \hskip1em} 126 | \vskip1ex 127 | \end{beamercolorbox} 128 | \vskip0pt% 129 | \begin{beamercolorbox}[wd=\paperwidth]{lower separation line foot} 130 | \rule{0pt}{3pt} 131 | \end{beamercolorbox} 132 | } 133 | 134 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 135 | % Display a grid to help 
align images ... and it looks nice with this color scheme
136 | \beamertemplategridbackground[1cm]
137 | 
138 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
139 | \mode<all>
140 | 
141 | % adapt height of itemize rectangles
142 | \setbeamertemplate{itemize items}[triangle]
143 | \setbeamertemplate{itemize item}{\raisebox{0.12ex}{$\blacktriangleright$}\hskip0.1em}
144 | \setbeamertemplate{itemize subitem}{\raisebox{0.12ex}{$\triangleright$}\hskip0.1em}
145 | % or define your own template using \defbeamertemplate{itemize item}, see beameruserguide.pdf
146 | 
147 | % equal font sizes for all levels
148 | \setbeamerfont{itemize/enumerate body}{size=\normalsize}
149 | \setbeamerfont{itemize/enumerate subbody}{size=\normalsize}
150 | \setbeamerfont{itemize/enumerate subsubbody}{size=\normalsize}
151 | 
152 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
153 | %%% Local Variables: 
154 | %%% mode: latex
155 | %%% TeX-PDF-mode: t
156 | %%% TeX-master: "poster-surf"
157 | %%% End: 
158 | 
-------------------------------------------------------------------------------- /doc/aamas/intro.tex: --------------------------------------------------------------------------------
1 | \section{Introduction}
2 | \label{sec:intro}
3 | 
4 | % RL - challenges - need for structure
5 | Reinforcement learning (RL) is a widely studied learning framework for
6 | autonomous agents, particularly because of its extreme generality; it
7 | addresses the problem of learning optimal agent behaviour in an unknown
8 | stochastic environment. In this setting, an agent explores a state
9 | space, receiving rewards for actions it takes; the objective of the
10 | agent is to maximise its rewards accumulated over time. However, when
11 | scaling up to larger domains, these agents require prohibitively large
12 | amounts of experience in order to learn a good policy. By allowing the
13 | agent to exploit the structure of the environment or task, we can reduce the
14 | experience required.
15 | 
16 | % Types of structure - temporal abstractions - options
17 | Structure can be imposed on a learning task through either spatial or
18 | temporal abstractions. With the former, the state-space is minimised
19 | using information about the symmetries present in the domain. Spatial
20 | abstractions have been surveyed in \cite{Li2006}. In the latter case,
21 | high-level actions are introduced which capture sequences of primitive
22 | actions. In this light, temporal abstractions capture the notion of
23 | a ``subtask''. The most common approach for temporal abstractions is the
24 | options framework proposed by Sutton, Precup and Singh
25 | \cite{SuttonPrecupSingh1999}, and we build our work on this framework
26 | as well. Work by Ravindran and Barto on relativised options
27 | \cite{Ravindran2003} shows how temporal abstractions can be combined with
28 | spatial abstractions. Both spatial and temporal abstractions play an
29 | important role in transfer learning, where we wish to extend optimal
30 | behaviour learnt in one task to another task; a survey of such
31 | techniques can be found in \cite{Taylor2009a}.
32 | 
33 | % Getting options - related work - deficiency
34 | While options provide a broad framework for temporal abstraction, there
35 | is still no consensus on how to choose subtasks. The prevalent view is
36 | that subtasks should represent skills, i.e.
partially defined action
37 | policies that constitute a part of many reinforcement learning problems
38 | \cite{Thrun1995}. For this reason, much of the existing work centres
39 | on identifying `bottlenecks', regions that the agent tends to visit
40 | frequently \cite{McGovern2001}, either empirically as in
41 | \cite{McGovern2001}, or, more recently, using graph theoretic methods
42 | like betweenness centrality \cite{Simsek2008} or graph partitions
43 | \cite{Menache2002}. The intuition is that options that navigate an agent
44 | to such states help the agent move between strongly connected
45 | components, thus leading to efficient exploration.
46 | 
47 | These option generation schemes suffer from two serious drawbacks: (i)
48 | they either require complete knowledge of the MDP, or construct a local
49 | model from trajectories, a sample-heavy approach, and (ii) options to
50 | bottlenecks can be initiated at any state, leading to a blowup in the
51 | decision space, which might cause the agent to take more time to learn
52 | the task as it sorts through the unnecessary options.
53 | 
54 | If one considers these options as additional edges to the bottleneck
55 | states, in the sense that a single decision is sufficient to take the
56 | agent from a state to the bottleneck, the resultant state-interaction
57 | graph becomes ``more'' connected. To highlight the importance of
58 | the connectivity of the state-interaction graph, consider the Markov
59 | chain induced by a policy for a Markov decision process. It is well
60 | known that the convergence rate of a Markov chain (mixing time) is
61 | directly related to its conductance \cite{Jerrum1988}, and thus its
62 | algebraic connectivity.
63 | 
64 | % Motivation for small world
65 | Recognising the importance of connectivity, we try to apply concepts
66 | from Kleinberg's work on small world networks to the context of problem
67 | solving with autonomous agents. These graphs have been shown to have
68 | exceptionally high algebraic connectivity, and thus fast Markov chain
69 | mixing times \cite{Salehi2007}. In a small-world network, each node
70 | has one non-neighbouring edge, which connects it to another node with
71 | a probability inversely proportional to the distance between them. With
72 | this simple construction, Kleinberg showed that an agent can discover
73 | a short path to any destination using only local information like the
74 | coordinates of its immediate neighbours \cite{Kleinberg2000}. In
75 | contrast, other graph models with a small diameter only state the
76 | existence of a short path, but do not guarantee that an agent would be
77 | able to find such a path.
78 | 
79 | % Small-world networks have found diverse applications from sensor
80 | % networks, to load balancing, to swarms \cite{Saber2005}.
81 | In our context, we construct subtasks distributed according to the small
82 | world distribution as follows: create an option that will take the agent
83 | from a state $s$ to another state $s'$ with a probability inversely
84 | proportional to the distance between $s$ and $s'$. We prove that this
85 | set of subtasks enables the agent to easily solve any task by using only
86 | a logarithmic number of options to reach a state of maximal value
87 | (\secref{sec:theory}). As this scheme adds at most one additional option
88 | per state, we do not explode the decision space for the agent.
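To make ``inversely proportional to the distance'' precise, here is a short editorial aside (following Kleinberg's lattice analysis, and not text from the paper) spelling out the normalised sampling distribution and why the exponent matters:
\[
  \Pr[\,s \mbox{ is short-circuited to } s'\,] \;=\; \frac{d(s,s')^{-r}}{\sum_{s'' \neq s} d(s,s'')^{-r}}.
\]
On a $k$-dimensional lattice with $n$ states (side length $n^{1/k}$), the number of states at distance $j$ from $s$ grows as $\Theta(j^{k-1})$, so at the critical exponent $r = k$ the normaliser is
\[
  \sum_{s'' \neq s} d(s,s'')^{-k} \;=\; \Theta\!\Big( \sum_{j=1}^{n^{1/k}} j^{k-1} \cdot j^{-k} \Big) \;=\; \Theta( \log n ).
\]
Each long-range option therefore roughly halves the remaining distance to the target with probability $\Omega(1/\log n)$, which is what underlies the $O((\log |\states|)^2)$ bound on the number of decisions.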
89 | 
90 | Furthermore, in \secref{sec:algo}, we devise an algorithm that learns
91 | small world options from optimal policies learnt for only a few tasks in
92 | the domain. Thus, not only are small world options effective to use, they
93 | are also simple to learn, and do not require any global analysis of the
94 | MDP. Experiments on several standard domains show that small world
95 | options outperform bottleneck-based methods, and that small world
96 | options require far fewer learning epochs to be effective.
97 | 
98 | The remainder of the paper is organised as follows. We present an
99 | overview of reinforcement learning and the options framework in
100 | \secref{sec:background}. We then define a small world option, and prove
101 | that given such options, an agent needs to use only a logarithmic
102 | number of them to perform a task in \secref{sec:theory}. From a more
103 | practical perspective, we present an algorithm to extract these options
104 | from optimal policies learnt on several tasks in the domain in
105 | \secref{sec:algo}. We present our experimental results in
106 | \secref{sec:experiments}. Finally, we conclude in
107 | \secref{sec:conclusions}, where we present future directions for our
108 | work. \appendixref{sec:small-world-theory} contains an extension of
109 | Kleinberg's proof of the distributed search property of small-world
110 | networks, which is used in \secref{sec:theory}.
111 | 
112 | 
--------------------------------------------------------------------------------
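As a closing editorial illustration, the MacroQ update quoted in the background section can be written directly against the data structures in src/Environment.py above. This is only a sketch, not the repository's Agents/MacroQ.py: the name macro_q_backup is hypothetical, Q is assumed to be a defaultdict keyed by (state, action-or-option) pairs, and valid(s) is assumed to return the actions and options available in state s (e.g. env.Q[s]).

from collections import defaultdict

def macro_q_backup(Q, option, history, rewards, gamma, alpha, valid):
    """One MacroQ backup after an option finishes.

    `history` and `rewards` are as returned by OptionEnvironment.react when it
    is passed an Option: history is [(s_0, a_0), ..., (s_k, None)] and rewards
    holds the k primitive rewards collected while the option ran.
    """
    s0 = history[0][0]        # state where the option was chosen
    s_end = history[-1][0]    # state where the option terminated
    k = len(rewards)          # number of primitive steps taken
    # Discounted reward accumulated during the option (the `r` in the update rule).
    r = sum(gamma ** i * ri for i, ri in enumerate(rewards))
    choices = valid(s_end)
    best_next = max(Q[(s_end, o_)] for o_ in choices) if choices else 0.0
    Q[(s0, option)] += alpha * (r + gamma ** k * best_next - Q[(s0, option)])

# Hypothetical usage with the OptionEnvironment defined above:
#   Q = defaultdict(float)
#   history, rewards, done = env.react(some_option)
#   macro_q_backup(Q, some_option, history, rewards, gamma=0.99, alpha=0.8,
#                  valid=lambda s: env.Q[s])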