├── fathom ├── imagenet │ ├── __init__.py │ ├── imagenet.py │ ├── mnist.py │ └── image_processing.py ├── vgg │ ├── __init__.py │ └── vgg.py ├── deepq │ ├── __init__.py │ ├── roms │ │ └── .gitignore │ ├── emulator.py │ ├── database.py │ └── deepq.py ├── memnet │ ├── __init__.py │ ├── data_utils.py │ └── memnet.py ├── speech │ ├── __init__.py │ ├── phoneme.py │ ├── speech.py │ └── preproc.py ├── alexnet │ ├── __init__.py │ └── alexnet.py ├── seq2seq │ ├── __init__.py │ ├── data_utils.py │ └── seq2seq.py ├── autoenc │ ├── __init__.py │ ├── variational.py │ └── autoenc.py ├── residual │ ├── __init__.py │ └── residual.py ├── __init__.py ├── dataset.py └── nn.py ├── fathom.png ├── docs ├── index.md ├── assets │ ├── water-header.png │ ├── fix_search.js │ └── fathom.css ├── models.md ├── faq.md └── quickstart.md ├── .gitignore ├── setup.cfg ├── MANIFEST.in ├── runtest.sh ├── mkdocs.yml ├── Dockerfile ├── test └── test_basics.py ├── setup.py ├── README.md ├── pylintrc └── LICENSE /fathom/imagenet/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /fathom/vgg/__init__.py: -------------------------------------------------------------------------------- 1 | from .vgg import VGG, VGGFwd 2 | -------------------------------------------------------------------------------- /fathom/deepq/__init__.py: -------------------------------------------------------------------------------- 1 | from .deepq import DeepQ, DeepQFwd 2 | -------------------------------------------------------------------------------- /fathom/memnet/__init__.py: -------------------------------------------------------------------------------- 1 | from .memnet import MemNet, MemNetFwd 2 | -------------------------------------------------------------------------------- /fathom/speech/__init__.py: -------------------------------------------------------------------------------- 1 | from .speech import Speech, SpeechFwd 2 | -------------------------------------------------------------------------------- /fathom.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rdadolf/fathom/HEAD/fathom.png -------------------------------------------------------------------------------- /fathom/alexnet/__init__.py: -------------------------------------------------------------------------------- 1 | from .alexnet import AlexNet, AlexNetFwd 2 | -------------------------------------------------------------------------------- /fathom/seq2seq/__init__.py: -------------------------------------------------------------------------------- 1 | from .seq2seq import Seq2Seq, Seq2SeqFwd 2 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | # Fathom 2 | Reference workloads for modern deep learning. 
3 | -------------------------------------------------------------------------------- /fathom/autoenc/__init__.py: -------------------------------------------------------------------------------- 1 | from .variational import Autoenc, AutoencFwd 2 | -------------------------------------------------------------------------------- /fathom/residual/__init__.py: -------------------------------------------------------------------------------- 1 | from .residual import Residual, ResidualFwd 2 | -------------------------------------------------------------------------------- /docs/assets/water-header.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rdadolf/fathom/HEAD/docs/assets/water-header.png -------------------------------------------------------------------------------- /fathom/deepq/roms/.gitignore: -------------------------------------------------------------------------------- 1 | # Licensing prevents us from distributing ROMs directly. 2 | * 3 | !.gitignore 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | .*.swp 3 | .DS_Store 4 | 5 | Fathom_Workloads* 6 | 7 | deploy/ 8 | build/ 9 | dist/ 10 | 11 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [nosetests] 2 | # By default, the logging level is apparently "NOTSET" (0), which causes a *ton* 3 | # of useless tensorflow output. 4 | logging-level: DEBUG 5 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include Dockerfile 2 | include fathom.png 3 | include pylintrc 4 | include runtest.sh 5 | 6 | include README.md 7 | include mkdocs.yml 8 | recursive-include docs *.md *.png *.js *.css 9 | -------------------------------------------------------------------------------- /fathom/__init__.py: -------------------------------------------------------------------------------- 1 | from .deepq import * 2 | from .speech import * 3 | from .seq2seq import * 4 | from .autoenc import * 5 | from .memnet import * 6 | from .alexnet import * 7 | from .vgg import * 8 | from .residual import * 9 | -------------------------------------------------------------------------------- /runtest.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | die() { echo "ERROR: $@"; exit 1; } 3 | pylint -rn fathom/ || die 'Lint check failed for fathom/' 4 | pylint -rn test/ || die 'Lint check failed for test/' 5 | nosetests -v test/ || die 'Regression tests failed' 6 | -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: Fathom 2 | repo_url: https://github.com/rdadolf/fathom 3 | site_description: 'Fathom: Reference Workloads for Modern Deep Learning Methods' 4 | google_analytics: ['UA-29492860-5', 'auto'] 5 | 6 | theme: 'readthedocs' 7 | site_dir: 'deploy' 8 | extra_css: [assets/fathom.css] 9 | extra_js: [assets/fix_search.js] 10 | 11 | pages: 12 | - 'Overview': index.md 13 | - 'Getting Started': quickstart.md 14 | - 'Models': models.md 15 | - 'Frequently Asked Questions': faq.md 16 | 
-------------------------------------------------------------------------------- /fathom/dataset.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os 4 | import tensorflow as tf 5 | 6 | class Dataset(object): 7 | """Simple wrapper for a dataset. 8 | 9 | Inspired by David Dao's TensorFlow models code. 10 | """ 11 | def __init__(self, subset, record_dir): 12 | """ 13 | record_dir: Directory with TFRecords. 14 | """ 15 | self.subset = subset 16 | self.record_dir = record_dir 17 | 18 | def data_files(self): 19 | return tf.gfile.Glob(os.path.join(self.record_dir, "{}-*".format(self.subset))) 20 | 21 | def record_queue(self): 22 | """Return a TensorFlow queue of TFRecords.""" 23 | return tf.train.string_input_producer(self.data_files()) 24 | 25 | def reader(self): 26 | return tf.TFRecordReader() 27 | 28 | -------------------------------------------------------------------------------- /docs/assets/fix_search.js: -------------------------------------------------------------------------------- 1 | $(document).ready(fixSearch); 2 | 3 | function fixSearch() { 4 | var target = document.getElementById('rtd-search-form'); 5 | var config = {attributes: true, childList: true}; 6 | 7 | var observer = new MutationObserver(function(mutations) { 8 | // if it isn't disconnected it'll loop infinitely because the observed element is modified 9 | observer.disconnect(); 10 | var form = $('#rtd-search-form'); 11 | form.empty(); 12 | form.attr('action', 'https://' + window.location.hostname + '/en/' + determineSelectedBranch() + '/search.html'); 13 | $('').attr({ 14 | type: "text", 15 | name: "q", 16 | placeholder: "Search docs" 17 | }).appendTo(form); 18 | }); 19 | // don't run this outside RTD hosting 20 | if (window.location.origin.indexOf('readthedocs') > -1) { 21 | observer.observe(target, config); 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM tensorflow/tensorflow:0.8.0rc0-devel 2 | MAINTAINER Bob Adolf 3 | 4 | RUN apt-get update 5 | 6 | ### Software required for Fathom 7 | RUN apt-get install -y python-scipy 8 | RUN pip install scikit-learn 9 | RUN pip install librosa 10 | RUN apt-get install -y libhdf5-dev 11 | RUN pip install h5py 12 | 13 | # ALE 14 | RUN apt-get install -y libsdl1.2-dev libsdl-gfx1.2-dev libsdl-image1.2-dev cmake 15 | RUN git clone https://github.com/mgbellemare/Arcade-Learning-Environment.git /tmp/ALE 16 | RUN mkdir /tmp/build && cd /tmp/build && \ 17 | cmake -DUSE_SDL=ON -DUSE_RLGLUE=OFF /tmp/ALE && make 18 | RUN cd /tmp/ALE && pip install . 19 | # OpenCV 20 | RUN apt-get install -y libopencv-dev python-opencv 21 | 22 | ### Create a Fathom working environment 23 | RUN mkdir /data 24 | RUN useradd -ms /bin/bash fathom 25 | RUN chown fathom /data 26 | RUN chmod a+rwx /data 27 | USER fathom 28 | WORKDIR /home/fathom 29 | RUN git clone https://github.com/rdadolf/fathom.git 30 | ENV PYTHONPATH /home/fathom/fathom 31 | 32 | -------------------------------------------------------------------------------- /fathom/speech/phoneme.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | Python representation of 61 TIMIT phonemes from DOC/PHONCODE.DOC. 
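Usage note (based on the tables defined below): phoneme2index_dict maps a phoneme symbol to its integer index and index2phoneme_dict inverts the mapping, e.g. phoneme2index_dict['bcl'] == 1 and index2phoneme_dict[1] == 'bcl'.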
5 | """ 6 | 7 | # list of 61 TIMIT phonemes from phoncode.doc 8 | timit_phonemes = [ 9 | # stop (and corresponding closures) 10 | 'b', 'bcl', 11 | 'd', 'dcl', 12 | 'g', 'gcl', 13 | 'p', 'pcl', 14 | 't', 'tcl', # NOTE: typo of "tck" in TIMIT docs 15 | 'k', 'kcl', 16 | 'dx', 17 | 'q', 18 | 19 | # affricates 20 | 'jh', 21 | 'ch', 22 | 23 | # fricatives 24 | 's', 25 | 'sh', 26 | 'z', 27 | 'zh', 28 | 'f', 29 | 'th', 30 | 'v', 31 | 'dh', 32 | 33 | # nasals 34 | 'm', 35 | 'n', 36 | 'ng', 37 | 'em', 38 | 'en', 39 | 'eng', 40 | 'nx', 41 | 42 | # semivowels and glides 43 | 'l', 44 | 'r', 45 | 'w', 46 | 'y', 47 | 'hh', 48 | 'hv', 49 | 'el', 50 | 51 | # vowels 52 | 'iy', 53 | 'ih', 54 | 'eh', 55 | 'ey', 56 | 'ae', 57 | 'aa', 58 | 'aw', 59 | 'ay', 60 | 'ah', 61 | 'ao', 62 | 'oy', 63 | 'ow', 64 | 'uh', 65 | 'uw', 66 | 'ux', 67 | 'er', 68 | 'ax', 69 | 'ix', 70 | 'axr', 71 | 'ax-h', 72 | 73 | # others 74 | 'pau', 75 | 'epi', 76 | 'h#', 77 | 78 | # lexicon-only (thus omitted from transcriptions) 79 | #'1', 80 | #'2', 81 | ] 82 | 83 | # map phoneme to index 84 | phoneme2index_list = [(phoneme, index) for index, phoneme in enumerate(timit_phonemes)] 85 | phoneme2index_dict = dict(phoneme2index_list) 86 | 87 | index2phoneme_list = [(index, phoneme) for index, phoneme in enumerate(timit_phonemes)] 88 | index2phoneme_dict = dict(index2phoneme_list) 89 | 90 | -------------------------------------------------------------------------------- /docs/assets/fathom.css: -------------------------------------------------------------------------------- 1 | /* Fathom uses a light Open Sans for body and Gruppo for styled titles. */ 2 | @import 'https://fonts.googleapis.com/css?family=Gruppo|Open+Sans'; 3 | body, h1, h2, h3, h4, h5, h6, legend { 4 | font-family: 'Open Sans', 'Helvetica Neue', 'Helvetica'; 5 | font-weight: 300; 6 | } 7 | code { 8 | font-size: 95%; 9 | padding-top: 4px; 10 | } 11 | /* Style all links in Fathom blue. */ 12 | a, a:visited { 13 | color: #007da5; 14 | } 15 | /* Make the large headers more visible */ 16 | h1 { 17 | padding-bottom: 5px; 18 | /*border-left: solid #007da5 1px;*/ 19 | border-bottom: solid #007da5 1px; 20 | } 21 | 22 | /***** Navigation Header *****/ 23 | /* Remove the house icon near the title. */ 24 | a.icon-home:before { content: ''; } 25 | /* Give the title a stylized look. */ 26 | a.icon-home { /* Desktop */ 27 | color: #fcfcfc; 28 | font-family: 'Gruppo'; 29 | font-size: 30pt; 30 | font-weight: 400; 31 | margin-bottom: 5px; 32 | } 33 | .wy-nav-top a { /* Mobile */ 34 | color: #fcfcfc; 35 | font-family: 'Gruppo'; 36 | font-size: 24pt; 37 | font-weight: 400; 38 | margin-bottom: 5px; 39 | } 40 | 41 | /* Add water as a background on desktop, just Fathom blue on mobile. */ 42 | .wy-side-nav-search { /* Desktop */ 43 | background: url(water-header.png) #007da5; 44 | padding: 5px; 45 | } 46 | .wy-nav-top { 47 | background-color: #007da5; 48 | } 49 | 50 | /***** Navigation Body *****/ 51 | .wy-menu-vertical a { 52 | color: #b3b3b3; /* Back to the default */ 53 | } 54 | .wy-menu-vertical li.toctree-l3 a { 55 | color: #404040; /* Highlight the difference between h1's and h*'s */ 56 | } 57 | -------------------------------------------------------------------------------- /docs/models.md: -------------------------------------------------------------------------------- 1 | # Seq2Seq 2 | *Direct language-to-language sentence translation. State-of-the-art accuracy with a simple, language-agnostic architecture.* 3 | 4 | *
Documentation in progress* 5 | 6 | # MemNet 7 | *Facebook's memory-oriented neural system. One of two novel architectures which explore a topology beyond feed-forward lattices of neurons.* 8 | 9 | *Documentation in progress* 10 | 11 | # Speech 12 | *Baidu's speech recognition engine. Proved purely deep-learned networks can beat hand-tuned systems.* 13 | 14 | *Documentation in progress* 15 | 16 | # Autoenc 17 | *Variational autoencoder. An efficient, generative model for feature learning.* 18 | 19 | *Documentation in progress* 20 | 21 | # Residual 22 | *Image classifier from Microsoft Research Asia. Dramatically increased the practical depth of convolutional networks. ILSVRC 2015 winner.* 23 | 24 | *Documentation in progress* 25 | 26 | # VGG 27 | *Image classifier demonstrating the power of small convolutional filters. ILSVRC 2014 winner.* 28 | 29 | *Documentation in progress* 30 | 31 | # AlexNet 32 | *Image classifier. Watershed for deep learning by beating hand-tuned image systems at ILSVRC 2012.* 33 | 34 | *Documentation in progress* 35 | 36 | # DeepQ 37 | *Atari-playing neural network from DeepMind. Achieves superhuman performance on majority of Atari2600 games, without any preconceptions.* 38 | 39 | *Documentation in progress
* 40 | -------------------------------------------------------------------------------- /test/test_basics.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | class TestBasics(unittest.TestCase): 4 | def test_import(self): 5 | modelnames = ['Speech','DeepQ','Seq2Seq','Autoenc','MemNet','Residual','VGG','AlexNet'] 6 | import fathom 7 | for modelname in modelnames: 8 | assert hasattr(fathom,modelname), 'No model named "'+str(modelname)+'" found in fathom module.' 9 | for modelname in modelnames: 10 | modelname += 'Fwd' 11 | assert hasattr(fathom,modelname), 'No model named "'+str(modelname)+'" found in fathom module.' 12 | 13 | # FIXME: ALE load failure causes testing to abort. 14 | @unittest.SkipTest 15 | def test_create_deepq(self): 16 | from fathom import DeepQ, DeepQFwd 17 | model = DeepQ() 18 | model = DeepQFwd() 19 | 20 | def test_create_speech(self): 21 | from fathom import Speech, SpeechFwd 22 | model = Speech() 23 | model = SpeechFwd() 24 | 25 | def test_create_seq2seq(self): 26 | from fathom import Seq2Seq, Seq2SeqFwd 27 | model = Seq2Seq() 28 | model = Seq2SeqFwd() 29 | 30 | def test_create_autoenc(self): 31 | from fathom import Autoenc, AutoencFwd 32 | model = Autoenc() 33 | model = AutoencFwd() 34 | 35 | def test_create_memnet(self): 36 | from fathom import MemNet, MemNetFwd 37 | model = MemNet() 38 | model = MemNetFwd() 39 | 40 | def test_create_residual(self): 41 | from fathom import Residual, ResidualFwd 42 | model = Residual() 43 | model = ResidualFwd() 44 | 45 | def test_create_vgg(self): 46 | from fathom import VGG, VGGFwd 47 | model = VGG() 48 | model = VGGFwd() 49 | 50 | def test_create_alexnet(self): 51 | from fathom import AlexNet, AlexNetFwd 52 | model = AlexNet() 53 | model = AlexNetFwd() 54 | 55 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import find_packages, setup 2 | 3 | # The hyphens in release candidates (RCs) will automatically be normalized. 4 | # But we normalize below manually anyway. 5 | _VERSION = '1.0-rc0' 6 | 7 | # TODO: Add version numbers. 8 | REQUIRED_PACKAGES = [ 9 | 'scipy', 10 | 'tensorflow >= 1.0.0', 11 | 'scikit-learn', 12 | 'librosa', # audio preprocessing 13 | 'h5py' 14 | ] 15 | 16 | setup(name='Fathom-Workloads', # "fathom" is already taken on PyPI 17 | description='Reference workloads for modern deep learning', 18 | url='http://github.com/rdadolf/fathom', 19 | 20 | version=_VERSION.replace('-', ''), 21 | 22 | # Authors: Robert Adolf, Saketh Rama, and Brandon Reagen 23 | # PyPI does not have an easy way to specify multiple authors. 24 | author="Saketh Rama", 25 | author_email="rama@seas.harvard.edu", 26 | 27 | # We don't use __file__, but mark False to be safe. 
28 | zip_safe=False, 29 | 30 | python_requires='>3.5', 31 | 32 | classifiers=[ 33 | 'Development Status :: 4 - Beta', 34 | 'Intended Audience :: Developers', 35 | 'Intended Audience :: Education', 36 | 'Intended Audience :: Science/Research', 37 | 38 | 'Topic :: Scientific/Engineering :: Artificial Intelligence', 39 | 'Topic :: Scientific/Engineering :: Image Recognition', 40 | 'Topic :: System :: Hardware', 41 | ], 42 | 43 | packages=find_packages(), # find packages in subdirectories 44 | 45 | package_data={'fathom': [ 46 | 'fathom.png', 47 | 48 | 'Dockerfile', 49 | 'pylintrc', 50 | 51 | 'README.md', 52 | 'mkdocs.yml', 53 | 54 | 'runtest.sh', 55 | 56 | 'setup.cfg', 57 | ]}, 58 | include_package_data=True, 59 | ) 60 | 61 | -------------------------------------------------------------------------------- /docs/faq.md: -------------------------------------------------------------------------------- 1 | # Functions are missing from `cv2` 2 | 3 | You've probably installed the wrong python library. Unfortunately, the `cv2` package in PyPI is not related to OpenCV at all. It's a name-squatter who has managed to upload a useless, empty package. There are a couple of ways to install OpenCV: 4 | 5 | 1. Install from source by following the directions on the [OpenCV website](http://docs.opencv.org/2.4.13/doc/tutorials/introduction/linux_install/linux_install.html#linux-installation). 6 | 2. Install via apt: `sudo apt-get install python-opencv`. 7 | 3. Install using Anaconda: `conda install opencv`. 8 | 9 | # DeepQ can't find a ROM, but it's right there! 10 | 11 | [DeepQ](/models/#deepq) currently looks for its ROMs relative to Fathom's root directory. 12 | This is a bit hacky, and it will cause problems if you run anywhere else, regardless of whether you're using Fathom from the command-line or as a module. 13 | We're planning on fixing this eventually, but in the meantime, there are two solutions: 14 | 15 | 1. Run from the Fathom root directory. 16 | 17 | This should work: 18 | ```sh 19 | $ git clone https://github.com/rdadolf/fathom.git 20 | $ cd fathom 21 | $ export PYTHONPATH=`pwd` 22 | $ ./fathom//.py 23 | ``` 24 | 25 | But this won't: 26 | ```sh 27 | $ git clone https://github.com/rdadolf/fathom.git /tmp/fathom 28 | $ export PYTHONPATH=/tmp/fathom 29 | $ /tmp/fathom/fathom//.py 30 | ``` 31 | 32 | 2. Edit [DeepQ](/models/#deepq) to point to an absolute path. 33 | 34 | The `ROM_PATH` variable in [emulator.py](https://github.com/rdadolf/fathom/blob/master/fathom/deepq/emulator.py) tells the model where to search for a ROM. 35 | If you replace this variable with the absolute path to fathom, you should be able to run it anywhere. 36 | For instance, this should work: 37 | 38 | ```sh 39 | $ git clone https://github.com/rdadolf/fathom.git /tmp/fathom 40 | ``` 41 | 42 | ```python 43 | # in /tmp/fathom/fathom/deepq/emulator.py: 44 | ROM_PATH='/tmp/fathom/fathom/deepq/roms/' 45 | ``` 46 | 47 | ```sh 48 | $ export PYTHONPATH=/tmp/fathom 49 | $ python /tmp/fathom/fathom/deepq/deepq.py 50 | ``` 51 | 52 | # I found an issue with the Speech model! 53 | 54 | Our implementation requires significant improvement, which we have not yet undertaken for lack of time. 
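As a footnote to the ROM-path question above: a sketch of how the hard-coded relative `ROM_PATH` in `emulator.py` could be made location-independent. This is not the code that currently ships; only the `roms/` directory layout is assumed.

```python
# Hypothetical variant of the ROM_PATH logic in fathom/deepq/emulator.py:
# resolve the ROM directory relative to this file instead of the current
# working directory, so the model can be launched from anywhere.
import os.path

ROM_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'roms')

def rom_file(rom_name):
  """Absolute path of a ROM, e.g. rom_file('breakout.bin')."""
  return os.path.join(ROM_PATH, rom_name)
```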
55 | -------------------------------------------------------------------------------- /fathom/deepq/emulator.py: -------------------------------------------------------------------------------- 1 | # NOTE: Tejas Kulkarni's implementation 2 | import sys 3 | import time 4 | import os.path 5 | 6 | import numpy as np 7 | from ale_python_interface import ALEInterface 8 | import cv2 9 | 10 | ROM_PATH = 'fathom/deepq/roms/' 11 | 12 | class emulator(object): 13 | def __init__(self, rom_name, vis,frameskip=1,windowname='preview'): 14 | self.ale = ALEInterface() 15 | self.max_frames_per_episode = self.ale.getInt("max_num_frames_per_episode"); 16 | self.ale.setInt("random_seed",123) 17 | self.ale.setInt("frame_skip",frameskip) 18 | romfile = str(ROM_PATH)+str(rom_name) 19 | if not os.path.exists(romfile): 20 | print('No ROM file found at "'+romfile+'".\nAdjust ROM_PATH or double-check the filt exists.') 21 | self.ale.loadROM(romfile) 22 | self.legal_actions = self.ale.getMinimalActionSet() 23 | self.action_map = dict() 24 | self.windowname = windowname 25 | for i in range(len(self.legal_actions)): 26 | self.action_map[self.legal_actions[i]] = i 27 | 28 | # print(self.legal_actions) 29 | self.screen_width,self.screen_height = self.ale.getScreenDims() 30 | print("width/height: " +str(self.screen_width) + "/" + str(self.screen_height)) 31 | self.vis = vis 32 | if vis: 33 | cv2.startWindowThread() 34 | cv2.namedWindow(self.windowname, flags=cv2.WINDOW_AUTOSIZE) # permit manual resizing 35 | 36 | def get_image(self): 37 | numpy_surface = np.zeros(self.screen_height*self.screen_width*3, dtype=np.uint8) 38 | self.ale.getScreenRGB(numpy_surface) 39 | image = np.reshape(numpy_surface, (self.screen_height, self.screen_width, 3)) 40 | return image 41 | 42 | def newGame(self): 43 | self.ale.reset_game() 44 | return self.get_image() 45 | 46 | def next(self, action_indx): 47 | reward = self.ale.act(action_indx) 48 | nextstate = self.get_image() 49 | # scipy.misc.imsave('test.png',nextstate) 50 | if self.vis: 51 | cv2.imshow(self.windowname,nextstate) 52 | if sys.platform == 'darwin': 53 | # if we don't do this, can hang on OS X 54 | cv2.waitKey(2) 55 | return nextstate, reward, self.ale.game_over() 56 | 57 | 58 | 59 | if __name__ == "__main__": 60 | engine = emulator('breakout.bin',True) 61 | engine.next(0) 62 | time.sleep(5) 63 | -------------------------------------------------------------------------------- /fathom/deepq/database.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class database(object): 4 | def __init__(self, params): 5 | self.size = params['db_size'] 6 | self.img_scale = params['img_scale'] 7 | self.states = np.zeros([self.size,84,84],dtype='uint8') #image dimensions 8 | self.actions = np.zeros(self.size,dtype='float32') 9 | self.terminals = np.zeros(self.size,dtype='float32') 10 | self.rewards = np.zeros(self.size,dtype='float32') 11 | self.bat_size = params['batch'] 12 | self.bat_s = np.zeros([self.bat_size,84,84,4]) 13 | self.bat_a = np.zeros([self.bat_size]) 14 | self.bat_t = np.zeros([self.bat_size]) 15 | self.bat_n = np.zeros([self.bat_size,84,84,4]) 16 | self.bat_r = np.zeros([self.bat_size]) 17 | 18 | self.counter = 0 #keep track of next empty state 19 | self.flag = False 20 | return 21 | 22 | def get_batches(self): 23 | for i in range(self.bat_size): 24 | idx = 0 25 | while idx < 3 or (idx > self.counter-2 and idx < self.counter+3): 26 | idx = np.random.randint(3,self.get_size()-1) 27 | self.bat_s[i] = 
np.transpose(self.states[idx-3:idx+1,:,:],(1,2,0))/self.img_scale 28 | self.bat_n[i] = np.transpose(self.states[idx-2:idx+2,:,:],(1,2,0))/self.img_scale 29 | self.bat_a[i] = self.actions[idx] 30 | self.bat_t[i] = self.terminals[idx] 31 | self.bat_r[i] = self.rewards[idx] 32 | #self.bat_s[0] = np.transpose(self.states[10:14,:,:],(1,2,0))/self.img_scale 33 | #self.bat_n[0] = np.transpose(self.states[11:15,:,:],(1,2,0))/self.img_scale 34 | #self.bat_a[0] = self.actions[13] 35 | #self.bat_t[0] = self.terminals[13] 36 | #self.bat_r[0] = self.rewards[13] 37 | 38 | return self.bat_s,self.bat_a,self.bat_t,self.bat_n,self.bat_r 39 | 40 | def insert(self, prevstate_proc,reward,action,terminal): 41 | self.states[self.counter] = prevstate_proc 42 | self.rewards[self.counter] = reward 43 | self.actions[self.counter] = action 44 | self.terminals[self.counter] = terminal 45 | #update counter 46 | self.counter += 1 47 | if self.counter >= self.size: 48 | self.flag = True 49 | self.counter = 0 50 | return 51 | 52 | def get_size(self): 53 | if self.flag == False: 54 | return self.counter 55 | else: 56 | return self.size 57 | 58 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![Fathom: Reference Workloads for Modern Deep Learning](https://raw.githubusercontent.com/rdadolf/fathom/master/fathom.png) 2 | 3 | [![build status](https://img.shields.io/badge/build-disabled-lightgray.svg)](https://travis-ci.org/rdadolf/fathom) 4 | [![docs status](https://readthedocs.org/projects/fathom/badge/?version=latest)](http://fathom.readthedocs.io/en/latest/) 5 | 6 | ## Release: [`1.0-rc0`](https://github.com/rdadolf/fathom/releases) 7 | 8 | This release reflects the state of Fathom more or less as it was for the paper published in September 2016. We are currently developing a somewhat more user-friendly version, which you can track in the GitHub issue tracker. If you're eager to use Fathom as it is, please let us know. 9 | 10 | ## Workloads 11 | 12 | This paper contains a description of the workloads, performance characteristics, and the rationale behind the project: 13 | 14 | > R. Adolf, S. Rama, B. Reagen, G.Y. Wei, D. Brooks. "Fathom: Reference Workloads for Modern Deep Learning Methods." 15 | [(Arxiv)](http://arxiv.org/abs/1608.06581) 16 | (DOI) 17 | 18 | Name | Description 19 | -------- | ----- 20 | Seq2Seq | Direct language-to-language sentence translation. State-of-the-art accuracy with a simple, language-agnostic architecture. 21 | MemNet | Facebook's memory-oriented neural system. One of two novel architectures which explore a topology beyond feed-forward lattices of neurons. 22 | Speech | Baidu's speech recognition engine. Proved purely deep-learned networks can beat hand-tuned systems. 23 | Autoenc | Variational autoencoder. An efficient, generative model for feature learning. 24 | Residual | Image classifier from Microsoft Research Asia. Dramatically increased the practical depth of convolutional networks. ILSVRC 2015 winner. 25 | VGG | Image classifier demonstrating the power of small convolutional filters. ILSVRC 2014 winner. 26 | AlexNet | Image classifier. Watershed for deep learning by beating hand-tuned image systems at ILSVRC 2012. 27 | DeepQ | Atari-playing neural network from DeepMind. Achieves superhuman performance on majority of Atari2600 games, without any preconceptions. 
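Every workload above (and its inference-only `*Fwd` variant) exposes the same `setup`/`run`/`teardown` interface. A minimal sketch of driving one from Python, mirroring the `if __name__ == "__main__"` blocks in the model files and assuming the required datasets and native dependencies are already in place:

```python
from fathom import AlexNetFwd          # or Speech, Seq2Seq, DeepQ, ...
from fathom.nn import default_runstep

model = AlexNetFwd()                   # forward-only (inference) variant
model.setup()                          # creates the TF session and starts queue runners
model.run(runstep=default_runstep, n_steps=10)
model.teardown()                       # stops queue runners and closes the session
```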
28 | 29 | ## Getting Started 30 | 31 | Read the [Fathom Quickstart Guide](http://fathom.readthedocs.io/en/latest/quickstart/) and let us know if you have any questions. 32 | 33 | Submit a GitHub issue if you have a suggestion or find a bug. 34 | -------------------------------------------------------------------------------- /fathom/autoenc/variational.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import tensorflow as tf 4 | 5 | import numpy as np 6 | from fathom.nn import default_runstep 7 | from fathom.autoenc.autoenc import xavier_init, AutoencBase 8 | 9 | # heavily based on tensorflow.models.autoencoder 10 | class Autoenc(AutoencBase): 11 | """Variational Autoencoder.""" 12 | def build_inference(self, inputs, transfer_function=tf.nn.softplus, scale=0.1): 13 | with self.G.as_default(): 14 | self.transfer = transfer_function 15 | 16 | self.training_scale = scale 17 | 18 | network_weights = self._initialize_weights() 19 | self.weights = network_weights 20 | 21 | self.z_mean = tf.add(tf.matmul(inputs, self.weights['w1']), self.weights['b1']) 22 | self.z_log_sigma_sq = tf.add(tf.matmul(inputs, self.weights['log_sigma_w1']), self.weights['log_sigma_b1']) 23 | 24 | # sample from gaussian distribution 25 | eps = tf.random_normal(tf.stack([tf.shape(self.xs)[0], self.n_hidden]), 0, 1, dtype = tf.float32) 26 | self.z = tf.add(self.z_mean, tf.multiply(tf.sqrt(tf.exp(self.z_log_sigma_sq)), eps)) 27 | 28 | self.reconstruction = tf.add(tf.matmul(self.z, self.weights['w2']), self.weights['b2']) 29 | 30 | # for unsupervised model, loss is part of testing as well 31 | self.build_loss(self.inputs, self.outputs) 32 | 33 | return self.reconstruction 34 | 35 | def build_loss(self, inputs, reconstruction): 36 | with self.G.as_default(): 37 | # cost 38 | reconstr_loss = 0.5 * tf.reduce_sum(tf.pow(tf.subtract(self.reconstruction, self.xs), 2.0)) 39 | latent_loss = -0.5 * tf.reduce_sum(1 + self.z_log_sigma_sq 40 | - tf.square(self.z_mean) 41 | - tf.exp(self.z_log_sigma_sq), 1) 42 | self.loss_op = tf.reduce_mean(reconstr_loss + latent_loss) 43 | return self.loss_op 44 | 45 | def _initialize_weights(self): 46 | all_weights = dict() 47 | all_weights['w1'] = tf.Variable(xavier_init(self.n_input, self.n_hidden)) 48 | all_weights['log_sigma_w1'] = tf.Variable(xavier_init(self.n_input, self.n_hidden)) 49 | all_weights['b1'] = tf.Variable(tf.zeros([self.n_hidden], dtype=tf.float32)) 50 | all_weights['log_sigma_b1'] = tf.Variable(tf.zeros([self.n_hidden], dtype=tf.float32)) 51 | all_weights['w2'] = tf.Variable(tf.zeros([self.n_hidden, self.n_input], dtype=tf.float32)) 52 | all_weights['b2'] = tf.Variable(tf.zeros([self.n_input], dtype=tf.float32)) 53 | return all_weights 54 | 55 | def generate(self, hidden = None): 56 | if hidden is None: 57 | hidden = np.random.normal(size=self.weights["b1"]) 58 | return self.session.run(self.reconstruction, feed_dict={self.z_mean: hidden}) 59 | 60 | class AutoencFwd(Autoenc): 61 | forward_only = True 62 | 63 | if __name__ == "__main__": 64 | m = Autoenc() 65 | m.setup() 66 | m.run(runstep=default_runstep, n_steps=10) 67 | m.teardown() 68 | -------------------------------------------------------------------------------- /fathom/memnet/data_utils.py: -------------------------------------------------------------------------------- 1 | # Dominique Luna's implementation 2 | 3 | 4 | import os 5 | import re 6 | import numpy as np 7 | 8 | def load_task(data_dir, task_id, only_supporting=False): 9 | '''Load the nth task. 
There are 20 tasks in total. 10 | 11 | Returns a tuple containing the training and testing data for the task. 12 | ''' 13 | assert task_id > 0 and task_id < 21 14 | 15 | files = os.listdir(data_dir) 16 | files = [os.path.join(data_dir, f) for f in files] 17 | s = 'qa{}_'.format(task_id) 18 | train_file = [f for f in files if s in f and 'train' in f][0] 19 | test_file = [f for f in files if s in f and 'test' in f][0] 20 | train_data = get_stories(train_file, only_supporting) 21 | test_data = get_stories(test_file, only_supporting) 22 | print(train_file, test_file) 23 | return train_data, test_data 24 | 25 | def tokenize(sent): 26 | '''Return the tokens of a sentence including punctuation. 27 | >>> tokenize('Bob dropped the apple. Where is the apple?') 28 | ['Bob', 'dropped', 'the', 'apple', '.', 'Where', 'is', 'the', 'apple', '?'] 29 | ''' 30 | return [x.strip() for x in re.split('(\W+)?', sent) if x.strip()] 31 | 32 | 33 | def parse_stories(lines, only_supporting=False): 34 | '''Parse stories provided in the bAbI tasks format 35 | If only_supporting is true, only the sentences that support the answer are kept. 36 | ''' 37 | data = [] 38 | story = [] 39 | for line in lines: 40 | line = str.lower(line) 41 | nid, line = line.split(' ', 1) 42 | nid = int(nid) 43 | if nid == 1: 44 | story = [] 45 | if '\t' in line: # question 46 | q, a, supporting = line.split('\t') 47 | q = tokenize(q) 48 | #a = tokenize(a) 49 | # answer is one vocab word even if it's actually multiple words 50 | a = [a] 51 | substory = None 52 | 53 | # remove question marks 54 | if q[-1] == "?": 55 | q = q[:-1] 56 | 57 | if only_supporting: 58 | # Only select the related substory 59 | supporting = list(map(int, supporting.split())) 60 | substory = [story[i - 1] for i in supporting] 61 | else: 62 | # Provide all the substories 63 | substory = [x for x in story if x] 64 | 65 | data.append((substory, q, a)) 66 | story.append('') 67 | else: # regular sentence 68 | # remove periods 69 | sent = tokenize(line) 70 | if sent[-1] == ".": 71 | sent = sent[:-1] 72 | story.append(sent) 73 | return data 74 | 75 | 76 | def get_stories(f, only_supporting=False): 77 | '''Given a file name, read the file, retrieve the stories, and then convert the sentences into a single story. 78 | If max_length is supplied, any stories longer than max_length tokens will be discarded. 79 | ''' 80 | with open(f) as f: 81 | return parse_stories(f.readlines(), only_supporting=only_supporting) 82 | 83 | def vectorize_data(data, word_idx, sentence_size, memory_size): 84 | """ 85 | Vectorize stories and queries. 86 | 87 | If a sentence length < sentence_size, the sentence will be padded with 0's. 88 | 89 | If a story length < memory_size, the story will be padded with empty memories. 90 | Empty memories are 1-D arrays of length sentence_size filled with 0's. 91 | 92 | The answer array is returned as a one-hot encoding. 
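A toy illustration (hypothetical vocabulary, not taken from the bAbI data):
with word_idx = {'mary': 1, 'went': 2, 'home': 3, 'where': 4, 'is': 5},
data = [([['mary', 'went', 'home']], ['where', 'is', 'mary'], ['home'])],
sentence_size=4 and memory_size=2, the result is
S[0] == [[1, 2, 3, 0], [0, 0, 0, 0]], Q[0] == [4, 5, 1, 0], and
A[0] a length-6 one-hot vector with a 1 at index 3 (the index of 'home').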
93 | """ 94 | S = [] 95 | Q = [] 96 | A = [] 97 | for story, query, answer in data: 98 | ss = [] 99 | for sentence in story: 100 | ls = max(0, sentence_size - len(sentence)) 101 | ss.append([word_idx[w] for w in sentence] + [0] * ls) 102 | 103 | # take only the most recent sentences that fit in memory 104 | ss = ss[::-1][:memory_size][::-1] 105 | 106 | # pad to memory_size 107 | lm = max(0, memory_size - len(ss)) 108 | for _ in range(lm): 109 | ss.append([0] * sentence_size) 110 | 111 | lq = max(0, sentence_size - len(query)) 112 | q = [word_idx[w] for w in query] + [0] * lq 113 | 114 | y = np.zeros(len(word_idx) + 1) # 0 is reserved for nil word 115 | for a in answer: 116 | y[word_idx[a]] = 1 117 | 118 | S.append(ss) 119 | Q.append(q) 120 | A.append(y) 121 | return np.array(S), np.array(Q), np.array(A) 122 | -------------------------------------------------------------------------------- /pylintrc: -------------------------------------------------------------------------------- 1 | # Largely based on the pylint default configuration. 2 | # pylint --generate-rcfile 3 | 4 | [MASTER] 5 | ignore=CVS 6 | persistent=no 7 | load-plugins= 8 | jobs=1 9 | unsafe-load-any-extension=no 10 | extension-pkg-whitelist= 11 | optimize-ast=no 12 | 13 | [MESSAGES CONTROL] 14 | disable=all 15 | # Full list at https://pylint.readthedocs.io/en/latest/features.html 16 | enable= 17 | # 2 spaces. I don't care what PEP8 says. 18 | bad-indentation,mixed-indentation, 19 | # keeps things clean 20 | #unused-import,unused-variable, 21 | # I'm needlessly inconsistent about this. 22 | wrong-import-order, 23 | # No need for old-style classes, and I sometimes rely on new-style ones. So make everything new. 24 | old-style-class, 25 | # There's almost always a better way. 26 | dangerous-default-value, 27 | # I should probably just automatically fix this... 28 | trailing-whitespace, 29 | # Prudent. 30 | #arguments-differ, 31 | 32 | [REPORTS] 33 | # Defaults 34 | output-format=text 35 | files-output=no 36 | reports=yes 37 | evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10) 38 | #msg-template= 39 | 40 | [BASIC] 41 | # List of builtins function names that should not be used 42 | bad-functions=input 43 | # Good/bad variable names which should always/never be accepted 44 | good-names=i,j,k,ex,Run,_ 45 | bad-names= 46 | 47 | function-rgx=[a-z_][a-z0-9_]{2,30}$ 48 | function-name-hint=[a-z_][a-z0-9_]{2,30}$ 49 | variable-rgx=[a-z_][a-z0-9_]{2,30}$ 50 | variable-name-hint=[a-z_][a-z0-9_]{2,30}$ 51 | const-rgx=(([A-Z_][A-Z0-9_]*)|(__.*__))$ 52 | const-name-hint=(([A-Z_][A-Z0-9_]*)|(__.*__))$ 53 | attr-rgx=[a-z_][a-z0-9_]{2,30}$ 54 | attr-name-hint=[a-z_][a-z0-9_]{2,30}$ 55 | argument-rgx=[a-z_][a-z0-9_]{2,30}$ 56 | argument-name-hint=[a-z_][a-z0-9_]{2,30}$ 57 | class-attribute-rgx=([A-Za-z_][A-Za-z0-9_]{2,30}|(__.*__))$ 58 | class-attribute-name-hint=([A-Za-z_][A-Za-z0-9_]{2,30}|(__.*__))$ 59 | inlinevar-rgx=[A-Za-z_][A-Za-z0-9_]*$ 60 | inlinevar-name-hint=[A-Za-z_][A-Za-z0-9_]*$ 61 | class-rgx=[A-Z_][a-zA-Z0-9]+$ 62 | class-name-hint=[A-Z_][a-zA-Z0-9]+$ 63 | module-rgx=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$ 64 | module-name-hint=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$ 65 | method-rgx=[a-z_][a-z0-9_]{2,30}$ 66 | method-name-hint=[a-z_][a-z0-9_]{2,30}$ 67 | no-docstring-rgx=^_ 68 | docstring-min-length=-1 69 | 70 | [ELIF] 71 | # Maximum number of nested blocks for function / method body 72 | max-nested-blocks=5 73 | 74 | [FORMAT] 75 | # Maximum number of characters on a single line. 
76 | max-line-length=80 77 | ignore-long-lines=^\s*(# )??$ 78 | single-line-if-stmt=no 79 | no-space-check=trailing-comma,dict-separator 80 | max-module-lines=1000 81 | indent-string=' ' 82 | # Number of spaces of indent required inside a hanging or continued line. 83 | indent-after-paren=2 84 | expected-line-ending-format= 85 | 86 | [LOGGING] 87 | # Logging modules to check that the string format arguments are in logging 88 | # function parameter format 89 | logging-modules=logging 90 | 91 | [MISCELLANEOUS] 92 | # List of note tags to take in consideration, separated by a comma. 93 | notes=FIXME,XXX,TODO 94 | 95 | [SIMILARITIES] 96 | min-similarity-lines=4 97 | ignore-comments=yes 98 | ignore-docstrings=yes 99 | ignore-imports=no 100 | 101 | [TYPECHECK] 102 | ignore-mixin-members=yes 103 | ignored-modules= 104 | ignored-classes= 105 | generated-members= 106 | 107 | [VARIABLES] 108 | # Tells whether we should check for unused import in __init__ files. 109 | init-import=no 110 | dummy-variables-rgx=_$ 111 | additional-builtins= 112 | callbacks= 113 | 114 | [CLASSES] 115 | defining-attr-methods=__init__,__new__ 116 | valid-classmethod-first-arg=cls 117 | valid-metaclass-classmethod-first-arg=mcs 118 | exclude-protected= 119 | 120 | [DESIGN] 121 | max-args=1000 122 | ignored-argument-names=_.* 123 | max-locals=15 124 | max-returns=6 125 | max-branches=12 126 | max-statements=50 127 | max-parents=7 128 | max-attributes=7 129 | min-public-methods=2 130 | max-public-methods=20 131 | max-bool-expr=5 132 | 133 | [IMPORTS] 134 | deprecated-modules=regsub,TERMIOS,Bastion,rexec 135 | import-graph= 136 | ext-import-graph= 137 | int-import-graph= 138 | 139 | [EXCEPTIONS] 140 | overgeneral-exceptions=Exception 141 | -------------------------------------------------------------------------------- /fathom/imagenet/imagenet.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import tensorflow as tf 4 | 5 | from fathom.nn import NeuralNetworkModel, default_runstep 6 | from fathom.dataset import Dataset 7 | from fathom.imagenet.image_processing import distorted_inputs 8 | 9 | # TODO: don't hard-code this 10 | imagenet_record_dir = '/data/ILSVRC2012/imagenet-tfrecord/' 11 | 12 | class Imagenet(Dataset): 13 | """Design from TensorFlow Inception example.""" 14 | def __init__(self, subset, record_dir=imagenet_record_dir): 15 | super(Imagenet, self).__init__(subset, record_dir) 16 | 17 | def num_classes(self): 18 | return 1000 19 | 20 | def num_examples_per_epoch(self): 21 | # Bounding box data consists of 615299 bounding boxes for 544546 images. 
22 | if self.subset == 'train': 23 | return 1281167 24 | if self.subset == 'validation': 25 | return 50000 26 | 27 | class ImagenetModel(NeuralNetworkModel): 28 | @property 29 | def inputs(self): 30 | return self.images 31 | 32 | @property 33 | def labels(self): 34 | return self._labels 35 | 36 | @property 37 | def outputs(self): 38 | return self.logits 39 | 40 | @property 41 | def loss(self): 42 | return self.loss_op 43 | 44 | @property 45 | def train(self): 46 | return self.train_op 47 | 48 | def build_inputs(self): 49 | with self.G.as_default(): 50 | # TODO: configure image_size in image_processing.py 51 | self.image_size = 224 # side of the square image 52 | self.channels = 3 53 | self.n_input = self.image_size * self.image_size * self.channels 54 | 55 | self.images = tf.placeholder(tf.float32, [None, self.image_size, self.image_size, self.channels]) 56 | 57 | # add queue runners (evaluation dequeues records) 58 | self.dataset = Imagenet('train') 59 | self.batch_images_queue, self.batch_labels_queue = distorted_inputs(self.dataset, batch_size=self.batch_size) 60 | 61 | def build_labels(self): 62 | with self.G.as_default(): 63 | self.n_classes = 1000 + 1 # background class 64 | self._labels = tf.placeholder(tf.int64, [None]) 65 | 66 | def build_evaluation(self): 67 | """Evaluation metrics (e.g., accuracy).""" 68 | self.correct_pred = tf.equal(tf.argmax(self.outputs, 1), self.labels) # TODO: off-by-one? 69 | self.accuracy = tf.reduce_mean(tf.cast(self.correct_pred, tf.float32)) 70 | 71 | def build_hyperparameters(self): 72 | with self.G.as_default(): 73 | self.learning_rate = 0.001 74 | self.training_iters = 200000 75 | self.batch_size = 64 76 | self.display_step = 1 77 | 78 | self.dropout = 0.8 # Dropout, probability to keep units 79 | 80 | # TODO: can this not be a placeholder? 81 | self.keep_prob = tf.placeholder(tf.float32) # dropout (keep probability) 82 | 83 | def build_loss(self, logits, labels): 84 | with self.G.as_default(): 85 | # Define loss 86 | # TODO: does this labels have unexpected state? 87 | self.loss_op = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=labels)) 88 | return self.loss_op 89 | 90 | def build_train(self, total_loss): 91 | with self.G.as_default(): 92 | opt = tf.train.AdamOptimizer(learning_rate=self.learning_rate) 93 | 94 | # Compute and apply gradients. 
95 | #self.train_op = opt.minimize(total_loss, aggregation_method=tf.AggregationMethod.EXPERIMENTAL_ACCUMULATE_N) 96 | self.train_op = opt.minimize(total_loss) 97 | 98 | return self.train_op 99 | 100 | def load_data(self): 101 | # Grab the dataset from the internet, if necessary 102 | self.num_batches_per_epoch = self.dataset.num_examples_per_epoch() / self.batch_size 103 | 104 | def run(self, runstep=default_runstep, n_steps=1): 105 | self.load_data() 106 | 107 | with self.G.as_default(): 108 | # Keep training until reach max iterations 109 | step = 1 110 | while step * self.batch_size < self.training_iters: 111 | if step > n_steps: 112 | return 113 | 114 | # TODO: switch to test 115 | batch_images, batch_labels = self.session.run([self.batch_images_queue, self.batch_labels_queue]) 116 | 117 | print("Queued ImageNet batch.") 118 | 119 | if not self.forward_only: 120 | _, loss_value, acc = runstep( 121 | self.session, 122 | [self.train, self.loss, self.accuracy], 123 | feed_dict={self.images: batch_images, self._labels: batch_labels, self.keep_prob: self.dropout}, 124 | ) 125 | 126 | if step % self.display_step == 0: 127 | print("Iter " + str(step*self.batch_size) + ", Minibatch Loss= " + "{:.6f}".format(loss_value) + ", Training Accuracy= " + "{:.5f}".format(acc)) 128 | else: 129 | _ = runstep( 130 | self.session, 131 | self.outputs, 132 | feed_dict={self.images: batch_images, self._labels: batch_labels, self.keep_prob: 1.}, 133 | ) 134 | 135 | step += 1 136 | 137 | #print "Testing Accuracy:", runstep(self.session, [self.accuracy], feed_dict={self.images: self.mnist.test.images[:256], self._labels: self.mnist.test.labels[:256], self.keep_prob: 1.}) 138 | -------------------------------------------------------------------------------- /fathom/nn.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from abc import ABCMeta, abstractmethod, abstractproperty 4 | import tensorflow as tf 5 | 6 | class GenericModel(object, metaclass=ABCMeta): 7 | def __init__(self, device=None, init_options=None): 8 | self.device=device 9 | 10 | @abstractmethod 11 | def model(self): 12 | 'Return a reference to the native representation of the model.' 13 | pass 14 | def setup(self, setup_options=None): 15 | '(Optional) Prepare the model for running.' 16 | pass 17 | @abstractmethod 18 | def run(self, runstep=None, n_steps=1, *args, **kwargs): 19 | 'Run the model.' 20 | pass 21 | def teardown(self): 22 | '(Optional) Clean up after a model run.' 
23 | pass 24 | 25 | def default_runstep(session, sink_ops, *options, **kw_options): 26 | return session.run(sink_ops, *options, **kw_options) 27 | 28 | 29 | class NeuralNetworkModel(GenericModel, metaclass=ABCMeta): 30 | forward_only = False 31 | 32 | def __init__(self, device=None, init_options=None): 33 | super(NeuralNetworkModel,self).__init__(device=device, init_options=init_options) 34 | 35 | self.G = tf.Graph() 36 | self.session = None 37 | 38 | # e.g., for batch_size 39 | self.init_options = init_options 40 | 41 | with self.G.device(device): 42 | with self.G.as_default(): 43 | self.build() 44 | 45 | with self.G.as_default(): 46 | self.init = tf.global_variables_initializer() 47 | 48 | @abstractmethod 49 | def load_data(self): 50 | """Load dataset (possibly downloading it).""" 51 | pass 52 | 53 | @abstractmethod 54 | def build_inputs(self): 55 | """Construct graph's input placeholders.""" 56 | pass 57 | 58 | @abstractmethod 59 | def build_labels(self): 60 | """Construct graph's label placeholders.""" 61 | pass 62 | 63 | @abstractproperty 64 | def inputs(self): 65 | pass 66 | 67 | @abstractproperty 68 | def labels(self): 69 | pass 70 | 71 | @abstractmethod 72 | def build_hyperparameters(self): 73 | """Set hard-coded hyperparameters.""" 74 | pass 75 | 76 | @abstractproperty 77 | def outputs(self): 78 | """Network outputs before loss function.""" 79 | pass 80 | 81 | @abstractproperty 82 | def loss(self): 83 | """Loss function.""" 84 | pass 85 | 86 | @abstractproperty 87 | def train(self): 88 | """Training/optimization operation.""" 89 | pass 90 | 91 | def build_evaluation(self): 92 | """Evaluation metrics (e.g., accuracy).""" 93 | self.correct_pred = tf.equal(tf.argmax(self.outputs, 1), tf.argmax(self.labels, 1)) 94 | self.accuracy = tf.reduce_mean(tf.cast(self.correct_pred, tf.float32)) 95 | 96 | def build(self): 97 | """Build computation graph.""" 98 | with self.G.as_default(): 99 | self.global_step = tf.Variable(0, trainable=False) 100 | 101 | self.build_hyperparameters() 102 | 103 | self.build_inputs() 104 | self.build_labels() 105 | 106 | self.build_inference(self.inputs) 107 | 108 | if not self.forward_only: 109 | self.build_loss(self.outputs, self.labels) 110 | self.build_train(self.loss_op) 111 | 112 | self.build_evaluation() 113 | 114 | @abstractmethod 115 | def build_inference(self, inputs): 116 | """Build inference. 117 | 118 | Args: 119 | inputs: Images, for example. 120 | 121 | Returns: 122 | Logits. 123 | """ 124 | pass 125 | 126 | @abstractmethod 127 | def build_loss(self, outputs, labels): 128 | """Add loss to trainable variables. 129 | Args: 130 | outputs: Outputs from inference(). 131 | labels: Labels from inputs. 1-D tensor of shape [batch_size]. 132 | 133 | Returns: 134 | Loss tensor of type float. 135 | """ 136 | pass 137 | 138 | @abstractmethod 139 | def build_train(self, total_loss, global_step): 140 | """Train model. 141 | 142 | Create optimizer and apply to all trainable variables. 143 | 144 | Args: 145 | total_loss: Total loss from loss(). 146 | global_step: Integer Variable counting number of training steps processed. 147 | 148 | Returns: 149 | train_op: op for training. 
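Example (a sketch mirroring the concrete subclasses in this repo, e.g. ImagenetModel.build_train):
  opt = tf.train.AdamOptimizer(self.learning_rate)
  self.train_op = opt.minimize(total_loss)
  return self.train_op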
150 | """ 151 | pass 152 | 153 | def model(self): 154 | return self.G 155 | 156 | def setup(self, setup_options=None): 157 | """Make session and launch queue runners.""" 158 | super(NeuralNetworkModel,self).setup(setup_options=setup_options) 159 | with self.G.as_default(): 160 | # Start a new session and initialize the network 161 | if setup_options is not None: 162 | self.session = tf.Session(config=tf.ConfigProto(**setup_options)) 163 | else: 164 | self.session = tf.Session() 165 | # Start the input data loaders 166 | self.coord = tf.train.Coordinator() 167 | self.session.run(self.init) 168 | # Start the input data loaders 169 | self.threads = tf.train.start_queue_runners(sess=self.session,coord=self.coord) 170 | 171 | def teardown(self): 172 | """Close session and join queue runners.""" 173 | self.coord.request_stop() 174 | self.coord.join(self.threads, stop_grace_period_secs=10) 175 | if self.session is not None: 176 | self.session.close() 177 | self.session = None 178 | 179 | -------------------------------------------------------------------------------- /fathom/vgg/vgg.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from math import sqrt 4 | import tensorflow as tf 5 | from fathom.nn import default_runstep 6 | from fathom.imagenet import imagenet 7 | 8 | class VGG(imagenet.ImagenetModel): 9 | """VGG Network.""" 10 | def build_hyperparameters(self): 11 | # TODO: put these into runstep options or somewhere else 12 | # Parameters 13 | self.learning_rate = 0.0001 14 | self.training_iters = 200000 15 | self.batch_size = 8 16 | if self.init_options: 17 | self.batch_size = self.init_options.get('batch_size', self.batch_size) 18 | self.display_step = 1 19 | 20 | if not self.forward_only: 21 | self.dropout = 0.8 # Dropout, probability to keep units 22 | else: 23 | self.dropout = 1. 
24 | 25 | self.keep_prob = tf.placeholder(tf.float32) # dropout (keep probability) 26 | 27 | def build_inference(self, images): 28 | with self.G.as_default(): 29 | # fix dimensions 30 | input_shape = images.get_shape().as_list() 31 | if len(input_shape) == 2: 32 | ndim = int(sqrt(input_shape[1])) 33 | if ndim * ndim != input_shape[1]: 34 | raise ValueError('input_shape should be square') 35 | images = tf.reshape(images, [-1, ndim, ndim, 1]) 36 | 37 | # assume images shape is 224x224x3 38 | 39 | # block 1 -- outputs 112x112x64 40 | conv1_1 = conv_op(images, name="conv1_1", kh=3, kw=3, n_out=64, dh=1, dw=1) 41 | conv1_2 = conv_op(conv1_1, name="conv1_2", kh=3, kw=3, n_out=64, dh=1, dw=1) 42 | pool1 = mpool_op(conv1_2, name="pool1", kh=2, kw=2, dw=2, dh=2) 43 | 44 | # block 2 -- outputs 56x56x128 45 | conv2_1 = conv_op(pool1, name="conv2_1", kh=3, kw=3, n_out=128, dh=1, dw=1) 46 | conv2_2 = conv_op(conv2_1, name="conv2_2", kh=3, kw=3, n_out=128, dh=1, dw=1) 47 | pool2 = mpool_op(conv2_2, name="pool2", kh=2, kw=2, dh=2, dw=2) 48 | 49 | # TODO: VGG pooling in later layers is too aggressive for MNIST 50 | using_imagenet = True 51 | if using_imagenet: 52 | # block 3 -- outputs 28x28x256 53 | conv3_1 = conv_op(pool2, name="conv3_1", kh=3, kw=3, n_out=256, dh=1, dw=1) 54 | conv3_2 = conv_op(conv3_1, name="conv3_2", kh=3, kw=3, n_out=256, dh=1, dw=1) 55 | pool3 = mpool_op(conv3_2, name="pool3", kh=2, kw=2, dh=2, dw=2) 56 | 57 | # block 4 -- outputs 14x14x512 58 | conv4_1 = conv_op(pool3, name="conv4_1", kh=3, kw=3, n_out=512, dh=1, dw=1) 59 | conv4_2 = conv_op(conv4_1, name="conv4_2", kh=3, kw=3, n_out=512, dh=1, dw=1) 60 | conv4_3 = conv_op(conv4_2, name="conv4_2", kh=3, kw=3, n_out=512, dh=1, dw=1) 61 | pool4 = mpool_op(conv4_3, name="pool4", kh=2, kw=2, dh=2, dw=2) 62 | 63 | # block 5 -- outputs 7x7x512 64 | conv5_1 = conv_op(pool4, name="conv5_1", kh=3, kw=3, n_out=512, dh=1, dw=1) 65 | conv5_2 = conv_op(conv5_1, name="conv5_2", kh=3, kw=3, n_out=512, dh=1, dw=1) 66 | conv5_3 = conv_op(conv5_2, name="conv5_3", kh=3, kw=3, n_out=512, dh=1, dw=1) 67 | pool5 = mpool_op(conv5_3, name="pool5", kh=2, kw=2, dw=2, dh=2) 68 | 69 | # flatten 70 | shp = pool5.get_shape().as_list() # pool2 if shrunk 71 | flattened_shape = shp[1] * shp[2] * shp[3] 72 | resh1 = tf.reshape(pool5, [self.batch_size, flattened_shape], name="resh1") 73 | 74 | # fully connected 75 | fc6 = fc_op(resh1, name="fc6", n_out=4096) 76 | fc6_drop = tf.nn.dropout(fc6, self.dropout, name="fc6_drop") 77 | 78 | fc7 = fc_op(fc6_drop, name="fc7", n_out=4096) 79 | fc7_drop = tf.nn.dropout(fc7, self.dropout, name="fc7_drop") 80 | 81 | fc8 = fc_op(fc7_drop, name="fc8", n_out=self.n_classes) 82 | 83 | self.logits = fc8 84 | 85 | return self.logits 86 | 87 | # crudely based on https://github.com/huyng/tensorflow-vgg 88 | # TODO: refactor these utility functions across convnet models to remove dependencies 89 | def conv_op(input_op, name, kw, kh, n_out, dw, dh): 90 | n_in = input_op.get_shape()[-1].value 91 | 92 | with tf.name_scope(name) as scope: 93 | kernel_init_val = tf.truncated_normal([kh, kw, n_in, n_out], dtype=tf.float32, stddev=0.1) 94 | kernel = tf.Variable(kernel_init_val, trainable=True, name='w') 95 | conv = tf.nn.conv2d(input_op, kernel, (1, dh, dw, 1), padding='SAME') 96 | bias_init_val = tf.constant(0.0, shape=[n_out], dtype=tf.float32) 97 | biases = tf.Variable(bias_init_val, trainable=True, name='b') 98 | z = tf.reshape(tf.nn.bias_add(conv, biases), [n_in] + conv.get_shape().as_list()[1:]) 99 | z = tf.nn.bias_add(conv, biases) 100 | 
activation = tf.nn.relu(z, name=scope) 101 | return activation 102 | 103 | def fc_op(input_op, name, n_out): 104 | n_in = input_op.get_shape()[-1].value 105 | 106 | with tf.name_scope(name): 107 | kernel = tf.Variable(tf.truncated_normal([n_in, n_out], dtype=tf.float32, stddev=0.1), name='w') 108 | biases = tf.Variable(tf.constant(0.0, shape=[n_out], dtype=tf.float32), name='b') 109 | activation = tf.nn.relu_layer(input_op, kernel, biases, name=name) 110 | return activation 111 | 112 | def mpool_op(input_op, name, kh, kw, dh, dw): 113 | return tf.nn.max_pool(input_op, 114 | ksize=[1, kh, kw, 1], 115 | strides=[1, dh, dw, 1], 116 | padding='VALID', 117 | name=name) 118 | 119 | class VGGFwd(VGG): 120 | forward_only = True 121 | 122 | if __name__ == "__main__": 123 | m = VGG() 124 | m.setup() 125 | m.run(runstep=default_runstep, n_steps=10) 126 | m.teardown() 127 | -------------------------------------------------------------------------------- /fathom/alexnet/alexnet.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import tensorflow as tf 3 | 4 | from fathom.imagenet import imagenet 5 | from fathom.nn import default_runstep 6 | 7 | def conv2d(name, l_input, w, b): 8 | return tf.nn.relu(tf.nn.bias_add(tf.nn.conv2d(l_input, w, strides=[1, 1, 1, 1], padding='SAME'),b), name=name) 9 | 10 | def max_pool(name, l_input, k): 11 | return tf.nn.max_pool(l_input, ksize=[1, k, k, 1], strides=[1, k, k, 1], padding='SAME', name=name) 12 | 13 | def norm(name, l_input, lsize=4): 14 | return tf.nn.lrn(l_input, lsize, bias=1.0, alpha=0.001 / 9.0, beta=0.75, name=name) 15 | 16 | class AlexNet(imagenet.ImagenetModel): 17 | """Based on Aymeric Damien's TensorFlow example of AlexNet.""" 18 | def build_inference(self, images): 19 | with self.G.as_default(): 20 | # conv1 21 | with tf.name_scope('conv1') as scope: 22 | kernel = tf.Variable(tf.truncated_normal([11, 11, 3, 64], dtype=tf.float32, 23 | stddev=1e-1), name='weights') 24 | conv = tf.nn.conv2d(images, kernel, [1, 4, 4, 1], padding='SAME') 25 | biases = tf.Variable(tf.constant(0.0, shape=[64], dtype=tf.float32), 26 | trainable=True, name='biases') 27 | bias = tf.nn.bias_add(conv, biases) 28 | conv1 = tf.nn.relu(bias, name=scope) 29 | 30 | # pool1 31 | pool1 = tf.nn.max_pool(conv1, 32 | ksize=[1, 3, 3, 1], 33 | strides=[1, 2, 2, 1], 34 | padding='VALID', 35 | name='pool1') 36 | 37 | # TODO: lrn1 38 | lsize = 4 39 | norm1 = tf.nn.lrn(pool1, lsize, bias=1.0, alpha=0.001 / 9.0, beta=0.75) 40 | 41 | # conv2 42 | with tf.name_scope('conv2') as scope: 43 | kernel = tf.Variable(tf.truncated_normal([5, 5, 64, 192], dtype=tf.float32, 44 | stddev=1e-1), name='weights') 45 | conv = tf.nn.conv2d(norm1, kernel, [1, 1, 1, 1], padding='SAME') 46 | biases = tf.Variable(tf.constant(0.0, shape=[192], dtype=tf.float32), 47 | trainable=True, name='biases') 48 | bias = tf.nn.bias_add(conv, biases) 49 | conv2 = tf.nn.relu(bias, name=scope) 50 | 51 | # pool2 52 | pool2 = tf.nn.max_pool(conv2, 53 | ksize=[1, 3, 3, 1], 54 | strides=[1, 2, 2, 1], 55 | padding='VALID', 56 | name='pool2') 57 | 58 | norm2 = tf.nn.lrn(pool2, lsize, bias=1.0, alpha=0.001 / 9.0, beta=0.75) 59 | 60 | # conv3 61 | with tf.name_scope('conv3') as scope: 62 | kernel = tf.Variable(tf.truncated_normal([3, 3, 192, 384], 63 | dtype=tf.float32, 64 | stddev=1e-1), name='weights') 65 | conv = tf.nn.conv2d(norm2, kernel, [1, 1, 1, 1], padding='SAME') 66 | biases = tf.Variable(tf.constant(0.0, shape=[384], dtype=tf.float32), 67 | trainable=True, 
name='biases') 68 | bias = tf.nn.bias_add(conv, biases) 69 | conv3 = tf.nn.relu(bias, name=scope) 70 | 71 | # conv4 72 | with tf.name_scope('conv4') as scope: 73 | kernel = tf.Variable(tf.truncated_normal([3, 3, 384, 256], 74 | dtype=tf.float32, 75 | stddev=1e-1), name='weights') 76 | conv = tf.nn.conv2d(conv3, kernel, [1, 1, 1, 1], padding='SAME') 77 | biases = tf.Variable(tf.constant(0.0, shape=[256], dtype=tf.float32), 78 | trainable=True, name='biases') 79 | bias = tf.nn.bias_add(conv, biases) 80 | conv4 = tf.nn.relu(bias, name=scope) 81 | 82 | # conv5 83 | with tf.name_scope('conv5') as scope: 84 | kernel = tf.Variable(tf.truncated_normal([3, 3, 256, 256], 85 | dtype=tf.float32, 86 | stddev=1e-1), name='weights') 87 | conv = tf.nn.conv2d(conv4, kernel, [1, 1, 1, 1], padding='SAME') 88 | biases = tf.Variable(tf.constant(0.0, shape=[256], dtype=tf.float32), 89 | trainable=True, name='biases') 90 | bias = tf.nn.bias_add(conv, biases) 91 | conv5 = tf.nn.relu(bias, name=scope) 92 | 93 | # pool5 94 | pool5 = tf.nn.max_pool(conv5, 95 | ksize=[1, 3, 3, 1], 96 | strides=[1, 2, 2, 1], 97 | padding='VALID', 98 | name='pool5') 99 | 100 | pool5_shape = pool5.get_shape().as_list() 101 | pool5_length = pool5_shape[1] * pool5_shape[2] * pool5_shape[3] 102 | 103 | wd1 = tf.Variable(tf.random_normal([pool5_length, 4096])) 104 | bd1 = tf.Variable(tf.random_normal([4096])) 105 | 106 | flattened_pool5 = tf.reshape(pool5, [self.batch_size, pool5_length]) 107 | dense1 = tf.nn.relu(tf.nn.xw_plus_b(flattened_pool5, wd1, bd1), name='fc1') 108 | 109 | wd2 = tf.Variable(tf.random_normal([4096, 4096])) 110 | bd2 = tf.Variable(tf.random_normal([4096])) 111 | dense2 = tf.nn.relu(tf.nn.xw_plus_b(dense1, wd2, bd2), name='fc2') 112 | 113 | w_out = tf.Variable(tf.random_normal([4096, self.n_classes])) 114 | b_out = tf.Variable(tf.random_normal([self.n_classes])) 115 | 116 | self.logits = tf.nn.xw_plus_b(dense2, w_out, b_out) 117 | 118 | return self.logits 119 | 120 | def build_hyperparameters(self): 121 | self.learning_rate = 0.001 122 | self.training_iters = 200000 123 | self.batch_size = 64 124 | if self.init_options: 125 | self.batch_size = self.init_options.get('batch_size', self.batch_size) 126 | self.display_step = 1 127 | 128 | self.dropout = 0.8 # Dropout, probability to keep units 129 | 130 | # TODO: can this not be a placeholder? 131 | self.keep_prob = tf.placeholder(tf.float32) # dropout (keep probability) 132 | 133 | class AlexNetFwd(AlexNet): 134 | forward_only = True 135 | 136 | if __name__=='__main__': 137 | m = AlexNet() 138 | m.setup() 139 | m.run(runstep=default_runstep, n_steps=10) 140 | m.teardown() 141 | 142 | -------------------------------------------------------------------------------- /fathom/imagenet/mnist.py: -------------------------------------------------------------------------------- 1 | """Functions for downloading and reading MNIST data. 
2 | 3 | Original Author: Aymeric Damien 4 | https://github.com/aymericdamien/TensorFlow-Examples/ 5 | """ 6 | 7 | # TODO: clean up dataset code 8 | 9 | 10 | import gzip 11 | import os 12 | import urllib.request, urllib.parse, urllib.error 13 | import numpy 14 | SOURCE_URL = 'http://yann.lecun.com/exdb/mnist/' 15 | def maybe_download(filename, work_directory): 16 | """Download the data from Yann's website, unless it's already here.""" 17 | if not os.path.exists(work_directory): 18 | os.mkdir(work_directory) 19 | filepath = os.path.join(work_directory, filename) 20 | if not os.path.exists(filepath): 21 | filepath, _ = urllib.request.urlretrieve(SOURCE_URL + filename, filepath) 22 | statinfo = os.stat(filepath) 23 | print('Succesfully downloaded', filename, statinfo.st_size, 'bytes.') 24 | return filepath 25 | def _read32(bytestream): 26 | dt = numpy.dtype(numpy.uint32).newbyteorder('>') 27 | return numpy.frombuffer(bytestream.read(4), dtype=dt)[0] 28 | def extract_images(filename): 29 | """Extract the images into a 4D uint8 numpy array [index, y, x, depth].""" 30 | print('Extracting', filename) 31 | with gzip.open(filename) as bytestream: 32 | magic = _read32(bytestream) 33 | if magic != 2051: 34 | raise ValueError( 35 | 'Invalid magic number %d in MNIST image file: %s' % 36 | (magic, filename)) 37 | num_images = _read32(bytestream) 38 | rows = _read32(bytestream) 39 | cols = _read32(bytestream) 40 | buf = bytestream.read(rows * cols * num_images) 41 | data = numpy.frombuffer(buf, dtype=numpy.uint8) 42 | data = data.reshape(num_images, rows, cols, 1) 43 | return data 44 | def dense_to_one_hot(labels_dense, num_classes=10): 45 | """Convert class labels from scalars to one-hot vectors.""" 46 | num_labels = labels_dense.shape[0] 47 | index_offset = numpy.arange(num_labels) * num_classes 48 | labels_one_hot = numpy.zeros((num_labels, num_classes)) 49 | labels_one_hot.flat[index_offset + labels_dense.ravel()] = 1 50 | return labels_one_hot 51 | def extract_labels(filename, one_hot=False): 52 | """Extract the labels into a 1D uint8 numpy array [index].""" 53 | print('Extracting', filename) 54 | with gzip.open(filename) as bytestream: 55 | magic = _read32(bytestream) 56 | if magic != 2049: 57 | raise ValueError( 58 | 'Invalid magic number %d in MNIST label file: %s' % 59 | (magic, filename)) 60 | num_items = _read32(bytestream) 61 | buf = bytestream.read(num_items) 62 | labels = numpy.frombuffer(buf, dtype=numpy.uint8) 63 | if one_hot: 64 | return dense_to_one_hot(labels) 65 | return labels 66 | class DataSet(object): 67 | def __init__(self, images, labels, fake_data=False): 68 | if fake_data: 69 | self._num_examples = 10000 70 | else: 71 | assert images.shape[0] == labels.shape[0], ( 72 | "images.shape: %s labels.shape: %s" % (images.shape, 73 | labels.shape)) 74 | self._num_examples = images.shape[0] 75 | # Convert shape from [num examples, rows, columns, depth] 76 | # to [num examples, rows*columns] (assuming depth == 1) 77 | assert images.shape[3] == 1 78 | images = images.reshape(images.shape[0], 79 | images.shape[1] * images.shape[2]) 80 | # Convert from [0, 255] -> [0.0, 1.0]. 
81 | images = images.astype(numpy.float32) 82 | images = numpy.multiply(images, 1.0 / 255.0) 83 | self._images = images 84 | self._labels = labels 85 | self._epochs_completed = 0 86 | self._index_in_epoch = 0 87 | @property 88 | def images(self): 89 | return self._images 90 | @property 91 | def labels(self): 92 | return self._labels 93 | @property 94 | def num_examples(self): 95 | return self._num_examples 96 | @property 97 | def epochs_completed(self): 98 | return self._epochs_completed 99 | def next_batch(self, batch_size, fake_data=False): 100 | """Return the next `batch_size` examples from this data set.""" 101 | if fake_data: 102 | fake_image = [1.0 for _ in range(784)] 103 | fake_label = 0 104 | return [fake_image for _ in range(batch_size)], [ 105 | fake_label for _ in range(batch_size)] 106 | start = self._index_in_epoch 107 | self._index_in_epoch += batch_size 108 | if self._index_in_epoch > self._num_examples: 109 | # Finished epoch 110 | self._epochs_completed += 1 111 | # Shuffle the data 112 | perm = numpy.arange(self._num_examples) 113 | numpy.random.shuffle(perm) 114 | self._images = self._images[perm] 115 | self._labels = self._labels[perm] 116 | # Start next epoch 117 | start = 0 118 | self._index_in_epoch = batch_size 119 | assert batch_size <= self._num_examples 120 | end = self._index_in_epoch 121 | return self._images[start:end], self._labels[start:end] 122 | def read_data_sets(train_dir, fake_data=False, one_hot=False): 123 | class DataSets(object): 124 | pass 125 | data_sets = DataSets() 126 | if fake_data: 127 | data_sets.train = DataSet([], [], fake_data=True) 128 | data_sets.validation = DataSet([], [], fake_data=True) 129 | data_sets.test = DataSet([], [], fake_data=True) 130 | return data_sets 131 | TRAIN_IMAGES = 'train-images-idx3-ubyte.gz' 132 | TRAIN_LABELS = 'train-labels-idx1-ubyte.gz' 133 | TEST_IMAGES = 't10k-images-idx3-ubyte.gz' 134 | TEST_LABELS = 't10k-labels-idx1-ubyte.gz' 135 | VALIDATION_SIZE = 5000 136 | local_file = maybe_download(TRAIN_IMAGES, train_dir) 137 | train_images = extract_images(local_file) 138 | local_file = maybe_download(TRAIN_LABELS, train_dir) 139 | train_labels = extract_labels(local_file, one_hot=one_hot) 140 | local_file = maybe_download(TEST_IMAGES, train_dir) 141 | test_images = extract_images(local_file) 142 | local_file = maybe_download(TEST_LABELS, train_dir) 143 | test_labels = extract_labels(local_file, one_hot=one_hot) 144 | validation_images = train_images[:VALIDATION_SIZE] 145 | validation_labels = train_labels[:VALIDATION_SIZE] 146 | train_images = train_images[VALIDATION_SIZE:] 147 | train_labels = train_labels[VALIDATION_SIZE:] 148 | data_sets.train = DataSet(train_images, train_labels) 149 | data_sets.validation = DataSet(validation_images, validation_labels) 150 | data_sets.test = DataSet(test_images, test_labels) 151 | return data_sets 152 | -------------------------------------------------------------------------------- /fathom/residual/residual.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from math import sqrt 4 | from collections import namedtuple 5 | import tensorflow as tf 6 | from fathom.nn import default_runstep 7 | from fathom.imagenet import imagenet 8 | 9 | # Code heavily based on Parag Mital's TensorFlow tutorials. 
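# Architecture note (summarizing the code below): `blocks` defines four groups of
# bottleneck residual units. Each unit is a 1x1 convolution down to `bottleneck_size`
# channels, a 3x3 convolution, and a 1x1 convolution back up to `num_filters` channels,
# added to the identity (skip) path. A 1x1 "upscale" convolution between groups widens
# the channel count for the next block.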
10 | class Residual(imagenet.ImagenetModel): 11 | """Residual Network.""" 12 | def build_hyperparameters(self): 13 | # Parameters 14 | self.learning_rate = 0.01 15 | self.training_iters = 200000 16 | self.batch_size = 16 17 | if self.init_options: 18 | self.batch_size = self.init_options.get('batch_size', self.batch_size) 19 | self.display_step = 1 20 | 21 | self.dropout = 0.8 # Dropout, probability to keep units 22 | self.keep_prob = tf.placeholder(tf.float32) # dropout (keep probability) 23 | 24 | def build_inference(self, images): 25 | with self.G.as_default(): 26 | LayerBlock = namedtuple( 27 | 'LayerBlock', ['num_repeats', 'num_filters', 'bottleneck_size']) 28 | blocks = [ 29 | LayerBlock(3, 128, 32), 30 | LayerBlock(3, 256, 64), 31 | LayerBlock(3, 512, 128), 32 | LayerBlock(3, 1024, 256) 33 | ] 34 | 35 | # %% 36 | input_shape = images.get_shape().as_list() 37 | if len(input_shape) == 2: 38 | ndim = int(sqrt(input_shape[1])) 39 | if ndim * ndim != input_shape[1]: 40 | raise ValueError('input_shape should be square') 41 | images = tf.reshape(images, [-1, ndim, ndim, 1]) 42 | 43 | # %% 44 | # First convolution expands to 64 channels and downsamples 45 | net = conv2d(images, 64, k_h=7, k_w=7, 46 | name='conv1', 47 | activation=tf.nn.relu) 48 | 49 | # %% 50 | # Max pool and downsampling 51 | net = tf.nn.max_pool( 52 | net, [1, 3, 3, 1], strides=[1, 2, 2, 1], padding='SAME') 53 | 54 | # %% 55 | # Setup first chain of resnets 56 | net = conv2d(net, blocks[0].num_filters, k_h=1, k_w=1, 57 | stride_h=1, stride_w=1, padding='VALID', name='conv2') 58 | 59 | # %% 60 | # Loop through all res blocks 61 | for block_i, block in enumerate(blocks): 62 | for repeat_i in range(block.num_repeats): 63 | name = 'block_%d/repeat_%d' % (block_i, repeat_i) 64 | conv = conv2d(net, block.bottleneck_size, k_h=1, k_w=1, 65 | padding='VALID', stride_h=1, stride_w=1, 66 | activation=tf.nn.relu, 67 | name=name + '/conv_in') 68 | 69 | conv = conv2d(conv, block.bottleneck_size, k_h=3, k_w=3, 70 | padding='SAME', stride_h=1, stride_w=1, 71 | activation=tf.nn.relu, 72 | name=name + '/conv_bottleneck') 73 | 74 | conv = conv2d(conv, block.num_filters, k_h=1, k_w=1, 75 | padding='VALID', stride_h=1, stride_w=1, 76 | activation=tf.nn.relu, 77 | name=name + '/conv_out') 78 | 79 | net = conv + net 80 | 81 | try: 82 | # upscale to the next block size 83 | next_block = blocks[block_i + 1] 84 | net = conv2d(net, next_block.num_filters, k_h=1, k_w=1, 85 | padding='SAME', stride_h=1, stride_w=1, bias=False, 86 | name='block_%d/conv_upscale' % block_i) 87 | except IndexError: 88 | pass 89 | 90 | # %% 91 | net = tf.nn.avg_pool(net, 92 | ksize=[1, net.get_shape().as_list()[1], 93 | net.get_shape().as_list()[2], 1], 94 | strides=[1, 1, 1, 1], padding='VALID') 95 | net = tf.reshape( 96 | net, 97 | [-1, net.get_shape().as_list()[1] * 98 | net.get_shape().as_list()[2] * 99 | net.get_shape().as_list()[3]]) 100 | 101 | self.logits = linear(net, self.n_classes, activation=tf.identity) 102 | 103 | # %% 104 | return self.logits 105 | 106 | def conv2d(x, n_filters, 107 | k_h=5, k_w=5, 108 | stride_h=2, stride_w=2, 109 | stddev=0.02, 110 | activation=lambda x: x, 111 | bias=True, 112 | padding='SAME', 113 | name="Conv2D"): 114 | """2D Convolution with options for kernel size, stride, and init deviation. 115 | Parameters 116 | ---------- 117 | x : Tensor 118 | Input tensor to convolve. 119 | n_filters : int 120 | Number of filters to apply. 121 | k_h : int, optional 122 | Kernel height. 123 | k_w : int, optional 124 | Kernel width. 
125 | stride_h : int, optional 126 | Stride in rows. 127 | stride_w : int, optional 128 | Stride in cols. 129 | stddev : float, optional 130 | Initialization's standard deviation. 131 | activation : arguments, optional 132 | Function which applies a nonlinearity 133 | padding : str, optional 134 | 'SAME' or 'VALID' 135 | name : str, optional 136 | Variable scope to use. 137 | Returns 138 | ------- 139 | x : Tensor 140 | Convolved input. 141 | """ 142 | with tf.variable_scope(name): 143 | w = tf.get_variable( 144 | 'w', [k_h, k_w, x.get_shape()[-1], n_filters], 145 | initializer=tf.truncated_normal_initializer(stddev=stddev)) 146 | conv = tf.nn.conv2d(x, w, strides=[1, stride_h, stride_w, 1], padding=padding) 147 | if bias: 148 | b = tf.get_variable( 149 | 'b', [n_filters], 150 | initializer=tf.truncated_normal_initializer(stddev=stddev)) 151 | 152 | conv = conv + b 153 | return activation(conv) 154 | 155 | def linear(x, n_units, scope=None, stddev=0.02, 156 | activation=lambda x: x): 157 | """Fully-connected network. 158 | Parameters 159 | ---------- 160 | x : Tensor 161 | Input tensor to the network. 162 | n_units : int 163 | Number of units to connect to. 164 | scope : str, optional 165 | Variable scope to use. 166 | stddev : float, optional 167 | Initialization's standard deviation. 168 | activation : arguments, optional 169 | Function which applies a nonlinearity 170 | Returns 171 | ------- 172 | x : Tensor 173 | Fully-connected output. 174 | """ 175 | shape = x.get_shape().as_list() 176 | 177 | with tf.variable_scope(scope or "Linear"): 178 | matrix = tf.get_variable("Matrix", [shape[1], n_units], tf.float32, 179 | tf.random_normal_initializer(stddev=stddev)) 180 | return activation(tf.matmul(x, matrix)) 181 | 182 | class ResidualFwd(Residual): 183 | forward_only = True 184 | 185 | if __name__ == "__main__": 186 | m = Residual() 187 | m.setup() 188 | m.run(runstep=default_runstep, n_steps=10) 189 | m.teardown() 190 | -------------------------------------------------------------------------------- /fathom/autoenc/autoenc.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from math import sqrt 4 | 5 | import tensorflow as tf 6 | import numpy as np 7 | import sklearn.preprocessing as prep 8 | 9 | from fathom.nn import NeuralNetworkModel, default_runstep 10 | import fathom.imagenet.mnist as input_data 11 | 12 | # TODO: create an unsupervised parent class 13 | 14 | def standard_scale(X_train, X_test): 15 | preprocessor = prep.StandardScaler().fit(X_train) 16 | X_train = preprocessor.transform(X_train) 17 | X_test = preprocessor.transform(X_test) 18 | return X_train, X_test 19 | 20 | # heavily based on tensorflow.models.autoencoder 21 | class AutoencBase(NeuralNetworkModel): 22 | """Basic Autoencoder (denoising optional).""" 23 | def load_data(self): 24 | # Grab the dataset from the internet, if necessary 25 | self.mnist = input_data.read_data_sets("/tmp/data/", one_hot=True) 26 | self.X_train, self.X_test = standard_scale(self.mnist.train.images, self.mnist.test.images) 27 | 28 | def build_hyperparameters(self): 29 | # Parameters 30 | self.learning_rate = 0.001 31 | self.batch_size = 128 32 | if self.init_options: 33 | self.batch_size = self.init_options.get('batch_size', self.batch_size) 34 | self.display_step = 1 35 | 36 | # Network Parameters 37 | self.n_hidden = 200 38 | 39 | # TODO: remove this data-specific stuff 40 | self.n_input = 784 # MNIST data input (img shape: 28*28) 41 | 42 | if not self.forward_only: 43 | 
self.scale = tf.placeholder(tf.float32) 44 | #self.keep_prob = tf.placeholder(tf.float32) # dropout (keep probability) 45 | 46 | def build_inputs(self): 47 | # tf Graph input 48 | self.xs = tf.placeholder(tf.float32, [None, self.n_input]) 49 | 50 | @property 51 | def inputs(self): 52 | return self.xs 53 | 54 | @property 55 | def outputs(self): 56 | return self.reconstruction 57 | 58 | # TODO: remove labels methods upon creating unsupervised parent class 59 | def build_labels(self): 60 | # inputs are the ground truth 61 | pass 62 | 63 | @property 64 | def labels(self): 65 | # inputs are the ground truth 66 | return self.inputs 67 | 68 | def run(self, runstep=None, n_steps=1): 69 | self.load_data() 70 | 71 | with self.G.as_default(): 72 | # %% We'll train in minibatches and report accuracy: 73 | self.epochs = 20 74 | self.display_step = 1 75 | 76 | if self.forward_only: 77 | self.epochs = 1 78 | 79 | for epoch in range(self.epochs): 80 | # TODO: re-enable options and metadata, which slow down the run 81 | 82 | total_batch = self.mnist.train.num_examples // self.batch_size 83 | 84 | avg_cost = 0 85 | for batch_i in range(total_batch): 86 | if batch_i >= n_steps: 87 | break 88 | #batch_xs = self.mnist.train.next_batch(self.batch_size) 89 | batch_xs = get_random_block_from_data(self.X_train, self.batch_size) 90 | 91 | # TODO: summary nodes 92 | 93 | if not self.forward_only: 94 | # train on batch 95 | _, loss_value = runstep( 96 | self.session, 97 | [self.train, self.loss], 98 | feed_dict={self.xs: batch_xs, self.scale: self.training_scale}, 99 | #options=run_options, run_metadata=values 100 | ) 101 | else: 102 | # run forward on train batch 103 | _ = runstep( 104 | self.session, 105 | self.outputs, 106 | feed_dict={self.xs: batch_xs} 107 | ) 108 | 109 | if not self.forward_only: 110 | avg_cost += loss_value * self.mnist.train.num_examples * self.batch_size 111 | 112 | if epoch % self.display_step == 0: 113 | print('epoch:', epoch, 'cost:', avg_cost) 114 | 115 | print("Total cost:", self.calc_total_cost(self.X_test)) 116 | 117 | def noisy_input(self, inputs, scale, dist=tf.random_normal): 118 | """Add scaled noise to input for denoising autoencoder.""" 119 | with self.G.as_default(): 120 | return inputs + scale * dist((self.n_input,)) 121 | 122 | def build_inference(self, inputs, transfer_function=tf.nn.softplus, scale=0.1, denoising=True): 123 | with self.G.as_default(): 124 | self.transfer = transfer_function 125 | 126 | self.training_scale = scale 127 | 128 | network_weights = self._initialize_weights() 129 | self.weights = network_weights 130 | 131 | if denoising and not self.forward_only: 132 | # add white noise to the input so the autoencoder learns to reconstruct from noise 133 | self.hidden = self.transfer( 134 | tf.matmul(self.noisy_input(self.xs, self.scale), self.weights['w1']) + self.weights['b1']) 135 | else: 136 | # learn to reconstruct the input alone 137 | self.hidden = self.transfer(tf.add(tf.matmul(self.xs, self.weights['w1']), self.weights['b1'])) 138 | 139 | self.reconstruction = tf.add(tf.matmul(self.hidden, self.weights['w2']), self.weights['b2']) 140 | 141 | # for an autoencoder, the cost/loss is not just part of training 142 | self.build_loss(self.inputs, self.reconstruction) 143 | 144 | return self.reconstruction 145 | 146 | def build_loss(self, inputs, reconstruction): 147 | with self.G.as_default(): 148 | self.loss_op = 0.5 * tf.reduce_sum(tf.pow(tf.subtract(reconstruction, inputs), 2.0)) 149 | return self.loss_op 150 | 151 | @property 152 | def loss(self): 153 | 
return self.loss_op 154 | 155 | def build_train(self, total_loss): 156 | with self.G.as_default(): 157 | opt = tf.train.AdamOptimizer() 158 | 159 | # Compute and apply gradients. 160 | self.train_op = opt.minimize(total_loss)#, global_step) 161 | 162 | return self.train_op 163 | 164 | @property 165 | def train(self): 166 | return self.train_op 167 | 168 | def _initialize_weights(self): 169 | all_weights = dict() 170 | all_weights['w1'] = tf.Variable(xavier_init(self.n_input, self.n_hidden)) 171 | all_weights['b1'] = tf.Variable(tf.zeros([self.n_hidden], dtype=tf.float32)) 172 | all_weights['w2'] = tf.Variable(tf.zeros([self.n_hidden, self.n_input], dtype=tf.float32)) 173 | all_weights['b2'] = tf.Variable(tf.zeros([self.n_input], dtype=tf.float32)) 174 | return all_weights 175 | 176 | def calc_total_cost(self, X): 177 | return self.session.run(self.loss, feed_dict = {self.xs: X, self.scale: self.training_scale}) 178 | 179 | def transform(self, X): 180 | return self.session.run(self.hidden, feed_dict={self.xs: X, self.scale: self.training_scale}) 181 | 182 | def generate(self, hidden = None): 183 | if hidden is None: 184 | hidden = np.random.normal(size=self.weights["b1"]) 185 | return self.session.run(self.reconstruction, feed_dict={self.hidden: hidden}) 186 | 187 | def reconstruct(self, X): 188 | return self.session.run(self.reconstruction, feed_dict={self.xs: X, self.scale: self.training_scale}) 189 | 190 | def xavier_init(fan_in, fan_out, constant = 1): 191 | low = -constant * sqrt(6.0 / (fan_in + fan_out)) 192 | high = constant * sqrt(6.0 / (fan_in + fan_out)) 193 | return tf.random_uniform((fan_in, fan_out), 194 | minval = low, maxval = high, 195 | dtype = tf.float32) 196 | 197 | def get_random_block_from_data(data, batch_size): 198 | start_index = np.random.randint(0, len(data) - batch_size) 199 | return data[start_index:(start_index + batch_size)] 200 | 201 | class AutoencBaseFwd(AutoencBase): 202 | forward_only = True 203 | 204 | if __name__ == "__main__": 205 | m = AutoencBase() 206 | m.setup() 207 | m.run(runstep=default_runstep) 208 | m.teardown() 209 | -------------------------------------------------------------------------------- /docs/quickstart.md: -------------------------------------------------------------------------------- 1 | # Installing Prerequisites 2 | 3 | Fathom requires a fair number of other software packages to use. TensorFlow is the obvious dependency, but there are a number of other support libraries which are mostly used for data processing and ingest. Deep learning algorithms operate on real data, so many of them have to do a substantial amount of work to turn raw inputs into a form they can process efficiently. 4 | 5 | ## TensorFlow 6 | 7 | - Python 2.6+ 8 | - TensorFlow 1.x+ (artifact for paper required [TensorFlow 0.8.0rc0](https://github.com/tensorflow/tensorflow/releases/tag/v0.8.0rc0)) 9 | 10 | For TensorFlow, you can either download a pre-built binary or build from source. The latter is more involved, but can allow more flexibility in configuration (i.e.- you can pass specific options to the underlying math libraries which can affect performance). 11 | 12 | To build from source, you'll also need Bazel, Google's build system. Instructions can be found in the TensorFlow documentation. 13 | 14 | The TensorFlow API is rapidly changing, and so it is possible for Fathom to break in small ways on unintended versions of TensorFlow. 
These issues tend to be about package imports and renaming rather than fundamentally devastating differences, so feel free to submit pull requests if you fix them on your own. 15 | 16 | ## Supporting libraries 17 | 18 | Fathom needs several other Python packages as well, mostly for pre-processing inputs. For all of these, you have your choice of methods for installing them: 19 | 20 | - `apt-get` (or your favorite Linux distribution's package manager): This is a quick route, but be careful of versioning. Distributions sometimes lag a fair ways behind in version numbers. 21 | - `pip`: the preferred package installer for Python 22 | - `conda`: If you're using an Anaconda distribution of python, this is probably your best bet for numpy, scipy, and scikit-learn. You'll need to use `pip` for librosa and tqdm, though (as Continuum doesn't support these packages). 23 | 24 | You'll want to install the following list of packages. (You may have several of them installed already, and you shouldn't need to re-install—Fathom doesn't use any fancy features). 25 | 26 | - numpy (most models) 27 | - scipy (for scikit-learn) 28 | - scikit-learn ([MemNet](/models/#memnet), [Speech](/models/#speech), [Autoenc](/models/#autoenc)) 29 | - six ([Seq2Seq](/models/#seq2seq)) 30 | - librosa ([Speech](/models/#speech)) 31 | 32 | - h5py* ([Speech](/models/#speech)) 33 | 34 | *For h5py, you'll also need libhdf5, the underlying C/C++ library for interfacing with HDF5-formatted files. This is usually available as a Linux package, but [building from source](https://support.hdfgroup.org/downloads/index.html) is also fine. Any recent version should work. In Ubuntu, the package you're looking for is `libhdf5-dev`. 35 | 36 | ## Atari emulation 37 | 38 | [DeepQ](/models/#deepq) requires a bit more support than the other models. This is largely because it interacts directly with a running Atari emulator. Consequently, you'll need both the emulator itself and OpenCV to run it. 39 | 40 | The [Arcade Learning Environment (ALE)](http://www.arcadelearningenvironment.org/) is a clean, two-way interface between machine learning models and an Atari 2600 emulator. Installation instructions can be found in the [ALE Manual](https://github.com/mgbellemare/Arcade-Learning-Environment/raw/master/doc/manual/manual.pdf), but boil down to two steps: building the ALE C++ backend and installing the python wrapper. 41 | 42 | 43 | [OpenCV](http://opencv.org/) is a collection of image processing and computational geometry functions designed to support computer vision. You'll need both a 2.x version of the backend library and the python interface wrapper. Many Linux distributions have a package for both (Ubuntu's are `libopencv-dev` and `python-opencv`), but you can also [build from source](http://docs.opencv.org/2.4.13/doc/tutorials/introduction/linux_install/linux_install.html) and then use `pip` to install the `opencv-python` wrapper. 44 | 45 | # Alternative: Quickstart via Docker 46 | 47 | If you don't need accurate performance numbers right away, we also provide a pre-built [Docker image](https://hub.docker.com/r/rdadolf/fathom/) to make it easy to get familiar with the Fathom workloads. 48 | 49 | If you're not familiar with Docker, you can think of it as a lightweight virtualization layer, similar to a VM but at a higher level of abstraction. Installation instructions can be found on the [docker website](https://www.docker.com/). 
To run the Fathom image interactively, use this: 50 | 51 | ```sh 52 | docker run -it rdadolf/fathom 53 | ``` 54 | 55 | The image will be downloaded from Docker Hub and launched automatically, and you'll be given a shell prompt with the environment already set up. 56 | 57 | # Downloading Data 58 | 59 | *
Documentation in progress
* 60 | 61 | Fathom does not come with datasets suitable for training. This is a combination of size (realistic training sets are often massive) and licensing (an oft-repeated mantra is that good data is more valuable than a good model). 62 | 63 | Regardless, the inputs Fathom is designed for are standard and widely available: 64 | 65 | - [ImageNet](http://www.image-net.org/download-images) - requires registration, but downloads are free for non-commercial purposes. 66 | - [WMT15](http://www.statmt.org/wmt15/translation-task.html) - freely available online, and automatically downloaded by Fathom. 67 | - [bAbI](https://research.facebook.com/research/babi/) - freely available online. 68 | - [MNIST](http://yann.lecun.com/exdb/mnist/) - freely available online, and automatically downloaded by Fathom. 69 | - [TIMIT](https://catalog.ldc.upenn.edu/ldc93s1) - requires membership in the Linguistic Data Consortium (this is not free, but it is widely available in the research community). 70 | - Atari "Breakout" ROM - Technically not freely available. In practice, it is [available online](https://www.google.com/search?q=atari+breakout+rom). You can also legally obtain this by dumping the memory of an Atari 2600 running a copy of Breakout you bought. 71 | 72 | We eventually want to provide synthetic datasets that would allow users to run Fathom out of the box without requiring the above downloads. 73 | 74 | # Running the Workloads 75 | 76 | Fathom is a Python library with command-line shims. To use Fathom, you'll need to tell your Python installation where to find it. The easiest way is to adjust your `PYTHONPATH` environment variable: 77 | 78 | ```sh 79 | $ git clone https://github.com/rdadolf/fathom.git 80 | $ export PYTHONPATH=`pwd`/fathom 81 | ``` 82 | 83 | Once you've done that, you can either run the models directly (using the command-line shims): 84 | 85 | ```sh 86 | $ cd fathom 87 | $ ./fathom/seq2seq/seq2seq.py 88 | ``` 89 | 90 | or you can use Fathom as a Python library directly in your scripts: 91 | 92 | ```python 93 | from fathom import Seq2Seq 94 | model = Seq2Seq() 95 | model.setup() 96 | model.run() 97 | ``` 98 | 99 | ## ImageNet 100 | 101 | The ImageNet workflow is finicky to train, and the parameters and optimizations we have included do not reflect the state of the art (e.g., batch normalization). Several users have reported issues with running the training flow out of the box, and we are currently working on resolving these issues. 102 | 103 | If you do not want to download and set up ImageNet, you can switch to using MNIST as provided in `fathom/imagenet/mnist.py`. Some of the models (e.g., VGG) may require modification because their convolutional kernels compress the smaller MNIST images too much. 104 | 105 | ## DeepQ 106 | 107 | Note: [DeepQ](/models/#deepq) currently looks for its ROMs relative to Fathom's root directory. In practice, this will cause problems if you don't run it from that directory. We are working on a more general configuration interface, but in the meantime, you should feel free to modify the `ROM_PATH` variable in `fathom/deepq/emulator.py`. 
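For example, a minimal setup might look like the sketch below. The ROM filename (`breakout.bin`) and the drop-in directory (`fathom/deepq/roms/`) are assumptions for illustration only; check the `ROM_PATH` variable in `fathom/deepq/emulator.py` for the exact path and filename your checkout expects.

```sh
# Hypothetical example: place a legally-obtained Breakout ROM where DeepQ looks for it,
# then launch the workload from Fathom's root directory so the relative path resolves.
$ cp ~/roms/breakout.bin fathom/deepq/roms/
$ ./fathom/deepq/deepq.py
```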
108 | 109 | -------------------------------------------------------------------------------- /fathom/memnet/memnet.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """Dominique Luna's implementation of End-to-End Memory Networks, refactored.""" 4 | 5 | from functools import reduce 6 | from itertools import chain 7 | 8 | import tensorflow as tf 9 | import numpy as np 10 | 11 | from sklearn import cross_validation 12 | from fathom.nn import NeuralNetworkModel, default_runstep 13 | from .data_utils import load_task, vectorize_data 14 | 15 | data_dir = "/data/babi/tasks_1-20_v1-2/en/" 16 | task_id = 1 17 | 18 | class MemNet(NeuralNetworkModel): 19 | def build_inference(self, inputs): 20 | with self.G.as_default(): 21 | self.encoding_op = tf.constant(self.encoding(self.sentence_size, self.embedding_size), name="encoding") 22 | 23 | # variables 24 | #with tf.variable_scope(self.name): 25 | nil_word_slot = tf.zeros([1, self.embedding_size]) 26 | A = tf.concat(axis=0, values=[ nil_word_slot, self.initializer([self.vocab_size-1, self.embedding_size]) ]) 27 | B = tf.concat(axis=0, values=[ nil_word_slot, self.initializer([self.vocab_size-1, self.embedding_size]) ]) 28 | self.A = tf.Variable(A, name="A") 29 | self.B = tf.Variable(B, name="B") 30 | 31 | self.TA = tf.Variable(self.initializer([self.memory_size, self.embedding_size]), name='TA') 32 | 33 | self.H = tf.Variable(self.initializer([self.embedding_size, self.embedding_size]), name="H") 34 | self.W = tf.Variable(self.initializer([self.embedding_size, self.vocab_size]), name="W") 35 | 36 | #with tf.variable_scope(self.name): 37 | q_emb = tf.nn.embedding_lookup(self.B, self.queries) 38 | u_0 = tf.reduce_sum(q_emb * self.encoding_op, 1) 39 | u = [u_0] 40 | m_emb = tf.nn.embedding_lookup(self.A, self.stories) 41 | m = tf.reduce_sum(m_emb * self.encoding_op, 2) + self.TA 42 | 43 | # hop 44 | for hop_number in range(self.hops): 45 | with tf.name_scope('Hop_'+str(hop_number)): 46 | u_temp = tf.transpose(tf.expand_dims(u[-1], -1), [0, 2, 1]) 47 | dotted = tf.reduce_sum(m * u_temp, 2) 48 | 49 | # Calculate probabilities 50 | probs = tf.nn.softmax(dotted) 51 | 52 | probs_temp = tf.transpose(tf.expand_dims(probs, -1), [0, 2, 1]) 53 | c_temp = tf.transpose(m, [0, 2, 1]) 54 | o_k = tf.reduce_sum(c_temp * probs_temp, 2) 55 | 56 | u_k = tf.matmul(u[-1], self.H) + o_k 57 | 58 | # nonlinearity 59 | if self.nonlin: 60 | u_k = nonlin(u_k) 61 | 62 | u.append(u_k) 63 | 64 | self.nil_vars = set([self.A.name, self.B.name]) 65 | 66 | self._outputs = tf.matmul(u_k, self.W) 67 | 68 | return self._outputs 69 | 70 | @property 71 | def outputs(self): 72 | return self._outputs 73 | 74 | def build_loss(self, logits, labels): 75 | with self.G.as_default(): 76 | with tf.name_scope('loss'): 77 | # Define loss 78 | # TODO: does this labels have unexpected state? 
79 | self.loss_op = tf.reduce_sum(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=tf.cast(labels, tf.float32))) 80 | return self.loss_op 81 | 82 | @property 83 | def loss(self): 84 | return self.loss_op 85 | 86 | def build_train(self, total_loss): 87 | with self.G.as_default(): 88 | self.opt = tf.train.AdamOptimizer(learning_rate=self.learning_rate) 89 | 90 | # can't use opt.minimize because we need to clip the gradients 91 | grads_and_vars = self.opt.compute_gradients(self.loss) 92 | grads_and_vars = [(tf.clip_by_norm(g, self.max_grad_norm), v) for g,v in grads_and_vars] 93 | grads_and_vars = [(add_gradient_noise(g), v) for g,v in grads_and_vars] 94 | nil_grads_and_vars = [] 95 | for g, v in grads_and_vars: 96 | if v.name in self.nil_vars: 97 | nil_grads_and_vars.append((zero_nil_slot(g), v)) 98 | else: 99 | nil_grads_and_vars.append((g, v)) 100 | 101 | self.train_op = self.opt.apply_gradients(nil_grads_and_vars, name="train_op") 102 | 103 | return self.train_op 104 | @property 105 | def train(self): 106 | return self.train_op 107 | 108 | def load_data(self): 109 | # single babi task 110 | # TODO: refactor all this running elsewhere 111 | # task data 112 | train, test = load_task(data_dir, task_id) 113 | 114 | vocab = sorted(reduce(lambda x, y: x | y, (set(list(chain.from_iterable(s)) + q + a) for s, q, a in train + test))) 115 | word_idx = dict((c, i + 1) for i, c in enumerate(vocab)) 116 | 117 | self.memory_size = 50 118 | 119 | self.max_story_size = max(list(map(len, (s for s, _, _ in train + test)))) 120 | self.mean_story_size = int(np.mean(list(map(len, (s for s, _, _ in train + test))))) 121 | self.sentence_size = max(list(map(len, chain.from_iterable(s for s, _, _ in train + test)))) 122 | self.query_size = max(list(map(len, (q for _, q, _ in train + test)))) 123 | self.memory_size = min(self.memory_size, self.max_story_size) 124 | self.vocab_size = len(word_idx) + 1 # +1 for nil word 125 | self.sentence_size = max(self.query_size, self.sentence_size) # for the position 126 | 127 | print("Longest sentence length", self.sentence_size) 128 | print("Longest story length", self.max_story_size) 129 | print("Average story length", self.mean_story_size) 130 | 131 | # train/validation/test sets 132 | self.S, self.Q, self.A = vectorize_data(train, word_idx, self.sentence_size, self.memory_size) 133 | self.trainS, self.valS, self.trainQ, self.valQ, self.trainA, self.valA = cross_validation.train_test_split(self.S, self.Q, self.A, test_size=.1) # TODO: randomstate 134 | self.testS, self.testQ, self.testA = vectorize_data(test, word_idx, self.sentence_size, self.memory_size) 135 | 136 | print(self.testS[0]) 137 | 138 | print("Training set shape", self.trainS.shape) 139 | 140 | # params 141 | self.n_train = self.trainS.shape[0] 142 | self.n_test = self.testS.shape[0] 143 | self.n_val = self.valS.shape[0] 144 | 145 | print("Training Size", self.n_train) 146 | print("Validation Size", self.n_val) 147 | print("Testing Size", self.n_test) 148 | 149 | def build_hyperparameters(self): 150 | with self.G.as_default(): 151 | # TODO: put these into runstep options or somewhere else 152 | # Parameters 153 | self.learning_rate = 0.01 154 | self.batch_size = 32 155 | if self.init_options: 156 | self.batch_size = self.init_options.get('batch_size', self.batch_size) 157 | self.embedding_size = 20 158 | self.hops = 3 159 | self.max_grad_norm = 40.0 160 | self.nonlin = None 161 | self.encoding = position_encoding 162 | self.display_step = 10 163 | 164 | def build_inputs(self): 165 | 
self.load_data() # TODO: get static numbers for the things that currently require loading and move this to run 166 | 167 | with self.G.as_default(): 168 | # inputs 169 | self.stories = tf.placeholder(tf.int32, [None, self.memory_size, self.sentence_size], name="stories") 170 | self.queries = tf.placeholder(tf.int32, [None, self.sentence_size], name="queries") 171 | 172 | self.initializer = tf.random_normal_initializer(stddev=0.1) 173 | 174 | @property 175 | def inputs(self): 176 | return self.stories, self.queries 177 | 178 | def build_labels(self): 179 | with self.G.as_default(): 180 | self.answers = tf.placeholder(tf.int32, [None, self.vocab_size], name="answers") 181 | 182 | @property 183 | def labels(self): 184 | return self.answers 185 | 186 | def run(self, runstep=None, n_steps=1): 187 | # load babi data 188 | # vocab, memory, sentence sizes set here 189 | # TODO: get static data size numbers and don't load in inputs anymore 190 | #self.load_data() 191 | #tf.set_random_seed(random_state) 192 | 193 | start = 0 194 | assert self.batch_sizeself.n_train: 219 | start,end = 0,self.batch_size 220 | else: 221 | start,end = end,end+self.batch_size 222 | 223 | acc = self.session.run( 224 | self.accuracy, 225 | feed_dict={self.stories: self.testS, self.queries: self.testQ, self.answers: self.testA} 226 | ) 227 | 228 | print("Test accuracy: {:.5f}".format(acc)) 229 | 230 | def position_encoding(sentence_size, embedding_size): 231 | """ 232 | Position Encoding described in section 4.1 [1] 233 | """ 234 | encoding = np.ones((embedding_size, sentence_size), dtype=np.float32) 235 | ls = sentence_size+1 236 | le = embedding_size+1 237 | for i in range(1, le): 238 | for j in range(1, ls): 239 | encoding[i-1, j-1] = (i - (le-1)/2) * (j - (ls-1)/2) 240 | encoding = 1 + 4 * encoding / embedding_size / sentence_size 241 | return np.transpose(encoding) 242 | 243 | def zero_nil_slot(t, name=None): 244 | """ 245 | Overwrites the nil_slot (first row) of the input Tensor with zeros. 246 | The nil_slot is a dummy slot and should not be trained and influence 247 | the training algorithm. 248 | """ 249 | with tf.name_scope(values=[t], name=name, default_name="zero_nil_slot") as name: 250 | t = tf.convert_to_tensor(t, name="t") 251 | s = tf.shape(t)[1] 252 | z = tf.zeros(tf.stack([1, s])) 253 | return tf.concat(axis=0, values=[z, tf.slice(t, [1, 0], [-1, -1])], name=name) 254 | 255 | def add_gradient_noise(t, stddev=1e-3, name=None): 256 | """ 257 | Adds gradient noise as described in http://arxiv.org/abs/1511.06807 [2]. 258 | The input Tensor `t` should be a gradient. 259 | The output will be `t` + gaussian noise. 260 | 0.001 was said to be a good fixed value for memory networks [2]. 
261 | """ 262 | with tf.name_scope(values=[t, stddev], name=name, default_name="add_gradient_noise") as name: 263 | t = tf.convert_to_tensor(t, name="t") 264 | gn = tf.random_normal(tf.shape(t), stddev=stddev) 265 | return tf.add(t, gn, name=name) 266 | 267 | class MemNetFwd(MemNet): 268 | forward_only = True 269 | 270 | if __name__=='__main__': 271 | m = MemNet() 272 | m.setup() 273 | m.run(runstep=default_runstep, n_steps=100) 274 | m.teardown() 275 | 276 | -------------------------------------------------------------------------------- /fathom/speech/speech.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import numpy as np 4 | import tensorflow as tf 5 | 6 | #from tensorflow.models.rnn import rnn, rnn_cell 7 | from tensorflow.python.ops import functional_ops 8 | from tensorflow.python.ops import variable_scope as vs 9 | from tensorflow.contrib.rnn.python.ops.rnn_cell import _linear 10 | 11 | from fathom.nn import NeuralNetworkModel, default_runstep 12 | 13 | from .preproc import load_timit, timit_hdf5_filepath 14 | from .phoneme import index2phoneme_dict 15 | 16 | 17 | def clipped_relu(inputs, clip=20): 18 | """Similar to tf.nn.relu6, but can clip at 20 as in Deep Speech.""" 19 | return tf.minimum(tf.nn.relu(inputs), clip) 20 | 21 | 22 | class ClippedReluRNNCell(tf.contrib.rnn.RNNCell): 23 | """Basic RNN cell with clipped ReLU rather than tanh activation.""" 24 | 25 | def __init__(self, num_units, input_size=None): 26 | self._num_units = num_units 27 | 28 | @property 29 | def state_size(self): 30 | return self._num_units 31 | 32 | @property 33 | def output_size(self): 34 | return self._num_units 35 | 36 | def __call__(self, inputs, state, scope=None): 37 | """Basic RNN: output = new_state = clipped_relu(W * input + U * state + B).""" 38 | with vs.variable_scope(scope or type(self).__name__): 39 | output = clipped_relu(_linear([inputs, state], self._num_units, True)) 40 | return output, output 41 | 42 | 43 | # TODO: show label error rate 44 | # TODO: avoid labels and blank off-by-one error due to padding zeros 45 | class Speech(NeuralNetworkModel): 46 | """RNN for speech recognition.""" 47 | def __init__(self, device=None, init_options=None): 48 | super(Speech,self).__init__(device=device, init_options=init_options) 49 | 50 | #def inference(self, inputs, n_hidden=2048): 51 | def build_inference(self, inputs, n_hidden=1024): 52 | with self.G.as_default(): 53 | self.n_hidden = n_hidden 54 | 55 | # Architecture of Deep Speech [Hannun et al. 
2014] 56 | outputs_1 = self.mlp_layer(inputs, self.n_coeffs, self.n_hidden) 57 | outputs_2 = self.mlp_layer(outputs_1, self.n_hidden, self.n_hidden) 58 | outputs_3 = self.mlp_layer(outputs_2, self.n_hidden, self.n_hidden) 59 | outputs_4 = self.bidirectional_layer(outputs_3, n_input=self.n_hidden, n_hidden=self.n_hidden, n_output=self.n_hidden) 60 | outputs_5 = self.mlp_layer(outputs_3, self.n_hidden, self.n_labels) 61 | 62 | self._outputs = outputs_5 63 | 64 | # transpose in preparation for CTC loss 65 | self.logits_t = tf.transpose(self._outputs, perm=[1,0,2]) 66 | 67 | return outputs_5 68 | 69 | @property 70 | def outputs(self): 71 | return self._outputs 72 | 73 | @property 74 | def loss(self): 75 | return self.loss_op 76 | 77 | def build_loss(self, logits, labels): 78 | with self.G.as_default(): 79 | # NOTE: CTC does the softmax for us, according to the code 80 | 81 | # CTC loss requires sparse labels 82 | self.sparse_labels = self.ctc_label_dense_to_sparse(self.labels, self.seq_lens) 83 | 84 | # CTC 85 | self.loss_op = tf.nn.ctc_loss( 86 | inputs=self.logits_t, 87 | labels=self.sparse_labels, 88 | sequence_length=self.seq_lens 89 | ) 90 | 91 | return self.loss_op 92 | 93 | def build_train(self, loss): 94 | # TODO: buckets 95 | with self.G.as_default(): 96 | self.train_op = tf.train.GradientDescentOptimizer(learning_rate=0.01).minimize(loss) 97 | return self.train_op 98 | 99 | @property 100 | def train(self): 101 | return self.train_op 102 | 103 | def mlp_layer(self, inputs, n_input, n_output): 104 | with self.G.as_default(): 105 | # layer sees inputs as (batch_size, max_time, n_input) 106 | W = tf.Variable(tf.zeros([n_input, n_output])) 107 | b = tf.Variable(tf.zeros([n_output])) 108 | 109 | W_batch_multiples = tf.constant([self.batch_size, 1, 1], dtype=tf.int32) 110 | W_batch = tf.tile(tf.expand_dims(W, 0), W_batch_multiples) 111 | 112 | # TODO: is tiling a bias vector over batch and frames correct? 113 | b_batch_multiples = tf.constant([self.batch_size, self.max_frames, 1], dtype=tf.int32) 114 | b_batch = tf.tile(tf.expand_dims(tf.expand_dims(b, 0), 0), b_batch_multiples) 115 | 116 | # TODO: change batch_matmul to an averaging reshape so that batching happens and dimensions are easier 117 | outputs = tf.add(tf.matmul(inputs, W_batch), b_batch) 118 | 119 | return clipped_relu(outputs) 120 | 121 | def bidirectional_layer(self, inputs, n_input, n_hidden, n_output): 122 | """Bidirectional RNN layer.""" 123 | with self.G.as_default(): 124 | fw_cell = ClippedReluRNNCell(n_hidden) 125 | bw_cell = ClippedReluRNNCell(n_hidden) 126 | 127 | # input shape: (batch_size, max_time, n_input) 128 | inputs = tf.transpose(inputs, perm=[1, 0, 2]) # permute max_time and batch_size 129 | inputs = tf.reshape(inputs, [-1, n_input]) # (max_time*batch_size, n_input) 130 | 131 | inputs = tf.split(axis=0, num_or_size_splits=self.max_frames, value=inputs) # max_time * (batch_size, n_hidden) 132 | 133 | # optional initial states 134 | istate_fw = tf.placeholder("float", [None, n_hidden]) 135 | istate_bw = tf.placeholder("float", [None, n_hidden]) 136 | 137 | # TODO: support both tanh (default) and clipped_relu 138 | outputs, _, _ = tf.contrib.rnn.static_bidirectional_rnn(fw_cell, bw_cell, inputs, initial_state_fw=istate_fw, initial_state_bw=istate_bw) 139 | 140 | # TODO: is this the right output? 
141 | return outputs[-1] 142 | 143 | def ctc_label_dense_to_sparse( self, labels, label_lengths ): 144 | """Mike Henry's implementation, with some minor modifications.""" 145 | with self.G.as_default(): 146 | label_shape = tf.shape( labels ) 147 | num_batches_tns = tf.stack( [label_shape[0]] ) 148 | max_num_labels_tns = tf.stack( [label_shape[1]] ) 149 | 150 | def range_less_than(previous_state, current_input): 151 | return tf.expand_dims( tf.range( label_shape[1] ), 0 ) < current_input 152 | 153 | init = tf.cast( tf.fill( max_num_labels_tns, 0 ), tf.bool ) 154 | init = tf.expand_dims( init, 0 ) 155 | dense_mask = functional_ops.scan(range_less_than, label_lengths , initializer=init, parallel_iterations=1) 156 | dense_mask = dense_mask[ :, 0, : ] 157 | 158 | label_array = tf.reshape( tf.tile( tf.range( 0, label_shape[1] ), num_batches_tns ), label_shape ) 159 | label_ind = tf.boolean_mask( label_array, dense_mask ) 160 | 161 | batch_array = tf.transpose( tf.reshape( tf.tile( tf.range( 0, label_shape[0] ), max_num_labels_tns ), tf.reverse( label_shape,[0]) ) ) 162 | batch_ind = tf.boolean_mask( batch_array, dense_mask ) 163 | 164 | indices = tf.transpose( tf.reshape( tf.concat( axis=0, values=[batch_ind, label_ind] ), [2,-1] ) ) 165 | vals_sparse = tf.gather_nd( labels, indices ) 166 | return tf.SparseTensor( tf.to_int64(indices), vals_sparse, tf.to_int64( label_shape ) ) 167 | 168 | def build_hyperparameters(self): 169 | self.n_labels = 61 + 1 # add blank 170 | self.max_frames = 1566 # TODO: compute dynamically 171 | self.max_labels = 75 172 | self.n_coeffs = 26 173 | self.batch_size = 32 174 | if self.init_options: 175 | self.batch_size = self.init_options.get('batch_size', self.batch_size) 176 | 177 | def build_inputs(self): 178 | with self.G.as_default(): 179 | # NOTE: ctc_loss requires a transpose 180 | # tf.transpose(inputs,perm=[1,0,2]) 181 | self._inputs = tf.placeholder(tf.float32, [None, self.max_frames, self.n_coeffs], name="inputs") 182 | 183 | @property 184 | def inputs(self): 185 | return self._inputs 186 | 187 | def build_labels(self): 188 | with self.G.as_default(): 189 | self._labels = tf.placeholder(tf.int32, [None, self.max_labels], name="labels") 190 | self.seq_lens = tf.placeholder(tf.int32, [None], name="seq_lens") 191 | 192 | @property 193 | def labels(self): 194 | return self._labels 195 | 196 | def build(self): 197 | super(Speech, self).build() 198 | 199 | with self.G.as_default(): 200 | self.decode_op = self.decoding() 201 | 202 | def load_data(self): 203 | self.train_spectrograms, self.train_labels, self.train_seq_lens = load_timit(timit_hdf5_filepath, train=True) 204 | # TODO: load test 205 | 206 | def get_random_batch(self): 207 | """Get random batch from np.arrays (not tf.train.shuffle_batch).""" 208 | n_examples = self.train_spectrograms.shape[0] 209 | random_sample = np.random.randint(n_examples, size=self.batch_size) 210 | return self.train_spectrograms[random_sample, :, :], self.train_labels[random_sample, :], self.train_seq_lens[random_sample] 211 | 212 | def decoding(self): 213 | """Predict labels from learned sequence model.""" 214 | # TODO: label error rate on validation set 215 | decoded, _ = tf.nn.ctc_greedy_decoder(self.logits_t, self.seq_lens) 216 | sparse_decode_op = decoded[0] # single-element list 217 | self.decode_op = tf.sparse_to_dense(sparse_decode_op.indices, sparse_decode_op.dense_shape, sparse_decode_op.values) 218 | return self.decode_op 219 | 220 | def run(self, runstep=None, n_steps=1, *args, **kwargs): 221 | print("Loading spectrogram 
features...") 222 | self.load_data() 223 | 224 | with self.G.as_default(): 225 | print('Starting run...') 226 | for _ in range(n_steps): 227 | spectrogram_batch, label_batch, seq_len_batch = self.get_random_batch() 228 | 229 | if not self.forward_only: 230 | _, _ = runstep(self.session, 231 | [self.train_op, self.loss_op], 232 | feed_dict={self.inputs: spectrogram_batch, self.labels: label_batch, self.seq_lens: seq_len_batch}) 233 | else: 234 | # run forward-only on train batch 235 | _ = runstep(self.session, 236 | self.outputs, 237 | feed_dict={self.inputs: spectrogram_batch, self.labels: label_batch, self.seq_lens: seq_len_batch}) 238 | 239 | # decode the same batch, for debugging 240 | decoded = self.session.run(self.decode_op, 241 | feed_dict={self.inputs: spectrogram_batch, self.labels: label_batch, self.seq_lens: seq_len_batch}) 242 | 243 | # print some decoded examples 244 | if False: 245 | print(' '.join(self.labels2phonemes(decoded[0]))) 246 | # TODO: fix dtypes in dataset (labels are accidentally floats right now) 247 | print(' '.join(self.labels2phonemes(np.array(label_batch[0,:], dtype=np.int32)))) 248 | 249 | def labels2phonemes(self, decoded_labels): 250 | """Convert a list of label indices to a list of corresponding phonemes.""" 251 | return [index2phoneme_dict[label] for label in decoded_labels] 252 | 253 | class SpeechFwd(Speech): 254 | forward_only = True 255 | 256 | if __name__=='__main__': 257 | m = Speech() 258 | m.setup() 259 | m.run(runstep=default_runstep, n_steps=10) 260 | m.teardown() 261 | -------------------------------------------------------------------------------- /fathom/speech/preproc.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Convert TIMIT audio files into spectral coefficients. 4 | """ 5 | 6 | import logging 7 | import os 8 | import fnmatch 9 | 10 | import numpy as np 11 | import h5py 12 | import librosa 13 | import sklearn.preprocessing 14 | 15 | from .phoneme import timit_phonemes, phoneme2index_list, phoneme2index_dict 16 | 17 | # global config: load from previous saved dataset if True, else recompute 18 | load_features = False 19 | 20 | # TODO: configurable path to /data/speech/timit/ 21 | timit_dir = '/data/speech/timit/TIMIT/' 22 | timit_hdf5_filepath = '/data/speech/timit/timit.hdf5' 23 | 24 | train_name, test_name = 'train', 'test' 25 | train_dir = os.path.join(timit_dir, train_name.upper()) 26 | test_dir = os.path.join(timit_dir, test_name.upper()) 27 | 28 | 29 | # simple logging 30 | logger = logging.getLogger('TIMIT') 31 | logger.setLevel(logging.INFO) 32 | 33 | ch = logging.StreamHandler() 34 | ch.setLevel(logging.DEBUG) 35 | formatter = logging.Formatter('%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p') 36 | ch.setFormatter(formatter) 37 | logger.addHandler(ch) 38 | 39 | 40 | def recursive_glob_ext(dirpath, ext): 41 | """Recursively find files with an extension in a TIMIT directory.""" 42 | return [os.path.splitext(os.path.join(dirpath, filename))[0] # remove extension 43 | for dirpath, _, files in os.walk(dirpath) 44 | for filename in fnmatch.filter(files, '*.{}'.format(ext))] 45 | 46 | 47 | def mfcc_features(filename): 48 | """Preprocessing per CTC paper. 49 | 50 | (These are not the simpler linear spectrogram features alone as in Deep 51 | Speech). 
52 | 53 | Properties: 54 | - 10ms frames with 5ms overlap 55 | - 12 MFCCs with 26 filter banks 56 | - replace first MFCC with energy (TODO: log-energy) 57 | - add first-order derivatives for all of the above 58 | - total: 26 coefficients 59 | """ 60 | d, sr = librosa.load(filename) 61 | 62 | frame_length_seconds = 0.010 63 | frame_overlap_seconds = 0.005 64 | 65 | mfccs = librosa.feature.mfcc(d, sr, n_mfcc=1+12, n_fft=int(frame_overlap_seconds*sr), hop_length=int(frame_overlap_seconds*sr)) 66 | 67 | # energy (TODO: log?) 68 | energy = librosa.feature.rmse(d, n_fft=int(frame_overlap_seconds*sr), hop_length=int(frame_overlap_seconds*sr)) 69 | 70 | mfccs[0] = energy # replace first MFCC with energy, per convention 71 | 72 | deltas = librosa.feature.delta(mfccs, order=1) 73 | mfccs_plus_deltas = np.vstack([mfccs, deltas]) 74 | 75 | coeffs = sklearn.preprocessing.scale(mfccs_plus_deltas, axis=1) 76 | 77 | return coeffs 78 | 79 | 80 | def dirpath2dataset(dirpath): 81 | """Convert a TIMIT dirpath to a dataset. 82 | 83 | The filename alone is not unique. 84 | 85 | e.g., TIMIT/TRAIN/DR8/MMPM0/SX251.WAV => MMPM0/SX251.WAV 86 | """ 87 | if not '/' in dirpath: 88 | raise Exception("not a valid TIMIT dirpath") 89 | 90 | dataset_name = '/'.join(dirpath.split('/')[-2:]) 91 | return dataset_name 92 | 93 | 94 | def phoneme_transcription(phoneme_filename): 95 | phoneme_column = -1 96 | # we can discard the first two columns, which provide the time alignment 97 | transcription = [line.split()[phoneme_column].strip() for line in open(phoneme_filename)] 98 | return transcription 99 | 100 | 101 | def verify_phonemes(timit_phoneme_set, transcription_phoneme_set): 102 | """Make sure every pre-specified phoneme was seen in data, and the converse.""" 103 | for phoneme in transcription_phoneme_set: 104 | if phoneme not in timit_phoneme_set: 105 | logger.error(phoneme + ' not in TIMIT phonemes') 106 | 107 | for phoneme in timit_phoneme_set: 108 | if phoneme not in transcription_phoneme_set: 109 | logger.error(phoneme + ' not in transcribed phonemes') 110 | 111 | 112 | def compute_spectrograms(audio_filenames): 113 | """Extract spectrogram features from each audio file.""" 114 | features_list = [] 115 | audio_ext = ".WAV" 116 | 117 | for audio_basename in audio_filenames: 118 | # recompute spectrogram features 119 | # FIXME: on interrupt, kill the thread which librosa launches via audioread 120 | feature_vector = mfcc_features(audio_basename + audio_ext) 121 | features_list.append(feature_vector) 122 | 123 | return features_list 124 | 125 | 126 | def load_precomputed_spectrograms(filepath): 127 | """Load precomputed spectrogram features to save time.""" 128 | features_list = [] 129 | # TODO: this HDF5 group structure is outdated, recompute and save a new one 130 | with h5py.File(filepath, 'r') as hf: 131 | for g in hf['utterances']: 132 | for dataset in hf['utterances'][g]: 133 | data = np.array(hf['utterances'][g][dataset]) 134 | features_list.append(data) 135 | 136 | return features_list 137 | 138 | 139 | def load_timit(filepath, train=True): 140 | # TODO: load test also 141 | with h5py.File(filepath, 'r') as hf: 142 | train_spectrograms = np.array(hf['timit']['train']['spectrograms']) 143 | train_labels = np.array(hf['timit']['train']['labels']) 144 | train_seq_lens = np.array(hf['timit']['train']['seq_lens']) 145 | 146 | return train_spectrograms, train_labels, train_seq_lens 147 | 148 | 149 | def save_feature_dataset(audio_filenames, spectrograms, seq_lens, phoneme2index_list, labels, filepath, overwrite=False): 
150 | """Save computed features for TIMIT. 151 | 152 | Args: 153 | - maps from subset kinds 'train' and 'test' to corresponding data: 154 | - audio_filenames: list of basepaths to TIMIT examples 155 | - spectrograms: np.array((n_examples, max_frames, n_coeffs)) 156 | - n_examples: number of TIMIT examples (e.g., train=4206) 157 | - max_frames: the most frames in any example 158 | - n_coeffs: number of spectrogram features (e.g., 26 with 12 MFCCs, one 159 | energy, and their 13 deltas) 160 | - seq_lens: number of labels in each target sequence (<= max_labels) 161 | - labels: np.array((n_examples, max_labels)) 162 | - max_labels: the most labels in any example (e.g., train=75) 163 | - phoneme2index_list: a map from phoneme strings (e.g., 'sh') to indices, 164 | ordered as in TIMIT PHONCODE.DOC 165 | """ 166 | if overwrite: 167 | file_mode = 'w' 168 | else: 169 | file_mode = 'w-' # fail if file exists 170 | 171 | with h5py.File(filepath, file_mode) as hf: 172 | timit = hf.create_group('timit') 173 | 174 | train_name = 'train' 175 | test_name = 'test' 176 | 177 | train = timit.create_group(train_name) 178 | test = timit.create_group(test_name) 179 | 180 | for subset_kind, subset_dataset in [(train_name, train), (test_name, test)]: 181 | # (n_examples,) 182 | subset_dataset.create_dataset('example_paths', dtype="S100", data=np.array(audio_filenames[subset_kind])) 183 | 184 | # (n_examples, max_frames, n_coeffs) 185 | subset_dataset.create_dataset('spectrograms', data=spectrograms[subset_kind]) 186 | 187 | # (n_examples,) 188 | subset_dataset.create_dataset('seq_lens', data=seq_lens[subset_kind]) 189 | 190 | # (n_examples, max_labels) 191 | label_dataset = subset_dataset.create_dataset('labels', data=labels[subset_kind]) 192 | 193 | # store phoneme <-> index mapping in HDF5 attributes to avoid numpy structured arrays 194 | # indices are per order in TIMIT phoncode.doc 195 | for phoneme, index in phoneme2index_list: 196 | label_dataset.attrs[phoneme] = index 197 | 198 | # NOTE: because we don't use '1' and '2' as TIMIT phonemes, there 199 | # shouldn't be any collisions with the indices '1' and '2' when we put 200 | # both into the same dict as strings 201 | label_dataset.attrs[str(index)] = phoneme 202 | 203 | 204 | def index_labels(phoneme2index_dict, timit_transcriptions, max_labels): 205 | """Convert TIMIT transcriptions to integer np.array of indices.""" 206 | labels = np.empty((n_examples, max_labels)) 207 | seq_lens = np.empty((n_examples,)) 208 | for i, transcription in enumerate(timit_transcriptions): 209 | index_transcription = [phoneme2index_dict[phoneme] for phoneme in transcription] 210 | labels[i,:len(transcription)] = index_transcription 211 | seq_lens[i] = len(index_transcription) 212 | 213 | return labels, seq_lens 214 | 215 | 216 | def build_spectrogram_array(features_list, n_examples, max_frames, n_coeffs): 217 | """Convert list of ragged spectrograms to np.array with list of lens.""" 218 | spectrograms = np.empty((n_examples, max_frames, n_coeffs)) 219 | 220 | for i, feature_vector in enumerate(features_list): 221 | example_frames = feature_vector.shape[1] 222 | spectrograms[i,:example_frames,:] = feature_vector.T 223 | 224 | return spectrograms 225 | 226 | 227 | def load_transcriptions(audio_filenames): 228 | """Load list of phoneme transcriptions. 229 | 230 | Each phoneme transcription is a list of phonemes without time alignments. 
231 |   """
232 |   phoneme_ext = ".PHN"
233 |   transcriptions = []
234 |   for audio_basename in tqdm(audio_filenames):
235 |     # obtain list of phonemes, discarding time-alignment
236 |     tr = phoneme_transcription(audio_basename + phoneme_ext)
237 |     transcriptions.append(tr)
238 | 
239 |   return transcriptions
240 | 
241 | 
242 | def phoneme_set(transcriptions):
243 |   """Reduce list of lists of phonemes to a set of phonemes."""
244 |   transcription_phonemes = set()
245 |   for transcription in transcriptions:
246 |     for phoneme in transcription:
247 |       transcription_phonemes.add(phoneme)
248 | 
249 |   return transcription_phonemes
250 | 
251 | 
252 | if __name__ == "__main__":
253 |   logger.info("Starting to preprocess TIMIT audio data.")
254 |   logger.info("Walking TIMIT data directory...")
255 | 
256 |   audio_filenames = {}
257 |   spectrograms = {}
258 |   seq_lens = {}
259 |   labels = {}
260 | 
261 |   for subset_kind, subset_dir in [(train_name, train_dir), (test_name, test_dir)]:
262 |     subset_audio_filenames = recursive_glob_ext(subset_dir, ext="WAV")
263 | 
264 |     logger.info("Loading phoneme transcriptions for {}...".format(subset_kind))
265 |     subset_transcriptions = load_transcriptions(subset_audio_filenames)
266 | 
267 |     # sanity check
268 |     verify_phonemes(set(timit_phonemes), phoneme_set(subset_transcriptions))
269 | 
270 |     subset_features_list = []
271 |     if load_features:
272 |       logger.info("Loading precomputed spectrograms for {}...".format(subset_kind))
273 |       subset_features_list = load_precomputed_spectrograms(filepath='/data/speech/timit/mfcc-timit.hdf5')
274 |     else:
275 |       logger.info("Computing spectrograms for {}...".format(subset_kind))
276 |       subset_features_list = compute_spectrograms(subset_audio_filenames)
277 | 
278 |     # compute sizes for np.arrays
279 |     n_examples = len(subset_features_list)
280 |     max_frames = max(feature_vector.shape[1] for feature_vector in subset_features_list)
281 |     n_coeffs = subset_features_list[0].shape[0] # same for all
282 |     max_labels = max(len(transcription) for transcription in subset_transcriptions)
283 | 
284 |     logger.info("Building label array by indexing labels from transcriptions for {}...".format(subset_kind))
285 |     subset_labels, subset_seq_lens = index_labels(phoneme2index_dict, subset_transcriptions, max_labels)
286 | 
287 |     logger.info("Building spectrogram array for {}...".format(subset_kind))
288 |     subset_spectrograms = build_spectrogram_array(subset_features_list, n_examples, max_frames, n_coeffs)
289 | 
290 |     # store for later saving
291 |     audio_filenames[subset_kind] = subset_audio_filenames
292 |     spectrograms[subset_kind] = subset_spectrograms
293 |     labels[subset_kind] = subset_labels
294 |     seq_lens[subset_kind] = subset_seq_lens
295 | 
296 |     logger.info("Finished preprocessing {}.".format(subset_kind))
297 | 
298 |   logger.info("Saving HDF5 train/test dataset...")
299 |   save_feature_dataset(audio_filenames, spectrograms, seq_lens, phoneme2index_list, labels, filepath=timit_hdf5_filepath)
300 | 
301 |   logger.info("Finished.")
302 | 
303 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 |                                  Apache License
2 |                            Version 2.0, January 2004
3 |                         http://www.apache.org/licenses/
4 | 
5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 | 
7 |    1. Definitions.
8 | 
9 |       "License" shall mean the terms and conditions for use, reproduction,
10 |       and distribution as defined by Sections 1 through 9 of this document.
11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 
134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2016-2017, the President and Fellows of Harvard College 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 
193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /fathom/seq2seq/data_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2015 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Utilities for downloading data from WMT, tokenizing, vocabularies.""" 17 | 18 | 19 | 20 | 21 | import gzip 22 | import os 23 | import re 24 | import tarfile 25 | 26 | from six.moves import urllib 27 | 28 | from tensorflow.python.platform import gfile 29 | 30 | # Special vocabulary symbols - we always put them at the start. 31 | _PAD = b"_PAD" 32 | _GO = b"_GO" 33 | _EOS = b"_EOS" 34 | _UNK = b"_UNK" 35 | _START_VOCAB = [_PAD, _GO, _EOS, _UNK] 36 | 37 | PAD_ID = 0 38 | GO_ID = 1 39 | EOS_ID = 2 40 | UNK_ID = 3 41 | 42 | # Regular expressions used to tokenize. 43 | _WORD_SPLIT = re.compile(b"([.,!?\"':;)(])") 44 | _DIGIT_RE = re.compile(br"\d") 45 | 46 | # URLs for WMT data. 
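# (The first URL is the WMT'10 "giga-fren" English-French training corpus; the
# second is the WMT'15 dev tarball, from which newstest2013 is extracted below.)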
47 | _WMT_ENFR_TRAIN_URL = "http://www.statmt.org/wmt10/training-giga-fren.tar" 48 | _WMT_ENFR_DEV_URL = "http://www.statmt.org/wmt15/dev-v2.tgz" 49 | 50 | 51 | def maybe_download(directory, filename, url): 52 | """Download filename from url unless it's already in directory.""" 53 | if not os.path.exists(directory): 54 | print("Creating directory %s" % directory) 55 | os.makedirs(directory) 56 | filepath = os.path.join(directory, filename) 57 | if not os.path.exists(filepath): 58 | print("Downloading %s to %s" % (url, filepath)) 59 | filepath, _ = urllib.request.urlretrieve(url, filepath) 60 | statinfo = os.stat(filepath) 61 | print("Succesfully downloaded", filename, statinfo.st_size, "bytes") 62 | return filepath 63 | 64 | 65 | def gunzip_file(gz_path, new_path): 66 | """Unzips from gz_path into new_path.""" 67 | print("Unpacking %s to %s" % (gz_path, new_path)) 68 | with gzip.open(gz_path, "rb") as gz_file: 69 | with open(new_path, "wb") as new_file: 70 | for line in gz_file: 71 | new_file.write(line) 72 | 73 | 74 | def get_wmt_enfr_train_set(directory): 75 | """Download the WMT en-fr training corpus to directory unless it's there.""" 76 | train_path = os.path.join(directory, "giga-fren.release2.fixed") 77 | if not (gfile.Exists(train_path +".fr") and gfile.Exists(train_path +".en")): 78 | corpus_file = maybe_download(directory, "training-giga-fren.tar", 79 | _WMT_ENFR_TRAIN_URL) 80 | print("Extracting tar file %s" % corpus_file) 81 | with tarfile.open(corpus_file, "r") as corpus_tar: 82 | corpus_tar.extractall(directory) 83 | gunzip_file(train_path + ".fr.gz", train_path + ".fr") 84 | gunzip_file(train_path + ".en.gz", train_path + ".en") 85 | return train_path 86 | 87 | 88 | def get_wmt_enfr_dev_set(directory): 89 | """Download the WMT en-fr training corpus to directory unless it's there.""" 90 | dev_name = "newstest2013" 91 | dev_path = os.path.join(directory, dev_name) 92 | if not (gfile.Exists(dev_path + ".fr") and gfile.Exists(dev_path + ".en")): 93 | dev_file = maybe_download(directory, "dev-v2.tgz", _WMT_ENFR_DEV_URL) 94 | print("Extracting tgz file %s" % dev_file) 95 | with tarfile.open(dev_file, "r:gz") as dev_tar: 96 | fr_dev_file = dev_tar.getmember("dev/" + dev_name + ".fr") 97 | en_dev_file = dev_tar.getmember("dev/" + dev_name + ".en") 98 | fr_dev_file.name = dev_name + ".fr" # Extract without "dev/" prefix. 99 | en_dev_file.name = dev_name + ".en" 100 | dev_tar.extract(fr_dev_file, directory) 101 | dev_tar.extract(en_dev_file, directory) 102 | return dev_path 103 | 104 | 105 | def basic_tokenizer(sentence): 106 | """Very basic tokenizer: split the sentence into a list of tokens.""" 107 | words = [] 108 | for space_separated_fragment in sentence.strip().split(): 109 | words.extend(re.split(_WORD_SPLIT, space_separated_fragment)) 110 | return [w for w in words if w] 111 | 112 | 113 | def create_vocabulary(vocabulary_path, data_path, max_vocabulary_size, 114 | tokenizer=None, normalize_digits=True): 115 | """Create vocabulary file (if it does not exist yet) from data file. 116 | 117 | Data file is assumed to contain one sentence per line. Each sentence is 118 | tokenized and digits are normalized (if normalize_digits is set). 119 | Vocabulary contains the most-frequent tokens up to max_vocabulary_size. 120 | We write it to vocabulary_path in a one-token-per-line format, so that later 121 | token in the first line gets id=0, second line gets id=1, and so on. 122 | 123 | Args: 124 | vocabulary_path: path where the vocabulary will be created. 
125 | data_path: data file that will be used to create vocabulary. 126 | max_vocabulary_size: limit on the size of the created vocabulary. 127 | tokenizer: a function to use to tokenize each data sentence; 128 | if None, basic_tokenizer will be used. 129 | normalize_digits: Boolean; if true, all digits are replaced by 0s. 130 | """ 131 | if not gfile.Exists(vocabulary_path): 132 | print("Creating vocabulary %s from data %s" % (vocabulary_path, data_path)) 133 | vocab = {} 134 | with gfile.GFile(data_path, mode="rb") as f: 135 | counter = 0 136 | for line in f: 137 | counter += 1 138 | if counter % 100000 == 0: 139 | print(" processing line %d" % counter) 140 | tokens = tokenizer(line) if tokenizer else basic_tokenizer(line) 141 | for w in tokens: 142 | word = re.sub(_DIGIT_RE, b"0", w) if normalize_digits else w 143 | if word in vocab: 144 | vocab[word] += 1 145 | else: 146 | vocab[word] = 1 147 | vocab_list = _START_VOCAB + sorted(vocab, key=vocab.get, reverse=True) 148 | if len(vocab_list) > max_vocabulary_size: 149 | vocab_list = vocab_list[:max_vocabulary_size] 150 | with gfile.GFile(vocabulary_path, mode="wb") as vocab_file: 151 | for w in vocab_list: 152 | vocab_file.write(w + b"\n") 153 | 154 | 155 | def initialize_vocabulary(vocabulary_path): 156 | """Initialize vocabulary from file. 157 | 158 | We assume the vocabulary is stored one-item-per-line, so a file: 159 | dog 160 | cat 161 | will result in a vocabulary {"dog": 0, "cat": 1}, and this function will 162 | also return the reversed-vocabulary ["dog", "cat"]. 163 | 164 | Args: 165 | vocabulary_path: path to the file containing the vocabulary. 166 | 167 | Returns: 168 | a pair: the vocabulary (a dictionary mapping string to integers), and 169 | the reversed vocabulary (a list, which reverses the vocabulary mapping). 170 | 171 | Raises: 172 | ValueError: if the provided vocabulary_path does not exist. 173 | """ 174 | if gfile.Exists(vocabulary_path): 175 | rev_vocab = [] 176 | with gfile.GFile(vocabulary_path, mode="rb") as f: 177 | rev_vocab.extend(f.readlines()) 178 | rev_vocab = [line.strip() for line in rev_vocab] 179 | vocab = dict([(x, y) for (y, x) in enumerate(rev_vocab)]) 180 | return vocab, rev_vocab 181 | else: 182 | raise ValueError("Vocabulary file %s not found.", vocabulary_path) 183 | 184 | 185 | def sentence_to_token_ids(sentence, vocabulary, 186 | tokenizer=None, normalize_digits=True): 187 | """Convert a string to list of integers representing token-ids. 188 | 189 | For example, a sentence "I have a dog" may become tokenized into 190 | ["I", "have", "a", "dog"] and with vocabulary {"I": 1, "have": 2, 191 | "a": 4, "dog": 7"} this function will return [1, 2, 4, 7]. 192 | 193 | Args: 194 | sentence: the sentence in bytes format to convert to token-ids. 195 | vocabulary: a dictionary mapping tokens to integers. 196 | tokenizer: a function to use to tokenize each sentence; 197 | if None, basic_tokenizer will be used. 198 | normalize_digits: Boolean; if true, all digits are replaced by 0s. 199 | 200 | Returns: 201 | a list of integers, the token-ids for the sentence. 202 | """ 203 | 204 | if tokenizer: 205 | words = tokenizer(sentence) 206 | else: 207 | words = basic_tokenizer(sentence) 208 | if not normalize_digits: 209 | return [vocabulary.get(w, UNK_ID) for w in words] 210 | # Normalize digits by 0 before looking words up in the vocabulary. 
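  # For example, with normalize_digits=True the token b"2015" is looked up as
  # b"0000", so all four-digit numbers share a single vocabulary entry (or map
  # to UNK_ID if b"0000" is not in the vocabulary).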
211 | return [vocabulary.get(re.sub(_DIGIT_RE, b"0", w), UNK_ID) for w in words] 212 | 213 | 214 | def data_to_token_ids(data_path, target_path, vocabulary_path, 215 | tokenizer=None, normalize_digits=True): 216 | """Tokenize data file and turn into token-ids using given vocabulary file. 217 | 218 | This function loads data line-by-line from data_path, calls the above 219 | sentence_to_token_ids, and saves the result to target_path. See comment 220 | for sentence_to_token_ids on the details of token-ids format. 221 | 222 | Args: 223 | data_path: path to the data file in one-sentence-per-line format. 224 | target_path: path where the file with token-ids will be created. 225 | vocabulary_path: path to the vocabulary file. 226 | tokenizer: a function to use to tokenize each sentence; 227 | if None, basic_tokenizer will be used. 228 | normalize_digits: Boolean; if true, all digits are replaced by 0s. 229 | """ 230 | if not gfile.Exists(target_path): 231 | print("Tokenizing data in %s" % data_path) 232 | vocab, _ = initialize_vocabulary(vocabulary_path) 233 | with gfile.GFile(data_path, mode="rb") as data_file: 234 | with gfile.GFile(target_path, mode="w") as tokens_file: 235 | counter = 0 236 | for line in data_file: 237 | counter += 1 238 | if counter % 100000 == 0: 239 | print(" tokenizing line %d" % counter) 240 | token_ids = sentence_to_token_ids(line, vocab, tokenizer, 241 | normalize_digits) 242 | tokens_file.write(" ".join([str(tok) for tok in token_ids]) + "\n") 243 | 244 | 245 | def prepare_wmt_data(data_dir, en_vocabulary_size, fr_vocabulary_size, tokenizer=None): 246 | """Get WMT data into data_dir, create vocabularies and tokenize data. 247 | 248 | Args: 249 | data_dir: directory in which the data sets will be stored. 250 | en_vocabulary_size: size of the English vocabulary to create and use. 251 | fr_vocabulary_size: size of the French vocabulary to create and use. 252 | tokenizer: a function to use to tokenize each data sentence; 253 | if None, basic_tokenizer will be used. 254 | 255 | Returns: 256 | A tuple of 6 elements: 257 | (1) path to the token-ids for English training data-set, 258 | (2) path to the token-ids for French training data-set, 259 | (3) path to the token-ids for English development data-set, 260 | (4) path to the token-ids for French development data-set, 261 | (5) path to the English vocabulary file, 262 | (6) path to the French vocabulary file. 263 | """ 264 | # Get wmt data to the specified directory. 265 | train_path = get_wmt_enfr_train_set(data_dir) 266 | dev_path = get_wmt_enfr_dev_set(data_dir) 267 | 268 | # Create vocabularies of the appropriate sizes. 269 | fr_vocab_path = os.path.join(data_dir, "vocab%d.fr" % fr_vocabulary_size) 270 | en_vocab_path = os.path.join(data_dir, "vocab%d.en" % en_vocabulary_size) 271 | create_vocabulary(fr_vocab_path, train_path + ".fr", fr_vocabulary_size, tokenizer) 272 | create_vocabulary(en_vocab_path, train_path + ".en", en_vocabulary_size, tokenizer) 273 | 274 | # Create token ids for the training data. 275 | fr_train_ids_path = train_path + (".ids%d.fr" % fr_vocabulary_size) 276 | en_train_ids_path = train_path + (".ids%d.en" % en_vocabulary_size) 277 | data_to_token_ids(train_path + ".fr", fr_train_ids_path, fr_vocab_path, tokenizer) 278 | data_to_token_ids(train_path + ".en", en_train_ids_path, en_vocab_path, tokenizer) 279 | 280 | # Create token ids for the development data. 
281 | fr_dev_ids_path = dev_path + (".ids%d.fr" % fr_vocabulary_size) 282 | en_dev_ids_path = dev_path + (".ids%d.en" % en_vocabulary_size) 283 | data_to_token_ids(dev_path + ".fr", fr_dev_ids_path, fr_vocab_path, tokenizer) 284 | data_to_token_ids(dev_path + ".en", en_dev_ids_path, en_vocab_path, tokenizer) 285 | 286 | return (en_train_ids_path, fr_train_ids_path, 287 | en_dev_ids_path, fr_dev_ids_path, 288 | en_vocab_path, fr_vocab_path) 289 | -------------------------------------------------------------------------------- /fathom/deepq/deepq.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # NOTE: Based on Tejas Kulkarni's implementation 3 | # (https://github.com/mrkulk/deepQN_tensorflow). 4 | import time 5 | import datetime 6 | 7 | import tensorflow as tf 8 | import numpy as np 9 | import cv2 10 | 11 | from fathom.nn import GenericModel, default_runstep 12 | 13 | from .database import * 14 | from .emulator import * 15 | 16 | # TODO: clean up this file 17 | nature_params = { 18 | 'game': 'breakout', 19 | 'window_name': "NNModel: Deep Q-Learning for Atari", 20 | 'frameskip': 1, 21 | 'visualize' : False, 22 | 'network_type':'nips', 23 | 'ckpt_file':None, 24 | 'steps_per_epoch': 50000, 25 | 'num_epochs': 100, 26 | 'eval_freq':50000, 27 | 'steps_per_eval':10000, 28 | 'copy_freq' : 10000, 29 | 'disp_freq':10000, 30 | 'save_interval':10000, 31 | 'db_size': 1000000, 32 | 'batch': 32, 33 | 'num_act': 0, 34 | 'input_dims' : [210, 160, 3], 35 | 'input_dims_proc' : [84, 84, 4], 36 | 'learning_interval': 1, 37 | 'eps': 1.0, 38 | 'eps_step':1000000, 39 | 'eps_min' : 0.1, 40 | 'eps_eval' : 0.05, 41 | 'discount': 0.95, 42 | 'lr': 0.0002, 43 | 'rms_decay':0.99, 44 | 'rms_eps':1e-6, 45 | 'train_start':100, # default: 100 46 | 'img_scale':255.0, 47 | 'clip_delta' : 0, #nature : 1 48 | 'gpu_fraction' : 0.25, 49 | 'batch_accumulator':'mean', 50 | 'record_eval' : True, 51 | 'only_eval' : 'n' 52 | } 53 | 54 | nature_params['steps_per_epoch']= 200000 55 | nature_params['eval_freq'] = 100000 56 | nature_params['steps_per_eval'] = 10000 57 | nature_params['copy_freq'] = 10000 58 | nature_params['disp_freq'] = 20000 59 | nature_params['save_interval'] = 20000 60 | #nature_params['learning_interval'] = 1 61 | nature_params['discount'] = 0.99 62 | nature_params['lr'] = 0.00025 63 | nature_params['rms_decay'] = 0.95 64 | nature_params['rms_eps']=0.01 65 | nature_params['clip_delta'] = 1.0 66 | #nature_params['train_start']=50000 67 | nature_params['batch_accumulator'] = 'sum' 68 | nature_params['eps_step'] = 1000000 69 | nature_params['num_epochs'] = 250 70 | nature_params['batch'] = 32 71 | 72 | # The actual neural network interface implementation is the network which 73 | # combines the Q-network and target-network below, not this one. 
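# Editor's sketch, not part of the original module: DeepQNetNature below builds
# the Q-learning target in-graph as
#   self.yj = rewards + (1 - terminals) * discount * q_t
# where q_t holds max_a Q_target(s', a) computed by the separate target network
# (DeepQ.run feeds those target-network outputs back in through the q_t
# placeholder). A minimal NumPy version of that Bellman target, with purely
# illustrative values (np is the numpy import at the top of this module):

_example_rewards = np.array([1.0, 0.0])    # immediate reward r for two sampled transitions
_example_terminals = np.array([0.0, 1.0])  # 1.0 marks an episode-ending transition
_example_q_next = np.array([2.5, 4.0])     # max_a Q_target(s', a) from the target network
_example_discount = 0.99
_example_targets = _example_rewards + (1.0 - _example_terminals) * _example_discount * _example_q_next
# -> array([3.475, 0.]): terminal transitions keep only their immediate reward.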
74 | class DeepQNetNature(object): 75 | """Q-learning network which approximates action-value and action-value targets.""" 76 | def __init__(self, params, parent_graph): 77 | self.G = parent_graph 78 | self.build(params) 79 | 80 | def build(self, params): 81 | with self.G.as_default(): 82 | self.network_type = 'nature' 83 | self.params = params 84 | self.network_name = "deepqnet" 85 | self.x = tf.placeholder('float32',[None,84,84,4],name=self.network_name + '_x') 86 | self.q_t = tf.placeholder('float32',[None],name=self.network_name + '_q_t') 87 | self.actions = tf.placeholder("float32", [None, params['num_act']],name=self.network_name + '_actions') 88 | self.rewards = tf.placeholder("float32", [None],name=self.network_name + '_rewards') 89 | self.terminals = tf.placeholder("float32", [None],name=self.network_name + '_terminals') 90 | 91 | #conv1 92 | layer_name = 'conv1' ; size = 8 ; channels = 4 ; filters = 32 ; stride = 4 93 | self.w1 = tf.Variable(tf.random_normal([size,size,channels,filters], stddev=0.01),name=self.network_name + '_'+layer_name+'_weights') 94 | self.b1 = tf.Variable(tf.constant(0.1, shape=[filters]),name=self.network_name + '_'+layer_name+'_biases') 95 | self.c1 = tf.nn.conv2d(self.x, self.w1, strides=[1, stride, stride, 1], padding='VALID',name=self.network_name + '_'+layer_name+'_convs') 96 | self.o1 = tf.nn.relu(tf.add(self.c1,self.b1),name=self.network_name + '_'+layer_name+'_activations') 97 | #self.n1 = tf.nn.lrn(self.o1, 4, bias=1.0, alpha=0.001 / 9.0, beta=0.75) 98 | 99 | #conv2 100 | layer_name = 'conv2' ; size = 4 ; channels = 32 ; filters = 64 ; stride = 2 101 | self.w2 = tf.Variable(tf.random_normal([size,size,channels,filters], stddev=0.01),name=self.network_name + '_'+layer_name+'_weights') 102 | self.b2 = tf.Variable(tf.constant(0.1, shape=[filters]),name=self.network_name + '_'+layer_name+'_biases') 103 | self.c2 = tf.nn.conv2d(self.o1, self.w2, strides=[1, stride, stride, 1], padding='VALID',name=self.network_name + '_'+layer_name+'_convs') 104 | self.o2 = tf.nn.relu(tf.add(self.c2,self.b2),name=self.network_name + '_'+layer_name+'_activations') 105 | #self.n2 = tf.nn.lrn(self.o2, 4, bias=1.0, alpha=0.001 / 9.0, beta=0.75) 106 | 107 | #conv3 108 | layer_name = 'conv3' ; size = 3 ; channels = 64 ; filters = 64 ; stride = 1 109 | self.w3 = tf.Variable(tf.random_normal([size,size,channels,filters], stddev=0.01),name=self.network_name + '_'+layer_name+'_weights') 110 | self.b3 = tf.Variable(tf.constant(0.1, shape=[filters]),name=self.network_name + '_'+layer_name+'_biases') 111 | self.c3 = tf.nn.conv2d(self.o2, self.w3, strides=[1, stride, stride, 1], padding='VALID',name=self.network_name + '_'+layer_name+'_convs') 112 | self.o3 = tf.nn.relu(tf.add(self.c3,self.b3),name=self.network_name + '_'+layer_name+'_activations') 113 | #self.n2 = tf.nn.lrn(self.o2, 4, bias=1.0, alpha=0.001 / 9.0, beta=0.75) 114 | 115 | #flat 116 | o3_shape = self.o3.get_shape().as_list() 117 | 118 | #fc3 119 | layer_name = 'fc4' ; hiddens = 512 ; dim = o3_shape[1]*o3_shape[2]*o3_shape[3] 120 | self.o3_flat = tf.reshape(self.o3, [-1,dim],name=self.network_name + '_'+layer_name+'_input_flat') 121 | self.w4 = tf.Variable(tf.random_normal([dim,hiddens], stddev=0.01),name=self.network_name + '_'+layer_name+'_weights') 122 | self.b4 = tf.Variable(tf.constant(0.1, shape=[hiddens]),name=self.network_name + '_'+layer_name+'_biases') 123 | self.ip4 = tf.add(tf.matmul(self.o3_flat,self.w4),self.b4,name=self.network_name + '_'+layer_name+'_ips') 124 | self.o4 = 
tf.nn.relu(self.ip4,name=self.network_name + '_'+layer_name+'_activations') 125 | 126 | #fc4 127 | layer_name = 'fc5' ; hiddens = params['num_act'] ; dim = 512 128 | self.w5 = tf.Variable(tf.random_normal([dim,hiddens], stddev=0.01),name=self.network_name + '_'+layer_name+'_weights') 129 | self.b5 = tf.Variable(tf.constant(0.1, shape=[hiddens]),name=self.network_name + '_'+layer_name+'_biases') 130 | self.y = tf.add(tf.matmul(self.o4,self.w5),self.b5,name=self.network_name + '_'+layer_name+'_outputs') 131 | 132 | #Q,Cost,Optimizer 133 | self.discount = tf.constant(self.params['discount']) 134 | self.yj = tf.add(self.rewards, tf.multiply(1.0-self.terminals, tf.multiply(self.discount, self.q_t))) 135 | self.Qxa = tf.multiply(self.y,self.actions) 136 | self.Q_pred = tf.reduce_max(self.Qxa, axis=1) 137 | #self.yjr = tf.reshape(self.yj,(-1,1)) 138 | #self.yjtile = tf.concat(1,[self.yjr,self.yjr,self.yjr,self.yjr]) 139 | #self.yjax = tf.mul(self.yjtile,self.actions) 140 | 141 | #half = tf.constant(0.5) 142 | self.diff = tf.subtract(self.yj, self.Q_pred) 143 | if self.params['clip_delta'] > 0 : 144 | self.quadratic_part = tf.minimum(tf.abs(self.diff), tf.constant(self.params['clip_delta'])) 145 | self.linear_part = tf.subtract(tf.abs(self.diff),self.quadratic_part) 146 | self.diff_square = 0.5 * tf.pow(self.quadratic_part,2) + self.params['clip_delta']*self.linear_part 147 | 148 | else: 149 | self.diff_square = tf.multiply(tf.constant(0.5),tf.pow(self.diff, 2)) 150 | # add optimization 151 | 152 | self.loss() 153 | self.train() 154 | 155 | def loss(self): 156 | with self.G.as_default(): 157 | if self.params['batch_accumulator'] == 'sum': 158 | self.cost = tf.reduce_sum(self.diff_square) 159 | else: 160 | self.cost = tf.reduce_mean(self.diff_square) 161 | 162 | def train(self): 163 | with self.G.as_default(): 164 | self.global_step = tf.Variable(0, name='global_step', trainable=False) 165 | self.rmsprop = tf.train.RMSPropOptimizer(self.params['lr'],self.params['rms_decay'],0.0,self.params['rms_eps']).minimize(self.cost,global_step=self.global_step) 166 | return self.rmsprop 167 | 168 | class DeepQ(GenericModel): 169 | """Deep Q-Learning.""" 170 | forward_only = False 171 | 172 | def __init__(self, device=None, init_options=None, game=nature_params['game']): 173 | super(DeepQ,self).__init__(device=device, init_options=init_options) 174 | assert game in ["breakout", "space_invaders", "seaquest"] 175 | 176 | self.G = tf.Graph() 177 | 178 | # NOTE: moved tf.Graph construction to setup 179 | self.params = nature_params 180 | 181 | self.DB = database(self.params) 182 | self.engine = emulator(rom_name='{}.bin'.format(game), vis=self.params['visualize'], frameskip=self.params['frameskip'], windowname=self.params['window_name']) 183 | #self.engine = emulator(rom_name='{}.bin'.format(game), vis=self.params['visualize'], frameskip=self.params['frameskip'], windowname=self.params['window_name']) 184 | self.params['num_act'] = len(self.engine.legal_actions) 185 | 186 | with self.G.device(device): 187 | self.build_inference() 188 | 189 | def build_inference(self): 190 | with self.G.as_default(): 191 | print('Building QNet and targetnet...') 192 | self.qnet = DeepQNetNature(self.params, self.G) 193 | self.targetnet = DeepQNetNature(self.params, self.G) 194 | saver_dict = {'qw1':self.qnet.w1,'qb1':self.qnet.b1, 195 | 'qw2':self.qnet.w2,'qb2':self.qnet.b2, 196 | 'qw3':self.qnet.w3,'qb3':self.qnet.b3, 197 | 'qw4':self.qnet.w4,'qb4':self.qnet.b4, 198 | 'qw5':self.qnet.w5,'qb5':self.qnet.b5, 199 | 
'tw1':self.targetnet.w1,'tb1':self.targetnet.b1, 200 | 'tw2':self.targetnet.w2,'tb2':self.targetnet.b2, 201 | 'tw3':self.targetnet.w3,'tb3':self.targetnet.b3, 202 | 'tw4':self.targetnet.w4,'tb4':self.targetnet.b4, 203 | 'tw5':self.targetnet.w5,'tb5':self.targetnet.b5, 204 | 'step':self.qnet.global_step} 205 | 206 | print("#ops", len(self.G.get_operations())) 207 | 208 | self.saver = tf.train.Saver(saver_dict) 209 | #self.saver = tf.train.Saver() 210 | 211 | self.cp_ops = [ 212 | self.targetnet.w1.assign(self.qnet.w1),self.targetnet.b1.assign(self.qnet.b1), 213 | self.targetnet.w2.assign(self.qnet.w2),self.targetnet.b2.assign(self.qnet.b2), 214 | self.targetnet.w3.assign(self.qnet.w3),self.targetnet.b3.assign(self.qnet.b3), 215 | self.targetnet.w4.assign(self.qnet.w4),self.targetnet.b4.assign(self.qnet.b4), 216 | self.targetnet.w5.assign(self.qnet.w5),self.targetnet.b5.assign(self.qnet.b5)] 217 | 218 | if self.params['ckpt_file'] is not None: 219 | print('loading checkpoint : ' + self.params['ckpt_file']) 220 | self.saver.restore(self.sess,self.params['ckpt_file']) 221 | temp_train_cnt = self.sess.run(self.qnet.global_step) 222 | temp_step = temp_train_cnt * self.params['learning_interval'] 223 | print('Continue from') 224 | print(' -> Steps : ' + str(temp_step)) 225 | print(' -> Minibatch update : ' + str(temp_train_cnt)) 226 | 227 | def model(self): 228 | return self.G 229 | 230 | def setup(self, setup_options=None): 231 | super(DeepQ,self).setup(setup_options=setup_options) 232 | with self.G.as_default(): 233 | if setup_options is None: 234 | self.setup_config = tf.ConfigProto(gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=self.params['gpu_fraction'])) 235 | else: 236 | self.setup_config = tf.ConfigProto(**setup_options) 237 | self.setup_config.gpu_options.per_process_gpu_memory_fraction=self.params['gpu_fraction'] 238 | 239 | self.sess = tf.Session(config=self.setup_config) 240 | self.init = tf.global_variables_initializer() 241 | self.sess.run(self.init) 242 | self.sess.run(self.cp_ops) 243 | 244 | self.reset_game() 245 | self.step = 0 246 | self.reset_statistics('all') 247 | self.train_cnt = self.sess.run(self.qnet.global_step) 248 | 249 | def reset_game(self): 250 | self.state_proc = np.zeros((84,84,4)); self.action = -1; self.terminal = False; self.reward = 0 251 | self.state = self.engine.newGame() 252 | self.state_resized = cv2.resize(self.state,(84,110)) 253 | self.state_gray = cv2.cvtColor(self.state_resized, cv2.COLOR_BGR2GRAY) 254 | self.state_gray_old = None 255 | self.state_proc[:,:,3] = self.state_gray[26:110,:]/self.params['img_scale'] 256 | 257 | def reset_statistics(self, mode): 258 | if mode == 'all': 259 | self.epi_reward_train = 0 260 | self.epi_Q_train = 0 261 | self.num_epi_train = 0 262 | self.total_reward_train = 0 263 | self.total_Q_train = 0 264 | self.total_cost_train = 0 265 | self.steps_train = 0 266 | self.train_cnt_for_disp = 0 267 | self.step_eval = 0 268 | self.epi_reward_eval = 0 269 | self.epi_Q_eval = 0 270 | self.num_epi_eval = 0 271 | self.total_reward_eval = 0 272 | self.total_Q_eval = 0 273 | 274 | def select_action(self, st, runstep=None): 275 | with self.G.as_default(): 276 | if np.random.rand() > self.params['eps']: 277 | #greedy with random tie-breaking 278 | if not self.forward_only: 279 | Q_pred = self.sess.run(self.qnet.y, feed_dict = {self.qnet.x: np.reshape(st, (1,84,84,4))})[0] 280 | else: 281 | Q_pred = runstep(self.sess, self.qnet.y, feed_dict = {self.qnet.x: np.reshape(st, (1,84,84,4))})[0] 282 | 283 | a_winner = 
np.argwhere(Q_pred == np.amax(Q_pred)) 284 | if len(a_winner) > 1: 285 | act_idx = a_winner[np.random.randint(0, len(a_winner))][0] 286 | return act_idx,self.engine.legal_actions[act_idx], np.amax(Q_pred) 287 | else: 288 | act_idx = a_winner[0][0] 289 | return act_idx,self.engine.legal_actions[act_idx], np.amax(Q_pred) 290 | else: 291 | #random 292 | act_idx = np.random.randint(0,len(self.engine.legal_actions)) 293 | if not self.forward_only: 294 | Q_pred = self.sess.run(self.qnet.y, feed_dict = {self.qnet.x: np.reshape(st, (1,84,84,4))})[0] 295 | else: 296 | Q_pred = runstep(self.sess, self.qnet.y, feed_dict = {self.qnet.x: np.reshape(st, (1,84,84,4))})[0] 297 | return act_idx,self.engine.legal_actions[act_idx], Q_pred[act_idx] 298 | 299 | def get_onehot(self,actions): 300 | actions_onehot = np.zeros((self.params['batch'], self.params['num_act'])) 301 | 302 | for i in range(self.params['batch']): 303 | actions_onehot[i,int(actions[i])] = 1 304 | return actions_onehot 305 | 306 | def run(self, runstep=default_runstep, n_steps=1): 307 | self.s = time.time() 308 | print(self.params) 309 | print('Start training!') 310 | print('Collecting replay memory for ' + str(self.params['train_start']) + ' steps') 311 | 312 | with self.G.as_default(): 313 | while self.step < (self.params['steps_per_epoch'] * self.params['num_epochs'] * self.params['learning_interval'] + self.params['train_start']): 314 | if not self.forward_only: 315 | if self.step >= n_steps: 316 | return 317 | if self.DB.get_size() >= self.params['train_start'] : self.step += 1 ; self.steps_train += 1 318 | else: 319 | if self.step_eval >= n_steps: 320 | return 321 | self.step_eval += 1 322 | if self.state_gray_old is not None and not self.forward_only: 323 | self.DB.insert(self.state_gray_old[26:110,:],self.reward_scaled,self.action_idx,self.terminal) 324 | 325 | if not self.forward_only and self.params['copy_freq'] > 0 and self.step % self.params['copy_freq'] == 0 and self.DB.get_size() > self.params['train_start']: 326 | print('&&& Copying Qnet to targetnet\n') 327 | self.sess.run(self.cp_ops) 328 | 329 | if not self.forward_only and self.step % self.params['learning_interval'] == 0 and self.DB.get_size() > self.params['train_start'] : 330 | bat_s,bat_a,bat_t,bat_n,bat_r = self.DB.get_batches() 331 | bat_a = self.get_onehot(bat_a) 332 | 333 | if self.params['copy_freq'] > 0 : 334 | feed_dict={self.targetnet.x: bat_n} 335 | q_t = self.sess.run(self.targetnet.y,feed_dict=feed_dict) 336 | else: 337 | feed_dict={self.qnet.x: bat_n} 338 | q_t = self.sess.run(self.qnet.y,feed_dict=feed_dict) 339 | 340 | q_t = np.amax(q_t,axis=1) 341 | 342 | feed_dict={self.qnet.x: bat_s, self.qnet.q_t: q_t, self.qnet.actions: bat_a, self.qnet.terminals:bat_t, self.qnet.rewards: bat_r} 343 | 344 | # NOTE: we only runstep the Qnet 345 | _,self.train_cnt,self.cost = runstep(self.sess, [self.qnet.rmsprop,self.qnet.global_step,self.qnet.cost],feed_dict=feed_dict) 346 | 347 | self.total_cost_train += np.sqrt(self.cost) 348 | self.train_cnt_for_disp += 1 349 | 350 | if not self.forward_only: 351 | self.params['eps'] = max(self.params['eps_min'],1.0 - float(self.train_cnt * self.params['learning_interval'])/float(self.params['eps_step'])) 352 | else: 353 | self.params['eps'] = 0.05 354 | 355 | if self.DB.get_size() > self.params['train_start'] and self.step % self.params['save_interval'] == 0 and not self.forward_only: 356 | save_idx = self.train_cnt 357 | self.saver.save(self.sess,'ckpt/model_'+self.params['network_type']+'_'+str(save_idx)) 358 | 
sys.stdout.write('$$$ Model saved : %s\n\n' % ('ckpt/model_'+self.params['network_type']+'_'+str(save_idx))) 359 | sys.stdout.flush() 360 | 361 | if not self.forward_only and self.step > 0 and self.step % self.params['eval_freq'] == 0 and self.DB.get_size() > self.params['train_start']: 362 | self.reset_game() 363 | if self.step % self.params['steps_per_epoch'] == 0 : self.reset_statistics('all') 364 | else: self.reset_statistics('eval') 365 | self.forward_only = True 366 | #TODO : add video recording 367 | continue 368 | if not self.forward_only and self.step > 0 and self.step % self.params['steps_per_epoch'] == 0 and self.DB.get_size() > self.params['train_start']: 369 | self.reset_game() 370 | self.reset_statistics('all') 371 | #self.forward_only = True 372 | continue 373 | 374 | if self.forward_only and self.step_eval >= self.params['steps_per_eval'] : 375 | self.reset_game() 376 | self.reset_statistics('eval') 377 | self.forward_only = False 378 | continue 379 | 380 | if self.terminal: 381 | self.reset_game() 382 | if not self.forward_only: 383 | self.num_epi_train += 1 384 | self.total_reward_train += self.epi_reward_train 385 | self.epi_reward_train = 0 386 | else: 387 | self.num_epi_eval += 1 388 | self.total_reward_eval += self.epi_reward_eval 389 | self.epi_reward_eval = 0 390 | continue 391 | 392 | self.action_idx,self.action, self.maxQ = self.select_action(self.state_proc, runstep=runstep) 393 | self.state, self.reward, self.terminal = self.engine.next(self.action) 394 | self.reward_scaled = self.reward // max(1,abs(self.reward)) 395 | if not self.forward_only : self.epi_reward_train += self.reward ; self.total_Q_train += self.maxQ 396 | else : self.epi_reward_eval += self.reward ; self.total_Q_eval += self.maxQ 397 | 398 | self.state_gray_old = np.copy(self.state_gray) 399 | self.state_proc[:,:,0:3] = self.state_proc[:,:,1:4] 400 | self.state_resized = cv2.resize(self.state,(84,110)) 401 | self.state_gray = cv2.cvtColor(self.state_resized, cv2.COLOR_BGR2GRAY) 402 | self.state_proc[:,:,3] = self.state_gray[26:110,:]/self.params['img_scale'] 403 | 404 | print("Finished step {0} ({1})".format(self.step_eval, datetime.datetime.now())) 405 | 406 | @property 407 | def loss(self): 408 | return self.qnet.cost 409 | 410 | @property 411 | def train(self): 412 | return self.qnet.rmsprop 413 | 414 | @property 415 | def labels(self): 416 | return 417 | 418 | @property 419 | def inputs(self): 420 | return self.qnet.x, self.qnet.q_t, self.qnet.actions, self.qnet.rewards, self.qnet.terminals 421 | 422 | @property 423 | def outputs(self): 424 | return self.qnet.y # just outputs, not predictions 425 | 426 | def teardown(self): 427 | if self.sess is not None: 428 | self.sess.close() 429 | self.sess = None 430 | 431 | class DeepQFwd(DeepQ): 432 | forward_only = True 433 | 434 | if __name__=='__main__': 435 | m = DeepQ() 436 | m.setup() 437 | m.run(runstep=default_runstep, n_steps=100) 438 | m.teardown() 439 | 440 | -------------------------------------------------------------------------------- /fathom/imagenet/image_processing.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """Read and preprocess image data. 16 | 17 | Image processing occurs on a single image at a time. Image are read and 18 | preprocessed in pararllel across mulitple threads. The resulting images 19 | are concatenated together to form a single batch for training or evaluation. 20 | 21 | -- Provide processed image data for a network: 22 | inputs: Construct batches of evaluation examples of images. 23 | distorted_inputs: Construct batches of training examples of images. 24 | batch_inputs: Construct batches of training or evaluation examples of images. 25 | 26 | -- Data processing: 27 | parse_example_proto: Parses an Example proto containing a training example 28 | of an image. 29 | 30 | -- Image decoding: 31 | decode_jpeg: Decode a JPEG encoded string into a 3-D float32 Tensor. 32 | 33 | -- Image preprocessing: 34 | image_preprocessing: Decode and preprocess one image for evaluation or training 35 | distort_image: Distort one image for training a network. 36 | eval_image: Prepare one image for evaluation. 37 | distort_color: Distort the color in one image for training. 38 | """ 39 | 40 | 41 | 42 | 43 | 44 | import tensorflow as tf 45 | 46 | FLAGS = tf.app.flags.FLAGS 47 | 48 | tf.app.flags.DEFINE_integer('batch_size', 32, 49 | """Number of images to process in a batch.""") 50 | tf.app.flags.DEFINE_integer('image_size', 224, 51 | """Provide square images of this size.""") 52 | tf.app.flags.DEFINE_integer('num_preprocess_threads', 4, 53 | """Number of preprocessing threads per tower. """ 54 | """Please make this a multiple of 4.""") 55 | 56 | # Images are preprocessed asynchronously using multiple threads specifed by 57 | # --num_preprocss_threads and the resulting processed images are stored in a 58 | # random shuffling queue. The shuffling queue dequeues --batch_size images 59 | # for processing on a given Inception tower. A larger shuffling queue guarantees 60 | # better mixing across examples within a batch and results in slightly higher 61 | # predictive performance in a trained model. Empirically, 62 | # --input_queue_memory_factor=16 works well. A value of 16 implies a queue size 63 | # of 1024*16 images. Assuming RGB 299x299 images, this implies a queue size of 64 | # 16GB. If the machine is memory limited, then decrease this factor to 65 | # decrease the CPU memory footprint, accordingly. 66 | tf.app.flags.DEFINE_integer('input_queue_memory_factor', 1, 67 | """Size of the queue of preprocessed images. """ 68 | """Default is ideal but try smaller values, e.g. """ 69 | """4, 2 or 1, if host memory is constrained. See """ 70 | """comments in code for more details.""") 71 | 72 | 73 | def inputs(dataset, batch_size=None, num_preprocess_threads=None): 74 | """Generate batches of ImageNet images for evaluation. 75 | 76 | Use this function as the inputs for evaluating a network. 77 | 78 | Note that some (minimal) image preprocessing occurs during evaluation 79 | including central cropping and resizing of the image to fit the network. 
80 | 81 | Args: 82 | dataset: instance of Dataset class specifying the dataset. 83 | batch_size: integer, number of examples in batch 84 | num_preprocess_threads: integer, total number of preprocessing threads but 85 | None defaults to FLAGS.num_preprocess_threads. 86 | 87 | Returns: 88 | images: Images. 4D tensor of size [batch_size, FLAGS.image_size, 89 | image_size, 3]. 90 | labels: 1-D integer Tensor of [FLAGS.batch_size]. 91 | """ 92 | if not batch_size: 93 | batch_size = FLAGS.batch_size 94 | 95 | # Force all input processing onto CPU in order to reserve the GPU for 96 | # the forward inference and back-propagation. 97 | with tf.device('/cpu:0'): 98 | images, labels = batch_inputs( 99 | dataset, batch_size, train=False, 100 | num_preprocess_threads=num_preprocess_threads) 101 | 102 | return images, labels 103 | 104 | 105 | def distorted_inputs(dataset, batch_size=None, num_preprocess_threads=None): 106 | """Generate batches of distorted versions of ImageNet images. 107 | 108 | Use this function as the inputs for training a network. 109 | 110 | Distorting images provides a useful technique for augmenting the data 111 | set during training in order to make the network invariant to aspects 112 | of the image that do not effect the label. 113 | 114 | Args: 115 | dataset: instance of Dataset class specifying the dataset. 116 | batch_size: integer, number of examples in batch 117 | num_preprocess_threads: integer, total number of preprocessing threads but 118 | None defaults to FLAGS.num_preprocess_threads. 119 | 120 | Returns: 121 | images: Images. 4D tensor of size [batch_size, FLAGS.image_size, 122 | FLAGS.image_size, 3]. 123 | labels: 1-D integer Tensor of [batch_size]. 124 | """ 125 | if not batch_size: 126 | batch_size = FLAGS.batch_size 127 | 128 | # Force all input processing onto CPU in order to reserve the GPU for 129 | # the forward inference and back-propagation. 130 | with tf.device('/cpu:0'): 131 | images, labels = batch_inputs( 132 | dataset, batch_size, train=True, 133 | num_preprocess_threads=num_preprocess_threads) 134 | return images, labels 135 | 136 | 137 | def decode_jpeg(image_buffer, scope=None): 138 | """Decode a JPEG string into one 3-D float image Tensor. 139 | 140 | Args: 141 | image_buffer: scalar string Tensor. 142 | scope: Optional scope for op_scope. 143 | Returns: 144 | 3-D float Tensor with values ranging from [0, 1). 145 | """ 146 | with tf.name_scope(values=[image_buffer], name=scope, default_name='decode_jpeg'): 147 | # Decode the string as an RGB JPEG. 148 | # Note that the resulting image contains an unknown height and width 149 | # that is set dynamically by decode_jpeg. In other words, the height 150 | # and width of image is unknown at compile-time. 151 | image = tf.image.decode_jpeg(image_buffer, channels=3) 152 | 153 | # After this point, all image pixels reside in [0,1) 154 | # until the very end, when they're rescaled to (-1, 1). The various 155 | # adjust_* ops all require this range for dtype float. 156 | image = tf.image.convert_image_dtype(image, dtype=tf.float32) 157 | return image 158 | 159 | 160 | def distort_color(image, thread_id=0, scope=None): 161 | """Distort the color of the image. 162 | 163 | Each color distortion is non-commutative and thus ordering of the color ops 164 | matters. Ideally we would randomly permute the ordering of the color ops. 165 | Rather then adding that level of complication, we select a distinct ordering 166 | of color ops for each preprocessing thread. 
167 | 168 | Args: 169 | image: Tensor containing single image. 170 | thread_id: preprocessing thread ID. 171 | scope: Optional scope for op_scope. 172 | Returns: 173 | color-distorted image 174 | """ 175 | with tf.name_scope(values=[image], name=scope, default_name='distort_color'): 176 | color_ordering = thread_id % 2 177 | 178 | if color_ordering == 0: 179 | image = tf.image.random_brightness(image, max_delta=32. / 255.) 180 | image = tf.image.random_saturation(image, lower=0.5, upper=1.5) 181 | image = tf.image.random_hue(image, max_delta=0.2) 182 | image = tf.image.random_contrast(image, lower=0.5, upper=1.5) 183 | elif color_ordering == 1: 184 | image = tf.image.random_brightness(image, max_delta=32. / 255.) 185 | image = tf.image.random_contrast(image, lower=0.5, upper=1.5) 186 | image = tf.image.random_saturation(image, lower=0.5, upper=1.5) 187 | image = tf.image.random_hue(image, max_delta=0.2) 188 | 189 | # The random_* ops do not necessarily clamp. 190 | image = tf.clip_by_value(image, 0.0, 1.0) 191 | return image 192 | 193 | 194 | def distort_image(image, height, width, bbox, thread_id=0, scope=None): 195 | """Distort one image for training a network. 196 | 197 | Distorting images provides a useful technique for augmenting the data 198 | set during training in order to make the network invariant to aspects 199 | of the image that do not effect the label. 200 | 201 | Args: 202 | image: 3-D float Tensor of image 203 | height: integer 204 | width: integer 205 | bbox: 3-D float Tensor of bounding boxes arranged [1, num_boxes, coords] 206 | where each coordinate is [0, 1) and the coordinates are arranged 207 | as [ymin, xmin, ymax, xmax]. 208 | thread_id: integer indicating the preprocessing thread. 209 | scope: Optional scope for op_scope. 210 | Returns: 211 | 3-D float Tensor of distorted image used for training. 212 | """ 213 | with tf.name_scope(values=[image, height, width, bbox], name=scope, default_name='distort_image'): 214 | # Each bounding box has shape [1, num_boxes, box coords] and 215 | # the coordinates are ordered [ymin, xmin, ymax, xmax]. 216 | 217 | # Display the bounding box in the first thread only. 218 | if not thread_id: 219 | image_with_box = tf.image.draw_bounding_boxes(tf.expand_dims(image, 0), 220 | bbox) 221 | tf.summary.image('image_with_bounding_boxes', image_with_box) 222 | 223 | # A large fraction of image datasets contain a human-annotated bounding 224 | # box delineating the region of the image containing the object of interest. 225 | # We choose to create a new bounding box for the object which is a randomly 226 | # distorted version of the human-annotated bounding box that obeys an allowed 227 | # range of aspect ratios, sizes and overlap with the human-annotated 228 | # bounding box. If no box is supplied, then we assume the bounding box is 229 | # the entire image. 230 | sample_distorted_bounding_box = tf.image.sample_distorted_bounding_box( 231 | tf.shape(image), 232 | bounding_boxes=bbox, 233 | min_object_covered=0.1, 234 | aspect_ratio_range=[0.75, 1.33], 235 | area_range=[0.05, 1.0], 236 | max_attempts=100, 237 | use_image_if_no_bounding_boxes=True) 238 | bbox_begin, bbox_size, distort_bbox = sample_distorted_bounding_box 239 | if not thread_id: 240 | image_with_distorted_box = tf.image.draw_bounding_boxes( 241 | tf.expand_dims(image, 0), distort_bbox) 242 | tf.summary.image('images_with_distorted_bounding_box', 243 | image_with_distorted_box) 244 | 245 | # Crop the image to the specified bounding box. 
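    # (bbox_begin and bbox_size come from sample_distorted_bounding_box above;
    # tf.slice keeps only the randomly chosen window.)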
246 | distorted_image = tf.slice(image, bbox_begin, bbox_size) 247 | 248 | # This resizing operation may distort the images because the aspect 249 | # ratio is not respected. We select a resize method in a round robin 250 | # fashion based on the thread number. 251 | # Note that ResizeMethod contains 4 enumerated resizing methods. 252 | resize_method = thread_id % 4 253 | distorted_image = tf.image.resize_images(distorted_image, [height, width], 254 | resize_method) 255 | # Restore the shape since the dynamic slice based upon the bbox_size loses 256 | # the third dimension. 257 | distorted_image.set_shape([height, width, 3]) 258 | if not thread_id: 259 | tf.summary.image('cropped_resized_image', 260 | tf.expand_dims(distorted_image, 0)) 261 | 262 | # Randomly flip the image horizontally. 263 | distorted_image = tf.image.random_flip_left_right(distorted_image) 264 | 265 | # Randomly distort the colors. 266 | distorted_image = distort_color(distorted_image, thread_id) 267 | 268 | if not thread_id: 269 | tf.summary.image('final_distorted_image', 270 | tf.expand_dims(distorted_image, 0)) 271 | return distorted_image 272 | 273 | 274 | def eval_image(image, height, width, scope=None): 275 | """Prepare one image for evaluation. 276 | 277 | Args: 278 | image: 3-D float Tensor 279 | height: integer 280 | width: integer 281 | scope: Optional scope for op_scope. 282 | Returns: 283 | 3-D float Tensor of prepared image. 284 | """ 285 | with tf.name_scope(values=[image, height, width], name=scope, default_name='eval_image'): 286 | # Crop the central region of the image with an area containing 87.5% of 287 | # the original image. 288 | image = tf.image.central_crop(image, central_fraction=0.875) 289 | 290 | # Resize the image to the original height and width. 291 | image = tf.expand_dims(image, 0) 292 | image = tf.image.resize_bilinear(image, [height, width], 293 | align_corners=False) 294 | image = tf.squeeze(image, [0]) 295 | return image 296 | 297 | 298 | def image_preprocessing(image_buffer, bbox, train, thread_id=0): 299 | """Decode and preprocess one image for evaluation or training. 300 | 301 | Args: 302 | image_buffer: JPEG encoded string Tensor 303 | bbox: 3-D float Tensor of bounding boxes arranged [1, num_boxes, coords] 304 | where each coordinate is [0, 1) and the coordinates are arranged as 305 | [ymin, xmin, ymax, xmax]. 306 | train: boolean 307 | thread_id: integer indicating preprocessing thread 308 | 309 | Returns: 310 | 3-D float Tensor containing an appropriately scaled image 311 | 312 | Raises: 313 | ValueError: if user does not provide bounding box 314 | """ 315 | if bbox is None: 316 | raise ValueError('Please supply a bounding box.') 317 | 318 | image = decode_jpeg(image_buffer) 319 | height = FLAGS.image_size 320 | width = FLAGS.image_size 321 | 322 | if train: 323 | image = distort_image(image, height, width, bbox, thread_id) 324 | else: 325 | image = eval_image(image, height, width) 326 | 327 | # Finally, rescale to [-1,1] instead of [0, 1) 328 | image = tf.subtract(image, 0.5) 329 | image = tf.multiply(image, 2.0) 330 | return image 331 | 332 | 333 | def parse_example_proto(example_serialized): 334 | """Parses an Example proto containing a training example of an image. 335 | 336 | The output of the build_image_data.py image preprocessing script is a dataset 337 | containing serialized Example protocol buffers. 
Each Example proto contains 338 | the following fields: 339 | 340 | image/height: 462 341 | image/width: 581 342 | image/colorspace: 'RGB' 343 | image/channels: 3 344 | image/class/label: 615 345 | image/class/synset: 'n03623198' 346 | image/class/text: 'knee pad' 347 | image/object/bbox/xmin: 0.1 348 | image/object/bbox/xmax: 0.9 349 | image/object/bbox/ymin: 0.2 350 | image/object/bbox/ymax: 0.6 351 | image/object/bbox/label: 615 352 | image/format: 'JPEG' 353 | image/filename: 'ILSVRC2012_val_00041207.JPEG' 354 | image/encoded: 355 | 356 | Args: 357 | example_serialized: scalar Tensor tf.string containing a serialized 358 | Example protocol buffer. 359 | 360 | Returns: 361 | image_buffer: Tensor tf.string containing the contents of a JPEG file. 362 | label: Tensor tf.int32 containing the label. 363 | bbox: 3-D float Tensor of bounding boxes arranged [1, num_boxes, coords] 364 | where each coordinate is [0, 1) and the coordinates are arranged as 365 | [ymin, xmin, ymax, xmax]. 366 | text: Tensor tf.string containing the human-readable label. 367 | """ 368 | # Dense features in Example proto. 369 | feature_map = { 370 | 'image/encoded': tf.FixedLenFeature([], dtype=tf.string, 371 | default_value=''), 372 | 'image/class/label': tf.FixedLenFeature([1], dtype=tf.int64, 373 | default_value=-1), 374 | 'image/class/text': tf.FixedLenFeature([], dtype=tf.string, 375 | default_value=''), 376 | } 377 | sparse_float32 = tf.VarLenFeature(dtype=tf.float32) 378 | # Sparse features in Example proto. 379 | feature_map.update( 380 | {k: sparse_float32 for k in ['image/object/bbox/xmin', 381 | 'image/object/bbox/ymin', 382 | 'image/object/bbox/xmax', 383 | 'image/object/bbox/ymax']}) 384 | 385 | features = tf.parse_single_example(example_serialized, feature_map) 386 | label = tf.cast(features['image/class/label'], dtype=tf.int32) 387 | 388 | xmin = tf.expand_dims(features['image/object/bbox/xmin'].values, 0) 389 | ymin = tf.expand_dims(features['image/object/bbox/ymin'].values, 0) 390 | xmax = tf.expand_dims(features['image/object/bbox/xmax'].values, 0) 391 | ymax = tf.expand_dims(features['image/object/bbox/ymax'].values, 0) 392 | 393 | # Note that we impose an ordering of (y, x) to match TensorFlow's bounding-box convention. 394 | bbox = tf.concat(axis=0, values=[ymin, xmin, ymax, xmax]) 395 | 396 | # Force the variable number of bounding boxes into the shape 397 | # [1, num_boxes, coords]. 398 | bbox = tf.expand_dims(bbox, 0) 399 | bbox = tf.transpose(bbox, [0, 2, 1]) 400 | 401 | return features['image/encoded'], label, bbox, features['image/class/text'] 402 | 403 | 404 | def batch_inputs(dataset, batch_size, train, num_preprocess_threads=None): 405 | """Construct batches of training or evaluation examples from the image dataset. 406 | 407 | Args: 408 | dataset: instance of Dataset class specifying the dataset. 409 | See dataset.py for details. 410 | batch_size: integer 411 | train: boolean 412 | num_preprocess_threads: integer, total number of preprocessing threads 413 | 414 | Returns: 415 | images: 4-D float Tensor of a batch of images 416 | labels: 1-D integer Tensor of [batch_size].
417 | 418 | Raises: 419 | ValueError: if data is not found 420 | """ 421 | with tf.name_scope('batch_processing'): 422 | data_files = dataset.data_files() 423 | if data_files is None: 424 | raise ValueError('No data files found for this dataset') 425 | filename_queue = tf.train.string_input_producer(data_files, capacity=16) 426 | 427 | if num_preprocess_threads is None: 428 | num_preprocess_threads = FLAGS.num_preprocess_threads 429 | 430 | if num_preprocess_threads % 4: 431 | raise ValueError('Please make num_preprocess_threads a multiple ' 432 | 'of 4 (%d %% 4 != 0).' % num_preprocess_threads) 433 | # Create a subgraph with its own reader (but sharing the 434 | # filename_queue) for each preprocessing thread. 435 | images_and_labels = [] 436 | for thread_id in range(num_preprocess_threads): 437 | reader = dataset.reader() 438 | _, example_serialized = reader.read(filename_queue) 439 | 440 | # Parse a serialized Example proto to extract the image and metadata. 441 | image_buffer, label_index, bbox, _ = parse_example_proto( 442 | example_serialized) 443 | image = image_preprocessing(image_buffer, bbox, train, thread_id) 444 | images_and_labels.append([image, label_index]) 445 | 446 | # Approximate number of examples per shard. 447 | examples_per_shard = 1024 448 | # Size the random shuffle queue to balance between good global 449 | # mixing (more examples) and memory use (fewer examples). 450 | # 1 image uses 299*299*3*4 bytes = 1MB 451 | # The default input_queue_memory_factor is 16 implying a shuffling queue 452 | # size: examples_per_shard * 16 * 1MB = 17.6GB 453 | min_queue_examples = examples_per_shard * FLAGS.input_queue_memory_factor 454 | 455 | # Create a queue that produces the examples in batches after shuffling. 456 | if train: 457 | images, label_index_batch = tf.train.shuffle_batch_join( 458 | images_and_labels, 459 | batch_size=batch_size, 460 | capacity=min_queue_examples + 3 * batch_size, 461 | min_after_dequeue=min_queue_examples) 462 | else: 463 | images, label_index_batch = tf.train.batch_join( 464 | images_and_labels, 465 | batch_size=batch_size, 466 | capacity=min_queue_examples + 3 * batch_size) 467 | 468 | # Reshape images into these desired dimensions. 469 | height = FLAGS.image_size 470 | width = FLAGS.image_size 471 | depth = 3 472 | 473 | images = tf.cast(images, tf.float32) 474 | images = tf.reshape(images, shape=[batch_size, height, width, depth]) 475 | 476 | # Display the training images in the visualizer. 477 | tf.summary.image('images', images) 478 | 479 | return images, tf.reshape(label_index_batch, [batch_size]) 480 | -------------------------------------------------------------------------------- /fathom/seq2seq/seq2seq.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import math 3 | import random 4 | import sys 5 | import time 6 | 7 | import tensorflow as tf 8 | import numpy as np 9 | 10 | from fathom.nn import NeuralNetworkModel, default_runstep 11 | 12 | from . import data_utils 13 | 14 | class Seq2Seq(NeuralNetworkModel): 15 | """Based on TensorFlow example of sequence-to-sequence translation.""" 16 | def build_inputs(self): 17 | # Feeds for inputs. 18 | self.encoder_inputs = [] 19 | self.decoder_inputs = [] 20 | for i in range(self.buckets[-1][0]): # Last bucket is the biggest one.
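      # Placeholders are created for the largest bucket; model_with_buckets
      # (called later in build_inference) only wires up the first
      # encoder_size/decoder_size of them for each smaller bucket, and
      # step_feeds() feeds just that prefix at run time.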
21 | self.encoder_inputs.append(tf.placeholder(tf.int32, shape=[None], 22 | name="encoder{0}".format(i))) 23 | for i in range(self.buckets[-1][1] + 1): 24 | self.decoder_inputs.append(tf.placeholder(tf.int32, shape=[None], 25 | name="decoder{0}".format(i))) 26 | 27 | @property 28 | def inputs(self): 29 | return self.encoder_inputs, self.decoder_inputs 30 | 31 | @property 32 | def labels(self): 33 | return self.target_weights 34 | 35 | def build_labels(self): 36 | # Our targets are decoder inputs shifted by one. 37 | self.targets = [self.decoder_inputs[i + 1] 38 | for i in range(len(self.decoder_inputs) - 1)] 39 | 40 | self.target_weights = [] 41 | for i in range(self.buckets[-1][1] + 1): 42 | self.target_weights.append(tf.placeholder(tf.float32, shape=[None], 43 | name="weight{0}".format(i))) 44 | 45 | def build_evaluation(self): 46 | pass 47 | 48 | def build_inference(self, xs): 49 | with self.G.as_default(): 50 | # If we use sampled softmax, we need an output projection. 51 | output_projection = None 52 | softmax_loss_function = None 53 | # Sampled softmax only makes sense if we sample less than vocabulary size. 54 | num_samples = self.num_samples 55 | if num_samples > 0 and num_samples < self.target_vocab_size: 56 | w = tf.get_variable("proj_w", [self.size, self.target_vocab_size]) 57 | w_t = tf.transpose(w) 58 | b = tf.get_variable("proj_b", [self.target_vocab_size]) 59 | output_projection = (w, b) 60 | 61 | def sampled_loss(labels, logits): 62 | labels = tf.reshape(labels, [-1, 1]) 63 | # We need to compute the sampled_softmax_loss using 32bit floats to 64 | # avoid numerical instabilities. 65 | local_w_t = tf.cast(w_t, tf.float32) 66 | local_b = tf.cast(b, tf.float32) 67 | local_inputs = tf.cast(logits, tf.float32) 68 | return tf.nn.sampled_softmax_loss( 69 | weights=local_w_t, 70 | biases=local_b, 71 | labels=labels, 72 | inputs=local_inputs, 73 | num_sampled=num_samples, 74 | num_classes=self.target_vocab_size) 75 | softmax_loss_function = sampled_loss 76 | 77 | # Create the internal multi-layer cell for our RNN. 78 | def single_cell(): 79 | if self.use_lstm: 80 | return tf.contrib.rnn.BasicLSTMCell(self.size, reuse=tf.get_variable_scope().reuse) 81 | else: 82 | return tf.contrib.rnn.GRUCell(self.size, reuse=tf.get_variable_scope().reuse) 83 | 84 | # The seq2seq function: we use embedding for the input and attention. 85 | def seq2seq_f(encoder_inputs, decoder_inputs, do_decode): 86 | if self.num_layers > 1: 87 | cell = tf.contrib.rnn.MultiRNNCell([single_cell() for _ in range (self.num_layers)]) 88 | else: 89 | cell = single_cell() 90 | return tf.contrib.legacy_seq2seq.embedding_attention_seq2seq( 91 | encoder_inputs, decoder_inputs, cell, 92 | num_encoder_symbols=self.source_vocab_size, 93 | num_decoder_symbols=self.target_vocab_size, 94 | embedding_size=self.size, 95 | output_projection=output_projection, 96 | feed_previous=do_decode) 97 | 98 | # Training outputs and losses. 99 | if self.forward_only: 100 | self._outputs, self.losses = tf.contrib.legacy_seq2seq.model_with_buckets( 101 | self.encoder_inputs, self.decoder_inputs, self.targets, 102 | self.target_weights, self.buckets, lambda x, y: seq2seq_f(x, y, True), 103 | softmax_loss_function=softmax_loss_function) 104 | # If we use output projection, we need to project outputs for decoding. 
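        # proj_w has shape [size, target_vocab_size] and proj_b has shape
        # [target_vocab_size]; with sampled softmax the decoder outputs stay in
        # the smaller `size` space, so output * proj_w + proj_b maps them back
        # to full-vocabulary logits for decoding.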
105 | if output_projection is not None: 106 | for b in range(len(self.buckets)): 107 | self._outputs[b] = [ 108 | tf.matmul(output, output_projection[0]) + output_projection[1] 109 | for output in self._outputs[b] 110 | ] 111 | else: 112 | self._outputs, self.losses = tf.contrib.legacy_seq2seq.model_with_buckets( 113 | self.encoder_inputs, self.decoder_inputs, self.targets, 114 | self.target_weights, self.buckets, 115 | lambda x, y: seq2seq_f(x, y, False), 116 | softmax_loss_function=softmax_loss_function) 117 | 118 | return self._outputs 119 | 120 | @property 121 | def loss(self): 122 | return self.losses 123 | 124 | @property 125 | def train(self): 126 | return self.updates 127 | 128 | def build_loss(self, logits, labels): 129 | with self.G.as_default(): 130 | # TODO: how to handle this in seq2seq? refactoring needed 131 | self.loss_op = self.losses 132 | return self.losses 133 | 134 | def build_train(self, losses): 135 | # TODO: modify total_loss to handle buckets 136 | self.updates = None 137 | with self.G.as_default(): 138 | # Gradients and SGD update operation for training the model. 139 | params = tf.trainable_variables() 140 | if not self.forward_only: 141 | self.gradient_norms = [] 142 | self.updates = [] 143 | self.opt = tf.train.GradientDescentOptimizer(self.learning_rate) 144 | for b in range(len(self.buckets)): 145 | gradients = tf.gradients(self.losses[b], params) 146 | clipped_gradients, norm = tf.clip_by_global_norm(gradients, 147 | self.max_gradient_norm) 148 | self.gradient_norms.append(norm) 149 | self.updates.append(self.opt.apply_gradients( 150 | list(zip(clipped_gradients, params)), global_step=self.global_step)) 151 | 152 | return self.updates # note: this is per-bucket 153 | 154 | def load_data(self): 155 | # TODO: make configurable 156 | self.data_dir = "/data/WMT15/" 157 | 158 | print("Preparing WMT data in %s" % self.data_dir) 159 | en_train, fr_train, en_dev, fr_dev, _, _ = data_utils.prepare_wmt_data( 160 | self.data_dir, self.en_vocab_size, self.fr_vocab_size) 161 | 162 | # Read data into buckets and compute their sizes. 163 | print("Reading development and training data (limit: %d)." 164 | % self.max_train_data_size) 165 | self.dev_set = self.read_data(en_dev, fr_dev) 166 | self.train_set = self.read_data(en_train, fr_train, self.max_train_data_size) 167 | train_bucket_sizes = [len(self.train_set[b]) for b in range(len(self._buckets))] 168 | train_total_size = float(sum(train_bucket_sizes)) 169 | 170 | # A bucket scale is a list of increasing numbers from 0 to 1 that we'll use 171 | # to select a bucket. The length of [scale[i], scale[i+1]] is proportional to 172 | # the size of the i-th training bucket, as used later. 173 | self.train_buckets_scale = [sum(train_bucket_sizes[:i + 1]) / train_total_size 174 | for i in range(len(train_bucket_sizes))] 175 | 176 | def read_data(self, source_path, target_path, max_size=None): 177 | """Read data from source and target files and put into buckets. 178 | 179 | Args: 180 | source_path: path to the files with token-ids for the source language. 181 | target_path: path to the file with token-ids for the target language; 182 | it must be aligned with the source file: n-th line contains the desired 183 | output for n-th line from the source_path. 184 | max_size: maximum number of lines to read, all others will be ignored; 185 | if 0 or None, data files will be read completely (no limit).
186 | 187 | Returns: 188 | data_set: a list of length len(_buckets); data_set[n] contains a list of 189 | (source, target) pairs read from the provided data files that fit 190 | into the n-th bucket, i.e., such that len(source) < _buckets[n][0] and 191 | len(target) < _buckets[n][1]; source and target are lists of token-ids. 192 | """ 193 | data_set = [[] for _ in self._buckets] 194 | with tf.gfile.GFile(source_path, mode="r") as source_file: 195 | with tf.gfile.GFile(target_path, mode="r") as target_file: 196 | source, target = source_file.readline(), target_file.readline() 197 | counter = 0 198 | while source and target and (not max_size or counter < max_size): 199 | counter += 1 200 | if counter % 100000 == 0: 201 | print(" reading data line %d" % counter) 202 | sys.stdout.flush() 203 | source_ids = [int(x) for x in source.split()] 204 | target_ids = [int(x) for x in target.split()] 205 | target_ids.append(data_utils.EOS_ID) 206 | for bucket_id, (source_size, target_size) in enumerate(self._buckets): 207 | if len(source_ids) < source_size and len(target_ids) < target_size: 208 | data_set[bucket_id].append([source_ids, target_ids]) 209 | break 210 | source, target = source_file.readline(), target_file.readline() 211 | return data_set 212 | 213 | @property 214 | def outputs(self): 215 | return self._outputs 216 | 217 | def build_hyperparameters(self): 218 | # data-specific 219 | self.en_vocab_size = 40000 220 | self.fr_vocab_size = 40000 221 | self.max_train_data_size = 1 # 0 is no limit 222 | 223 | # We use a number of buckets and pad to the closest one for efficiency. 224 | # See seq2seq_model.Seq2SeqModel for details of how they work. 225 | self._buckets = [(5, 10), (10, 15), (20, 25), (40, 50)] 226 | # Parameters 227 | self.source_vocab_size = self.en_vocab_size 228 | self.target_vocab_size = self.fr_vocab_size 229 | self.buckets = self._buckets # FIXME: better bucket names 230 | self.num_samples = 512 231 | self.size = 256 232 | self.num_layers = 3 233 | self.use_lstm = True # else GRU 234 | 235 | self.batch_size = 64 236 | if self.init_options: 237 | self.batch_size = self.init_options.get('batch_size', self.batch_size) 238 | 239 | self.display_step = 1 240 | self.global_step = tf.Variable(0, trainable=False) 241 | if not self.forward_only: 242 | self.learning_rate = tf.Variable(0.5, trainable=False) 243 | self.learning_rate_decay_factor = 0.99 244 | self.learning_rate_decay_op = self.learning_rate.assign(self.learning_rate * self.learning_rate_decay_factor) 245 | self.max_gradient_norm = 5.0 246 | 247 | def run(self, runstep=None, n_steps=1): 248 | # Grab the dataset from the internet, if necessary 249 | self.load_data() 250 | 251 | # This is the training loop. 252 | step_time, loss = 0.0, 0.0 253 | current_step = 0 254 | previous_losses = [] 255 | while True: 256 | if current_step >= n_steps: 257 | return 258 | # Choose a bucket according to data distribution. We pick a random number 259 | # in [0, 1] and use the corresponding interval in train_buckets_scale. 260 | random_number_01 = np.random.random_sample() 261 | bucket_id = min([i for i in range(len(self.train_buckets_scale)) 262 | if self.train_buckets_scale[i] > random_number_01]) 263 | 264 | # Get a batch and make a step. 
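      # get_batch() returns batch-major int32/float32 vectors padded to this
      # bucket's (encoder_size, decoder_size); step_feeds() packs them into a
      # feed dictionary plus the fetches (updates, gradient norm, and loss when
      # training; loss and output logits when forward-only) for runstep().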
265 | start_time = time.time() 266 | encoder_inputs, decoder_inputs, target_weights = self.get_batch( 267 | self.train_set, bucket_id) 268 | output_feeds, input_feeds = self.step_feeds(encoder_inputs, decoder_inputs, 269 | target_weights, bucket_id, self.forward_only) 270 | 271 | outputs = runstep( 272 | self.session, 273 | output_feeds, 274 | input_feeds, 275 | #options=run_options, run_metadata=values 276 | ) 277 | 278 | # TODO: do this in a runstep 279 | if not self.forward_only: 280 | _, step_loss, _ = outputs[1], outputs[2], None # Gradient norm, loss, no outputs. 281 | else: 282 | _, step_loss, _ = None, outputs[0], outputs[1:] # No gradient norm, loss, outputs. 283 | 284 | step_time += (time.time() - start_time) / self.display_step 285 | loss += step_loss / self.display_step 286 | current_step += 1 287 | 288 | if not self.forward_only: 289 | # Once in a while, we save checkpoint, print statistics, and run evals. 290 | if current_step % self.display_step == 0: 291 | # Print statistics for the previous epoch. 292 | perplexity = math.exp(loss) if loss < 300 else float('inf') 293 | with self.session.as_default(): 294 | print("global step %d learning rate %.4f step-time %.2f perplexity " 295 | "%.2f" % (self.global_step.eval(), self.learning_rate.eval(), 296 | step_time, perplexity)) 297 | # Decrease learning rate if no improvement was seen over last 3 times. 298 | if len(previous_losses) > 2 and loss > max(previous_losses[-3:]): 299 | self.session.run(self.learning_rate_decay_op) 300 | previous_losses.append(loss) 301 | # Save checkpoint and zero timer and loss. 302 | #checkpoint_path = os.path.join(self.train_dir, "translate.ckpt") 303 | #self.saver.save(sess, checkpoint_path, global_step=self.global_step) 304 | step_time, loss = 0.0, 0.0 305 | # Run evals on development set and print their perplexity. 306 | for bucket_id in range(len(self._buckets)): 307 | if len(self.dev_set[bucket_id]) == 0: 308 | print(" eval: empty bucket %d" % (bucket_id)) 309 | continue 310 | encoder_inputs, decoder_inputs, target_weights = self.get_batch( 311 | self.dev_set, bucket_id) 312 | output_feeds, input_feeds = self.step_feeds(encoder_inputs, decoder_inputs, 313 | target_weights, bucket_id, True) 314 | 315 | outputs = self.session.run( 316 | output_feeds, 317 | input_feeds, 318 | #options=run_options, run_metadata=values 319 | ) 320 | 321 | # TODO: do this in a runstep 322 | if not self.forward_only: 323 | _, eval_loss, _ = outputs[1], outputs[2], None # Gradient norm, loss, no outputs. 324 | 325 | if False: # FIXME: remove this temporary 326 | eval_ppx = math.exp(eval_loss) if eval_loss < 300 else float('inf') 327 | print(" eval: bucket %d perplexity %.2f" % (bucket_id, eval_ppx)) 328 | else: 329 | _, eval_loss, _ = None, outputs[0], outputs[1:] # No gradient norm, loss, outputs. 330 | 331 | sys.stdout.flush() 332 | 333 | def step_feeds(self, encoder_inputs, decoder_inputs, target_weights, 334 | bucket_id, forward_only): 335 | """Construct feeds for given inputs. 336 | 337 | Args: 338 | encoder_inputs: list of numpy int vectors to feed as encoder inputs. 339 | decoder_inputs: list of numpy int vectors to feed as decoder inputs. 340 | target_weights: list of numpy float vectors to feed as target weights. 341 | bucket_id: which bucket of the model to use. 342 | forward_only: whether to do the backward step or only forward. 343 | 344 | Returns: 345 | A pair (output_feed, input_feed): the fetches to run and the feed 346 | dictionary to pass to the session for the chosen bucket.
347 | 348 | Raises: 349 | ValueError: if length of encoder_inputs, decoder_inputs, or 350 | target_weights disagrees with bucket size for the specified bucket_id. 351 | """ 352 | # Check if the sizes match. 353 | encoder_size, decoder_size = self.buckets[bucket_id] 354 | if len(encoder_inputs) != encoder_size: 355 | raise ValueError("Encoder length must be equal to the one in bucket," 356 | " %d != %d." % (len(encoder_inputs), encoder_size)) 357 | if len(decoder_inputs) != decoder_size: 358 | raise ValueError("Decoder length must be equal to the one in bucket," 359 | " %d != %d." % (len(decoder_inputs), decoder_size)) 360 | if len(target_weights) != decoder_size: 361 | raise ValueError("Weights length must be equal to the one in bucket," 362 | " %d != %d." % (len(target_weights), decoder_size)) 363 | 364 | # Input feed: encoder inputs, decoder inputs, target_weights, as provided. 365 | input_feed = {} 366 | for l in range(encoder_size): 367 | input_feed[self.encoder_inputs[l].name] = encoder_inputs[l] 368 | #print("encoder", len(encoder_inputs[l]), self.encoder_inputs[l].get_shape()) 369 | for l in range(decoder_size): 370 | input_feed[self.decoder_inputs[l].name] = decoder_inputs[l] 371 | #print("decoder", len(decoder_inputs[l]), self.decoder_inputs[l].get_shape()) 372 | input_feed[self.target_weights[l].name] = target_weights[l] 373 | #print("target", len(target_weights[l]), self.target_weights[l].get_shape()) 374 | 375 | # Since our targets are decoder inputs shifted by one, we need one more. 376 | #last_target = self.decoder_inputs[decoder_size].name 377 | last_target = self.decoder_inputs[decoder_size] 378 | input_feed[last_target] = np.zeros([self.batch_size], dtype=np.int32) 379 | 380 | # Output feed: depends on whether we do a backward step or not. 381 | if not forward_only: 382 | output_feed = [self.updates[bucket_id], # Update Op that does SGD. 383 | self.gradient_norms[bucket_id], # Gradient norm. 384 | self.losses[bucket_id]] # Loss for this batch. 385 | else: 386 | output_feed = [self.losses[bucket_id]] # Loss for this batch. 387 | for l in range(decoder_size): # Output logits. 388 | output_feed.append(self._outputs[bucket_id][l]) 389 | 390 | return output_feed, input_feed 391 | 392 | def get_batch(self, data, bucket_id): 393 | """Get a random batch of data from the specified bucket, prepare for step. 394 | 395 | To feed data in step(..) it must be a list of batch-major vectors, while 396 | data here contains single length-major cases. So the main logic of this 397 | function is to re-index data cases to be in the proper format for feeding. 398 | 399 | Args: 400 | data: a tuple of size len(self.buckets) in which each element contains 401 | lists of pairs of input and output data that we use to create a batch. 402 | bucket_id: integer, which bucket to get the batch for. 403 | 404 | Returns: 405 | The triple (encoder_inputs, decoder_inputs, target_weights) for 406 | the constructed batch that has the proper format to call step(...) later. 407 | """ 408 | encoder_size, decoder_size = self.buckets[bucket_id] 409 | encoder_inputs, decoder_inputs = [], [] 410 | 411 | # Get a random batch of encoder and decoder inputs from data, 412 | # pad them if needed, reverse encoder inputs and add GO to decoder. 413 | for _ in range(self.batch_size): 414 | encoder_input, decoder_input = random.choice(data[bucket_id]) 415 | 416 | # Encoder inputs are padded and then reversed. 
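      # e.g. with encoder_size = 5, an input [4, 7, 9] is padded to
      # [4, 7, 9, PAD, PAD] and stored reversed as [PAD, PAD, 9, 7, 4];
      # reversing the source sequence is the usual seq2seq trick that keeps
      # early source tokens close to the start of decoding.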
417 | encoder_pad = [data_utils.PAD_ID] * (encoder_size - len(encoder_input)) 418 | encoder_inputs.append(list(reversed(encoder_input + encoder_pad))) 419 | 420 | # Decoder inputs get an extra "GO" symbol, and are then padded. 421 | decoder_pad_size = decoder_size - len(decoder_input) - 1 422 | decoder_inputs.append([data_utils.GO_ID] + decoder_input + 423 | [data_utils.PAD_ID] * decoder_pad_size) 424 | 425 | # Now we create batch-major vectors from the data selected above. 426 | batch_encoder_inputs, batch_decoder_inputs, batch_weights = [], [], [] 427 | 428 | # Batch encoder inputs are just re-indexed encoder_inputs. 429 | for length_idx in range(encoder_size): 430 | batch_encoder_inputs.append( 431 | np.array([encoder_inputs[batch_idx][length_idx] 432 | for batch_idx in range(self.batch_size)], dtype=np.int32)) 433 | 434 | # Batch decoder inputs are re-indexed decoder_inputs; we also create weights. 435 | for length_idx in range(decoder_size): 436 | batch_decoder_inputs.append( 437 | np.array([decoder_inputs[batch_idx][length_idx] 438 | for batch_idx in range(self.batch_size)], dtype=np.int32)) 439 | 440 | # Create target_weights to be 0 for targets that are padding. 441 | batch_weight = np.ones(self.batch_size, dtype=np.float32) 442 | for batch_idx in range(self.batch_size): 443 | # We set weight to 0 if the corresponding target is a PAD symbol. 444 | # The corresponding target is decoder_input shifted by 1 forward. 445 | if length_idx < decoder_size - 1: 446 | target = decoder_inputs[batch_idx][length_idx + 1] 447 | if length_idx == decoder_size - 1 or target == data_utils.PAD_ID: 448 | batch_weight[batch_idx] = 0.0 449 | batch_weights.append(batch_weight) 450 | return batch_encoder_inputs, batch_decoder_inputs, batch_weights 451 | 452 | class Seq2SeqFwd(Seq2Seq): 453 | forward_only = True 454 | 455 | if __name__=='__main__': 456 | m = Seq2Seq() 457 | m.setup() 458 | m.run(runstep=default_runstep, n_steps=10) 459 | m.teardown() 460 | 461 | --------------------------------------------------------------------------------
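A minimal driver sketch for the forward-only variant, analogous to the __main__ block above. It assumes the WMT data directory hard-coded in load_data() (/data/WMT15/) is already populated, and that NeuralNetworkModel's constructor accepts the init_options dict that build_hyperparameters() reads; the init_options keyword is an assumption, not something shown in this file.

#!/usr/bin/env python
# Sketch only: run the inference-only Seq2SeqFwd model for a few steps.
from fathom.nn import default_runstep
from fathom.seq2seq import Seq2SeqFwd

# init_options is assumed to be a constructor keyword; build_hyperparameters()
# reads init_options.get('batch_size', ...) when it is set.
m = Seq2SeqFwd(init_options={'batch_size': 32})
m.setup()
m.run(runstep=default_runstep, n_steps=5)  # forward_only=True: no SGD updates are fetched
m.teardown()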