3 |
4 | RUN apt-get update
5 |
6 | ### Software required for Fathom
7 | RUN apt-get install -y python-scipy
8 | RUN pip install scikit-learn
9 | RUN pip install librosa
10 | RUN apt-get install -y libhdf5-dev
11 | RUN pip install h5py
12 |
13 | # ALE
14 | RUN apt-get install -y libsdl1.2-dev libsdl-gfx1.2-dev libsdl-image1.2-dev cmake
15 | RUN git clone https://github.com/mgbellemare/Arcade-Learning-Environment.git /tmp/ALE
16 | RUN mkdir /tmp/build && cd /tmp/build && \
17 | cmake -DUSE_SDL=ON -DUSE_RLGLUE=OFF /tmp/ALE && make
18 | RUN cd /tmp/ALE && pip install .
19 | # OpenCV
20 | RUN apt-get install -y libopencv-dev python-opencv
21 |
22 | ### Create a Fathom working environment
23 | RUN mkdir /data
24 | RUN useradd -ms /bin/bash fathom
25 | RUN chown fathom /data
26 | RUN chmod a+rwx /data
27 | USER fathom
28 | WORKDIR /home/fathom
29 | RUN git clone https://github.com/rdadolf/fathom.git
30 | ENV PYTHONPATH /home/fathom/fathom
31 |
32 |
--------------------------------------------------------------------------------
/fathom/speech/phoneme.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | """
4 | Python representation of 61 TIMIT phonemes from DOC/PHONCODE.DOC.
5 | """
6 |
7 | # list of 61 TIMIT phonemes from phoncode.doc
8 | timit_phonemes = [
9 | # stop (and corresponding closures)
10 | 'b', 'bcl',
11 | 'd', 'dcl',
12 | 'g', 'gcl',
13 | 'p', 'pcl',
14 |     't', 'tcl',    # NOTE: listed as "tck" (a typo) in the TIMIT docs
15 | 'k', 'kcl',
16 | 'dx',
17 | 'q',
18 |
19 | # affricates
20 | 'jh',
21 | 'ch',
22 |
23 | # fricatives
24 | 's',
25 | 'sh',
26 | 'z',
27 | 'zh',
28 | 'f',
29 | 'th',
30 | 'v',
31 | 'dh',
32 |
33 | # nasals
34 | 'm',
35 | 'n',
36 | 'ng',
37 | 'em',
38 | 'en',
39 | 'eng',
40 | 'nx',
41 |
42 | # semivowels and glides
43 | 'l',
44 | 'r',
45 | 'w',
46 | 'y',
47 | 'hh',
48 | 'hv',
49 | 'el',
50 |
51 | # vowels
52 | 'iy',
53 | 'ih',
54 | 'eh',
55 | 'ey',
56 | 'ae',
57 | 'aa',
58 | 'aw',
59 | 'ay',
60 | 'ah',
61 | 'ao',
62 | 'oy',
63 | 'ow',
64 | 'uh',
65 | 'uw',
66 | 'ux',
67 | 'er',
68 | 'ax',
69 | 'ix',
70 | 'axr',
71 | 'ax-h',
72 |
73 | # others
74 | 'pau',
75 | 'epi',
76 | 'h#',
77 |
78 | # lexicon-only (thus omitted from transcriptions)
79 | #'1',
80 | #'2',
81 | ]
82 |
83 | # map phoneme to index
84 | phoneme2index_list = [(phoneme, index) for index, phoneme in enumerate(timit_phonemes)]
85 | phoneme2index_dict = dict(phoneme2index_list)
86 |
87 | index2phoneme_list = [(index, phoneme) for index, phoneme in enumerate(timit_phonemes)]
88 | index2phoneme_dict = dict(index2phoneme_list)
89 |
90 |
--------------------------------------------------------------------------------
/docs/assets/fathom.css:
--------------------------------------------------------------------------------
1 | /* Fathom uses a light Open Sans for body and Gruppo for styled titles. */
2 | @import 'https://fonts.googleapis.com/css?family=Gruppo|Open+Sans';
3 | body, h1, h2, h3, h4, h5, h6, legend {
4 | font-family: 'Open Sans', 'Helvetica Neue', 'Helvetica';
5 | font-weight: 300;
6 | }
7 | code {
8 | font-size: 95%;
9 | padding-top: 4px;
10 | }
11 | /* Style all links in Fathom blue. */
12 | a, a:visited {
13 | color: #007da5;
14 | }
15 | /* Make the large headers more visible */
16 | h1 {
17 | padding-bottom: 5px;
18 | /*border-left: solid #007da5 1px;*/
19 | border-bottom: solid #007da5 1px;
20 | }
21 |
22 | /***** Navigation Header *****/
23 | /* Remove the house icon near the title. */
24 | a.icon-home:before { content: ''; }
25 | /* Give the title a stylized look. */
26 | a.icon-home { /* Desktop */
27 | color: #fcfcfc;
28 | font-family: 'Gruppo';
29 | font-size: 30pt;
30 | font-weight: 400;
31 | margin-bottom: 5px;
32 | }
33 | .wy-nav-top a { /* Mobile */
34 | color: #fcfcfc;
35 | font-family: 'Gruppo';
36 | font-size: 24pt;
37 | font-weight: 400;
38 | margin-bottom: 5px;
39 | }
40 |
41 | /* Add water as a background on desktop, just Fathom blue on mobile. */
42 | .wy-side-nav-search { /* Desktop */
43 | background: url(water-header.png) #007da5;
44 | padding: 5px;
45 | }
46 | .wy-nav-top {
47 | background-color: #007da5;
48 | }
49 |
50 | /***** Navigation Body *****/
51 | .wy-menu-vertical a {
52 | color: #b3b3b3; /* Back to the default */
53 | }
54 | .wy-menu-vertical li.toctree-l3 a {
55 | color: #404040; /* Highlight the difference between h1's and h*'s */
56 | }
57 |
--------------------------------------------------------------------------------
/docs/models.md:
--------------------------------------------------------------------------------
1 | # Seq2Seq
2 | *Direct language-to-language sentence translation. State-of-the-art accuracy with a simple, language-agnostic architecture.*
3 |
4 | *Documentation in progress*
5 |
6 | # MemNet
7 | *Facebook's memory-oriented neural system. One of two novel architectures which explore a topology beyond feed-forward lattices of neurons.*
8 |
9 | *Documentation in progress*
10 |
11 | # Speech
12 | *Baidu's speech recognition engine. Proved purely deep-learned networks can beat hand-tuned systems.*
13 |
14 | *Documentation in progress*
15 |
16 | # Autoenc
17 | *Variational autoencoder. An efficient, generative model for feature learning.*
18 |
19 | *Documentation in progress*
20 |
21 | # Residual
22 | *Image classifier from Microsoft Research Asia. Dramatically increased the practical depth of convolutional networks. ILSVRC 2015 winner.*
23 |
24 | *Documentation in progress*
25 |
26 | # VGG
27 | *Image classifier demonstrating the power of small convolutional filters. ILSVRC 2014 winner.*
28 |
29 | *Documentation in progress*
30 |
31 | # AlexNet
32 | *Image classifier. Watershed for deep learning by beating hand-tuned image systems at ILSVRC 2012.*
33 |
34 | *Documentation in progress*
35 |
36 | # DeepQ
37 | *Atari-playing neural network from DeepMind. Achieves superhuman performance on a majority of Atari 2600 games, without any preconceptions.*
38 |
39 | *Documentation in progress*
40 |
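# Running a model
Every workload follows the same construct/setup/run/teardown interface from `fathom/nn.py`, and each also exports a forward-only (inference) variant with a `Fwd` suffix. A minimal sketch, with an arbitrary model and batch size:

```python
from fathom import Residual, ResidualFwd  # every model also exports a forward-only *Fwd class
from fathom.nn import default_runstep

# The convnets accept an optional batch size via init_options.
m = ResidualFwd(init_options={'batch_size': 16})
m.setup()                                  # create a session and start the input queue runners
m.run(runstep=default_runstep, n_steps=5)
m.teardown()                               # stop the queue runners and close the session
```
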
--------------------------------------------------------------------------------
/test/test_basics.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | class TestBasics(unittest.TestCase):
4 | def test_import(self):
5 | modelnames = ['Speech','DeepQ','Seq2Seq','Autoenc','MemNet','Residual','VGG','AlexNet']
6 | import fathom
7 | for modelname in modelnames:
8 | assert hasattr(fathom,modelname), 'No model named "'+str(modelname)+'" found in fathom module.'
9 | for modelname in modelnames:
10 | modelname += 'Fwd'
11 | assert hasattr(fathom,modelname), 'No model named "'+str(modelname)+'" found in fathom module.'
12 |
13 | # FIXME: ALE load failure causes testing to abort.
14 |   @unittest.skip("ALE load failure causes testing to abort")
15 | def test_create_deepq(self):
16 | from fathom import DeepQ, DeepQFwd
17 | model = DeepQ()
18 | model = DeepQFwd()
19 |
20 | def test_create_speech(self):
21 | from fathom import Speech, SpeechFwd
22 | model = Speech()
23 | model = SpeechFwd()
24 |
25 | def test_create_seq2seq(self):
26 | from fathom import Seq2Seq, Seq2SeqFwd
27 | model = Seq2Seq()
28 | model = Seq2SeqFwd()
29 |
30 | def test_create_autoenc(self):
31 | from fathom import Autoenc, AutoencFwd
32 | model = Autoenc()
33 | model = AutoencFwd()
34 |
35 | def test_create_memnet(self):
36 | from fathom import MemNet, MemNetFwd
37 | model = MemNet()
38 | model = MemNetFwd()
39 |
40 | def test_create_residual(self):
41 | from fathom import Residual, ResidualFwd
42 | model = Residual()
43 | model = ResidualFwd()
44 |
45 | def test_create_vgg(self):
46 | from fathom import VGG, VGGFwd
47 | model = VGG()
48 | model = VGGFwd()
49 |
50 | def test_create_alexnet(self):
51 | from fathom import AlexNet, AlexNetFwd
52 | model = AlexNet()
53 | model = AlexNetFwd()
54 |
55 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import find_packages, setup
2 |
3 | # The hyphens in release candidates (RCs) will automatically be normalized.
4 | # But we normalize below manually anyway.
5 | _VERSION = '1.0-rc0'
6 |
7 | # TODO: Add version numbers.
8 | REQUIRED_PACKAGES = [
9 | 'scipy',
10 | 'tensorflow >= 1.0.0',
11 | 'scikit-learn',
12 | 'librosa', # audio preprocessing
13 | 'h5py'
14 | ]
15 |
16 | setup(name='Fathom-Workloads', # "fathom" is already taken on PyPI
17 | description='Reference workloads for modern deep learning',
18 | url='http://github.com/rdadolf/fathom',
19 |
20 | version=_VERSION.replace('-', ''),
21 |
22 | # Authors: Robert Adolf, Saketh Rama, and Brandon Reagen
23 | # PyPI does not have an easy way to specify multiple authors.
24 | author="Saketh Rama",
25 | author_email="rama@seas.harvard.edu",
26 |
27 | # We don't use __file__, but mark False to be safe.
28 | zip_safe=False,
29 |
30 | python_requires='>3.5',
31 |
32 | classifiers=[
33 | 'Development Status :: 4 - Beta',
34 | 'Intended Audience :: Developers',
35 | 'Intended Audience :: Education',
36 | 'Intended Audience :: Science/Research',
37 |
38 | 'Topic :: Scientific/Engineering :: Artificial Intelligence',
39 | 'Topic :: Scientific/Engineering :: Image Recognition',
40 | 'Topic :: System :: Hardware',
41 | ],
42 |
43 | packages=find_packages(), # find packages in subdirectories
44 |
45 | package_data={'fathom': [
46 | 'fathom.png',
47 |
48 | 'Dockerfile',
49 | 'pylintrc',
50 |
51 | 'README.md',
52 | 'mkdocs.yml',
53 |
54 | 'runtest.sh',
55 |
56 | 'setup.cfg',
57 | ]},
58 | include_package_data=True,
59 | )
60 |
61 |
--------------------------------------------------------------------------------
/docs/faq.md:
--------------------------------------------------------------------------------
1 | # Functions are missing from `cv2`
2 |
3 | You've probably installed the wrong Python library. Unfortunately, the `cv2` package on PyPI is not related to OpenCV at all; it's a name-squatted, empty package. There are a few ways to install the real OpenCV bindings:
4 |
5 | 1. Install from source by following the directions on the [OpenCV website](http://docs.opencv.org/2.4.13/doc/tutorials/introduction/linux_install/linux_install.html#linux-installation).
6 | 2. Install via apt: `sudo apt-get install python-opencv`.
7 | 3. Install using Anaconda: `conda install opencv`.
8 |
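A quick way to check which `cv2` you actually have (a minimal sanity check, assuming a standard OpenCV Python binding):

```python
import cv2

# A genuine OpenCV binding reports a version and exposes the usual API;
# the name-squatted PyPI package does neither.
print(cv2.__version__)
print(hasattr(cv2, 'imread'))  # should print True
```
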
9 | # DeepQ can't find a ROM, but it's right there!
10 |
11 | [DeepQ](/models/#deepq) currently looks for its ROMs relative to Fathom's root directory.
12 | This is a bit hacky, and it will cause problems if you run it from anywhere else, regardless of whether you're using Fathom from the command line or as a module.
13 | We're planning on fixing this eventually, but in the meantime, there are two solutions:
14 |
15 | 1. Run from the Fathom root directory.
16 |
17 | This should work:
18 | ```sh
19 | $ git clone https://github.com/rdadolf/fathom.git
20 | $ cd fathom
21 | $ export PYTHONPATH=`pwd`
22 |     $ ./fathom/<model>/<model>.py
23 | ```
24 |
25 | But this won't:
26 | ```sh
27 | $ git clone https://github.com/rdadolf/fathom.git /tmp/fathom
28 | $ export PYTHONPATH=/tmp/fathom
29 |     $ /tmp/fathom/fathom/<model>/<model>.py
30 | ```
31 |
32 | 2. Edit [DeepQ](/models/#deepq) to point to an absolute path.
33 |
34 | The `ROM_PATH` variable in [emulator.py](https://github.com/rdadolf/fathom/blob/master/fathom/deepq/emulator.py) tells the model where to search for a ROM.
35 |     If you replace it with an absolute path to the ROM directory, you should be able to run the model from anywhere.
36 | For instance, this should work:
37 |
38 | ```sh
39 | $ git clone https://github.com/rdadolf/fathom.git /tmp/fathom
40 | ```
41 |
42 | ```python
43 | # in /tmp/fathom/fathom/deepq/emulator.py:
44 | ROM_PATH='/tmp/fathom/fathom/deepq/roms/'
45 | ```
46 |
47 | ```sh
48 | $ export PYTHONPATH=/tmp/fathom
49 | $ python /tmp/fathom/fathom/deepq/deepq.py
50 | ```
51 |
52 | # I found an issue with the Speech model!
53 |
54 | We know. Our implementation needs significant improvement, but we have not yet had the time to undertake it. Bug reports and suggestions via GitHub issues are welcome.
55 |
--------------------------------------------------------------------------------
/fathom/deepq/emulator.py:
--------------------------------------------------------------------------------
1 | # NOTE: Tejas Kulkarni's implementation
2 | import sys
3 | import time
4 | import os.path
5 |
6 | import numpy as np
7 | from ale_python_interface import ALEInterface
8 | import cv2
9 |
10 | ROM_PATH = 'fathom/deepq/roms/'
11 |
12 | class emulator(object):
13 | def __init__(self, rom_name, vis,frameskip=1,windowname='preview'):
14 | self.ale = ALEInterface()
15 |     self.max_frames_per_episode = self.ale.getInt("max_num_frames_per_episode")
16 | self.ale.setInt("random_seed",123)
17 | self.ale.setInt("frame_skip",frameskip)
18 | romfile = str(ROM_PATH)+str(rom_name)
19 | if not os.path.exists(romfile):
20 |       print('No ROM file found at "'+romfile+'".\nAdjust ROM_PATH or double-check the file exists.')
21 | self.ale.loadROM(romfile)
22 | self.legal_actions = self.ale.getMinimalActionSet()
23 | self.action_map = dict()
24 | self.windowname = windowname
25 | for i in range(len(self.legal_actions)):
26 | self.action_map[self.legal_actions[i]] = i
27 |
28 | # print(self.legal_actions)
29 | self.screen_width,self.screen_height = self.ale.getScreenDims()
30 | print("width/height: " +str(self.screen_width) + "/" + str(self.screen_height))
31 | self.vis = vis
32 | if vis:
33 | cv2.startWindowThread()
34 |       cv2.namedWindow(self.windowname, flags=cv2.WINDOW_AUTOSIZE)  # auto-size the window to the frame (not user-resizable)
35 |
36 | def get_image(self):
37 | numpy_surface = np.zeros(self.screen_height*self.screen_width*3, dtype=np.uint8)
38 | self.ale.getScreenRGB(numpy_surface)
39 | image = np.reshape(numpy_surface, (self.screen_height, self.screen_width, 3))
40 | return image
41 |
42 | def newGame(self):
43 | self.ale.reset_game()
44 | return self.get_image()
45 |
46 | def next(self, action_indx):
47 | reward = self.ale.act(action_indx)
48 | nextstate = self.get_image()
49 | # scipy.misc.imsave('test.png',nextstate)
50 | if self.vis:
51 | cv2.imshow(self.windowname,nextstate)
52 | if sys.platform == 'darwin':
53 | # if we don't do this, can hang on OS X
54 | cv2.waitKey(2)
55 | return nextstate, reward, self.ale.game_over()
56 |
57 |
58 |
59 | if __name__ == "__main__":
60 | engine = emulator('breakout.bin',True)
61 | engine.next(0)
62 | time.sleep(5)
63 |
--------------------------------------------------------------------------------
/fathom/deepq/database.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | class database(object):
4 | def __init__(self, params):
5 | self.size = params['db_size']
6 | self.img_scale = params['img_scale']
7 | self.states = np.zeros([self.size,84,84],dtype='uint8') #image dimensions
8 | self.actions = np.zeros(self.size,dtype='float32')
9 | self.terminals = np.zeros(self.size,dtype='float32')
10 | self.rewards = np.zeros(self.size,dtype='float32')
11 | self.bat_size = params['batch']
12 | self.bat_s = np.zeros([self.bat_size,84,84,4])
13 | self.bat_a = np.zeros([self.bat_size])
14 | self.bat_t = np.zeros([self.bat_size])
15 | self.bat_n = np.zeros([self.bat_size,84,84,4])
16 | self.bat_r = np.zeros([self.bat_size])
17 |
18 | self.counter = 0 #keep track of next empty state
19 | self.flag = False
20 | return
21 |
22 | def get_batches(self):
23 | for i in range(self.bat_size):
24 | idx = 0
25 | while idx < 3 or (idx > self.counter-2 and idx < self.counter+3):
26 | idx = np.random.randint(3,self.get_size()-1)
27 | self.bat_s[i] = np.transpose(self.states[idx-3:idx+1,:,:],(1,2,0))/self.img_scale
28 | self.bat_n[i] = np.transpose(self.states[idx-2:idx+2,:,:],(1,2,0))/self.img_scale
29 | self.bat_a[i] = self.actions[idx]
30 | self.bat_t[i] = self.terminals[idx]
31 | self.bat_r[i] = self.rewards[idx]
32 | #self.bat_s[0] = np.transpose(self.states[10:14,:,:],(1,2,0))/self.img_scale
33 | #self.bat_n[0] = np.transpose(self.states[11:15,:,:],(1,2,0))/self.img_scale
34 | #self.bat_a[0] = self.actions[13]
35 | #self.bat_t[0] = self.terminals[13]
36 | #self.bat_r[0] = self.rewards[13]
37 |
38 | return self.bat_s,self.bat_a,self.bat_t,self.bat_n,self.bat_r
39 |
40 | def insert(self, prevstate_proc,reward,action,terminal):
41 | self.states[self.counter] = prevstate_proc
42 | self.rewards[self.counter] = reward
43 | self.actions[self.counter] = action
44 | self.terminals[self.counter] = terminal
45 | #update counter
46 | self.counter += 1
47 | if self.counter >= self.size:
48 | self.flag = True
49 | self.counter = 0
50 | return
51 |
52 | def get_size(self):
53 | if self.flag == False:
54 | return self.counter
55 | else:
56 | return self.size
57 |
58 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | 
2 |
3 | [](https://travis-ci.org/rdadolf/fathom)
4 | [](http://fathom.readthedocs.io/en/latest/)
5 |
6 | ## Release: [`1.0-rc0`](https://github.com/rdadolf/fathom/releases)
7 |
8 | This release reflects the state of Fathom more or less as it was for the paper published in September 2016. We are currently developing a somewhat more user-friendly version, which you can track in the GitHub issue tracker. If you're eager to use Fathom as it is, please let us know.
9 |
10 | ## Workloads
11 |
12 | This paper contains a description of the workloads, performance characteristics, and the rationale behind the project:
13 |
14 | > R. Adolf, S. Rama, B. Reagen, G.Y. Wei, D. Brooks. "Fathom: Reference Workloads for Modern Deep Learning Methods."
15 | [(Arxiv)](http://arxiv.org/abs/1608.06581)
16 | (DOI)
17 |
18 | Name | Description
19 | -------- | -----
20 | Seq2Seq | Direct language-to-language sentence translation. State-of-the-art accuracy with a simple, language-agnostic architecture.
21 | MemNet | Facebook's memory-oriented neural system. One of two novel architectures which explore a topology beyond feed-forward lattices of neurons.
22 | Speech | Baidu's speech recognition engine. Proved purely deep-learned networks can beat hand-tuned systems.
23 | Autoenc | Variational autoencoder. An efficient, generative model for feature learning.
24 | Residual | Image classifier from Microsoft Research Asia. Dramatically increased the practical depth of convolutional networks. ILSVRC 2015 winner.
25 | VGG | Image classifier demonstrating the power of small convolutional filters. ILSVRC 2014 winner.
26 | AlexNet | Image classifier. Watershed for deep learning by beating hand-tuned image systems at ILSVRC 2012.
27 | DeepQ    | Atari-playing neural network from DeepMind. Achieves superhuman performance on a majority of Atari 2600 games, without any preconceptions.
28 |
29 | ## Getting Started
30 |
31 | Read the [Fathom Quickstart Guide](http://fathom.readthedocs.io/en/latest/quickstart/) and let us know if you have any questions.
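
Every workload shares the interface defined in `fathom/nn.py`, so a first run looks roughly like this (a sketch: it assumes Fathom is on your `PYTHONPATH` and the model's dataset is available, as described in the Quickstart):

```python
from fathom import AlexNet            # or Speech, Seq2Seq, Autoenc, MemNet, Residual, VGG, DeepQ
from fathom.nn import default_runstep

m = AlexNet()    # build the TensorFlow graph
m.setup()        # create a session and start the input queue runners
m.run(runstep=default_runstep, n_steps=10)
m.teardown()     # stop the queue runners and close the session
```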
32 |
33 | Submit a GitHub issue if you have a suggestion or find a bug.
34 |
--------------------------------------------------------------------------------
/fathom/autoenc/variational.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import tensorflow as tf
4 |
5 | import numpy as np
6 | from fathom.nn import default_runstep
7 | from fathom.autoenc.autoenc import xavier_init, AutoencBase
8 |
9 | # heavily based on tensorflow.models.autoencoder
10 | class Autoenc(AutoencBase):
11 | """Variational Autoencoder."""
12 | def build_inference(self, inputs, transfer_function=tf.nn.softplus, scale=0.1):
13 | with self.G.as_default():
14 | self.transfer = transfer_function
15 |
16 | self.training_scale = scale
17 |
18 | network_weights = self._initialize_weights()
19 | self.weights = network_weights
20 |
21 | self.z_mean = tf.add(tf.matmul(inputs, self.weights['w1']), self.weights['b1'])
22 | self.z_log_sigma_sq = tf.add(tf.matmul(inputs, self.weights['log_sigma_w1']), self.weights['log_sigma_b1'])
23 |
24 | # sample from gaussian distribution
25 | eps = tf.random_normal(tf.stack([tf.shape(self.xs)[0], self.n_hidden]), 0, 1, dtype = tf.float32)
26 | self.z = tf.add(self.z_mean, tf.multiply(tf.sqrt(tf.exp(self.z_log_sigma_sq)), eps))
27 |
28 | self.reconstruction = tf.add(tf.matmul(self.z, self.weights['w2']), self.weights['b2'])
29 |
30 | # for unsupervised model, loss is part of testing as well
31 | self.build_loss(self.inputs, self.outputs)
32 |
33 | return self.reconstruction
34 |
35 | def build_loss(self, inputs, reconstruction):
36 | with self.G.as_default():
37 | # cost
38 | reconstr_loss = 0.5 * tf.reduce_sum(tf.pow(tf.subtract(self.reconstruction, self.xs), 2.0))
39 | latent_loss = -0.5 * tf.reduce_sum(1 + self.z_log_sigma_sq
40 | - tf.square(self.z_mean)
41 | - tf.exp(self.z_log_sigma_sq), 1)
42 | self.loss_op = tf.reduce_mean(reconstr_loss + latent_loss)
43 | return self.loss_op
44 |
45 | def _initialize_weights(self):
46 | all_weights = dict()
47 | all_weights['w1'] = tf.Variable(xavier_init(self.n_input, self.n_hidden))
48 | all_weights['log_sigma_w1'] = tf.Variable(xavier_init(self.n_input, self.n_hidden))
49 | all_weights['b1'] = tf.Variable(tf.zeros([self.n_hidden], dtype=tf.float32))
50 | all_weights['log_sigma_b1'] = tf.Variable(tf.zeros([self.n_hidden], dtype=tf.float32))
51 | all_weights['w2'] = tf.Variable(tf.zeros([self.n_hidden, self.n_input], dtype=tf.float32))
52 | all_weights['b2'] = tf.Variable(tf.zeros([self.n_input], dtype=tf.float32))
53 | return all_weights
54 |
55 | def generate(self, hidden = None):
56 | if hidden is None:
57 |       hidden = np.random.normal(size=(1, self.n_hidden))  # one sample from the latent prior
58 |     return self.session.run(self.reconstruction, feed_dict={self.z: hidden})  # feed the latent code directly; reconstruction depends only on z
59 |
60 | class AutoencFwd(Autoenc):
61 | forward_only = True
62 |
63 | if __name__ == "__main__":
64 | m = Autoenc()
65 | m.setup()
66 | m.run(runstep=default_runstep, n_steps=10)
67 | m.teardown()
68 |
--------------------------------------------------------------------------------
/fathom/memnet/data_utils.py:
--------------------------------------------------------------------------------
1 | # Dominique Luna's implementation
2 |
3 |
4 | import os
5 | import re
6 | import numpy as np
7 |
8 | def load_task(data_dir, task_id, only_supporting=False):
9 | '''Load the nth task. There are 20 tasks in total.
10 |
11 | Returns a tuple containing the training and testing data for the task.
12 | '''
13 | assert task_id > 0 and task_id < 21
14 |
15 | files = os.listdir(data_dir)
16 | files = [os.path.join(data_dir, f) for f in files]
17 | s = 'qa{}_'.format(task_id)
18 | train_file = [f for f in files if s in f and 'train' in f][0]
19 | test_file = [f for f in files if s in f and 'test' in f][0]
20 | train_data = get_stories(train_file, only_supporting)
21 | test_data = get_stories(test_file, only_supporting)
22 | print(train_file, test_file)
23 | return train_data, test_data
24 |
25 | def tokenize(sent):
26 | '''Return the tokens of a sentence including punctuation.
27 | >>> tokenize('Bob dropped the apple. Where is the apple?')
28 | ['Bob', 'dropped', 'the', 'apple', '.', 'Where', 'is', 'the', 'apple', '?']
29 | '''
30 |   return [x.strip() for x in re.split(r'(\W+)', sent) if x.strip()]
31 |
32 |
33 | def parse_stories(lines, only_supporting=False):
34 | '''Parse stories provided in the bAbI tasks format
35 | If only_supporting is true, only the sentences that support the answer are kept.
36 | '''
37 | data = []
38 | story = []
39 | for line in lines:
40 | line = str.lower(line)
41 | nid, line = line.split(' ', 1)
42 | nid = int(nid)
43 | if nid == 1:
44 | story = []
45 | if '\t' in line: # question
46 | q, a, supporting = line.split('\t')
47 | q = tokenize(q)
48 | #a = tokenize(a)
49 | # answer is one vocab word even if it's actually multiple words
50 | a = [a]
51 | substory = None
52 |
53 | # remove question marks
54 | if q[-1] == "?":
55 | q = q[:-1]
56 |
57 | if only_supporting:
58 | # Only select the related substory
59 | supporting = list(map(int, supporting.split()))
60 | substory = [story[i - 1] for i in supporting]
61 | else:
62 | # Provide all the substories
63 | substory = [x for x in story if x]
64 |
65 | data.append((substory, q, a))
66 | story.append('')
67 | else: # regular sentence
68 | # remove periods
69 | sent = tokenize(line)
70 | if sent[-1] == ".":
71 | sent = sent[:-1]
72 | story.append(sent)
73 | return data
74 |
75 |
76 | def get_stories(f, only_supporting=False):
77 | '''Given a file name, read the file, retrieve the stories, and then convert the sentences into a single story.
78 |   If only_supporting is true, only the sentences that support the answer are kept.
79 | '''
80 | with open(f) as f:
81 | return parse_stories(f.readlines(), only_supporting=only_supporting)
82 |
83 | def vectorize_data(data, word_idx, sentence_size, memory_size):
84 | """
85 | Vectorize stories and queries.
86 |
87 | If a sentence length < sentence_size, the sentence will be padded with 0's.
88 |
89 | If a story length < memory_size, the story will be padded with empty memories.
90 | Empty memories are 1-D arrays of length sentence_size filled with 0's.
91 |
92 | The answer array is returned as a one-hot encoding.
93 | """
94 | S = []
95 | Q = []
96 | A = []
97 | for story, query, answer in data:
98 | ss = []
99 | for sentence in story:
100 | ls = max(0, sentence_size - len(sentence))
101 | ss.append([word_idx[w] for w in sentence] + [0] * ls)
102 |
103 | # take only the most recent sentences that fit in memory
104 | ss = ss[::-1][:memory_size][::-1]
105 |
106 | # pad to memory_size
107 | lm = max(0, memory_size - len(ss))
108 | for _ in range(lm):
109 | ss.append([0] * sentence_size)
110 |
111 | lq = max(0, sentence_size - len(query))
112 | q = [word_idx[w] for w in query] + [0] * lq
113 |
114 | y = np.zeros(len(word_idx) + 1) # 0 is reserved for nil word
115 | for a in answer:
116 | y[word_idx[a]] = 1
117 |
118 | S.append(ss)
119 | Q.append(q)
120 | A.append(y)
121 | return np.array(S), np.array(Q), np.array(A)
122 |
--------------------------------------------------------------------------------
/pylintrc:
--------------------------------------------------------------------------------
1 | # Largely based on the pylint default configuration.
2 | # pylint --generate-rcfile
3 |
4 | [MASTER]
5 | ignore=CVS
6 | persistent=no
7 | load-plugins=
8 | jobs=1
9 | unsafe-load-any-extension=no
10 | extension-pkg-whitelist=
11 | optimize-ast=no
12 |
13 | [MESSAGES CONTROL]
14 | disable=all
15 | # Full list at https://pylint.readthedocs.io/en/latest/features.html
16 | enable=
17 | # 2 spaces. I don't care what PEP8 says.
18 | bad-indentation,mixed-indentation,
19 | # keeps things clean
20 | #unused-import,unused-variable,
21 | # I'm needlessly inconsistent about this.
22 | wrong-import-order,
23 | # No need for old-style classes, and I sometimes rely on new-style ones. So make everything new.
24 | old-style-class,
25 | # There's almost always a better way.
26 | dangerous-default-value,
27 | # I should probably just automatically fix this...
28 | trailing-whitespace,
29 | # Prudent.
30 | #arguments-differ,
31 |
32 | [REPORTS]
33 | # Defaults
34 | output-format=text
35 | files-output=no
36 | reports=yes
37 | evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10)
38 | #msg-template=
39 |
40 | [BASIC]
41 | # List of builtins function names that should not be used
42 | bad-functions=input
43 | # Good/bad variable names which should always/never be accepted
44 | good-names=i,j,k,ex,Run,_
45 | bad-names=
46 |
47 | function-rgx=[a-z_][a-z0-9_]{2,30}$
48 | function-name-hint=[a-z_][a-z0-9_]{2,30}$
49 | variable-rgx=[a-z_][a-z0-9_]{2,30}$
50 | variable-name-hint=[a-z_][a-z0-9_]{2,30}$
51 | const-rgx=(([A-Z_][A-Z0-9_]*)|(__.*__))$
52 | const-name-hint=(([A-Z_][A-Z0-9_]*)|(__.*__))$
53 | attr-rgx=[a-z_][a-z0-9_]{2,30}$
54 | attr-name-hint=[a-z_][a-z0-9_]{2,30}$
55 | argument-rgx=[a-z_][a-z0-9_]{2,30}$
56 | argument-name-hint=[a-z_][a-z0-9_]{2,30}$
57 | class-attribute-rgx=([A-Za-z_][A-Za-z0-9_]{2,30}|(__.*__))$
58 | class-attribute-name-hint=([A-Za-z_][A-Za-z0-9_]{2,30}|(__.*__))$
59 | inlinevar-rgx=[A-Za-z_][A-Za-z0-9_]*$
60 | inlinevar-name-hint=[A-Za-z_][A-Za-z0-9_]*$
61 | class-rgx=[A-Z_][a-zA-Z0-9]+$
62 | class-name-hint=[A-Z_][a-zA-Z0-9]+$
63 | module-rgx=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$
64 | module-name-hint=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$
65 | method-rgx=[a-z_][a-z0-9_]{2,30}$
66 | method-name-hint=[a-z_][a-z0-9_]{2,30}$
67 | no-docstring-rgx=^_
68 | docstring-min-length=-1
69 |
70 | [ELIF]
71 | # Maximum number of nested blocks for function / method body
72 | max-nested-blocks=5
73 |
74 | [FORMAT]
75 | # Maximum number of characters on a single line.
76 | max-line-length=80
77 | ignore-long-lines=^\s*(# )?<?https?://\S+>?$
78 | single-line-if-stmt=no
79 | no-space-check=trailing-comma,dict-separator
80 | max-module-lines=1000
81 | indent-string=' '
82 | # Number of spaces of indent required inside a hanging or continued line.
83 | indent-after-paren=2
84 | expected-line-ending-format=
85 |
86 | [LOGGING]
87 | # Logging modules to check that the string format arguments are in logging
88 | # function parameter format
89 | logging-modules=logging
90 |
91 | [MISCELLANEOUS]
92 | # List of note tags to take in consideration, separated by a comma.
93 | notes=FIXME,XXX,TODO
94 |
95 | [SIMILARITIES]
96 | min-similarity-lines=4
97 | ignore-comments=yes
98 | ignore-docstrings=yes
99 | ignore-imports=no
100 |
101 | [TYPECHECK]
102 | ignore-mixin-members=yes
103 | ignored-modules=
104 | ignored-classes=
105 | generated-members=
106 |
107 | [VARIABLES]
108 | # Tells whether we should check for unused import in __init__ files.
109 | init-import=no
110 | dummy-variables-rgx=_$
111 | additional-builtins=
112 | callbacks=
113 |
114 | [CLASSES]
115 | defining-attr-methods=__init__,__new__
116 | valid-classmethod-first-arg=cls
117 | valid-metaclass-classmethod-first-arg=mcs
118 | exclude-protected=
119 |
120 | [DESIGN]
121 | max-args=1000
122 | ignored-argument-names=_.*
123 | max-locals=15
124 | max-returns=6
125 | max-branches=12
126 | max-statements=50
127 | max-parents=7
128 | max-attributes=7
129 | min-public-methods=2
130 | max-public-methods=20
131 | max-bool-expr=5
132 |
133 | [IMPORTS]
134 | deprecated-modules=regsub,TERMIOS,Bastion,rexec
135 | import-graph=
136 | ext-import-graph=
137 | int-import-graph=
138 |
139 | [EXCEPTIONS]
140 | overgeneral-exceptions=Exception
141 |
--------------------------------------------------------------------------------
/fathom/imagenet/imagenet.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import tensorflow as tf
4 |
5 | from fathom.nn import NeuralNetworkModel, default_runstep
6 | from fathom.dataset import Dataset
7 | from fathom.imagenet.image_processing import distorted_inputs
8 |
9 | # TODO: don't hard-code this
10 | imagenet_record_dir = '/data/ILSVRC2012/imagenet-tfrecord/'
11 |
12 | class Imagenet(Dataset):
13 | """Design from TensorFlow Inception example."""
14 | def __init__(self, subset, record_dir=imagenet_record_dir):
15 | super(Imagenet, self).__init__(subset, record_dir)
16 |
17 | def num_classes(self):
18 | return 1000
19 |
20 | def num_examples_per_epoch(self):
21 | # Bounding box data consists of 615299 bounding boxes for 544546 images.
22 | if self.subset == 'train':
23 | return 1281167
24 | if self.subset == 'validation':
25 | return 50000
26 |
27 | class ImagenetModel(NeuralNetworkModel):
28 | @property
29 | def inputs(self):
30 | return self.images
31 |
32 | @property
33 | def labels(self):
34 | return self._labels
35 |
36 | @property
37 | def outputs(self):
38 | return self.logits
39 |
40 | @property
41 | def loss(self):
42 | return self.loss_op
43 |
44 | @property
45 | def train(self):
46 | return self.train_op
47 |
48 | def build_inputs(self):
49 | with self.G.as_default():
50 | # TODO: configure image_size in image_processing.py
51 | self.image_size = 224 # side of the square image
52 | self.channels = 3
53 | self.n_input = self.image_size * self.image_size * self.channels
54 |
55 | self.images = tf.placeholder(tf.float32, [None, self.image_size, self.image_size, self.channels])
56 |
57 | # add queue runners (evaluation dequeues records)
58 | self.dataset = Imagenet('train')
59 | self.batch_images_queue, self.batch_labels_queue = distorted_inputs(self.dataset, batch_size=self.batch_size)
60 |
61 | def build_labels(self):
62 | with self.G.as_default():
63 | self.n_classes = 1000 + 1 # background class
64 | self._labels = tf.placeholder(tf.int64, [None])
65 |
66 | def build_evaluation(self):
67 | """Evaluation metrics (e.g., accuracy)."""
68 | self.correct_pred = tf.equal(tf.argmax(self.outputs, 1), self.labels) # TODO: off-by-one?
69 | self.accuracy = tf.reduce_mean(tf.cast(self.correct_pred, tf.float32))
70 |
71 | def build_hyperparameters(self):
72 | with self.G.as_default():
73 | self.learning_rate = 0.001
74 | self.training_iters = 200000
75 | self.batch_size = 64
76 | self.display_step = 1
77 |
78 | self.dropout = 0.8 # Dropout, probability to keep units
79 |
80 | # TODO: can this not be a placeholder?
81 | self.keep_prob = tf.placeholder(tf.float32) # dropout (keep probability)
82 |
83 | def build_loss(self, logits, labels):
84 | with self.G.as_default():
85 | # Define loss
86 | # TODO: does this labels have unexpected state?
87 | self.loss_op = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=labels))
88 | return self.loss_op
89 |
90 | def build_train(self, total_loss):
91 | with self.G.as_default():
92 | opt = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
93 |
94 | # Compute and apply gradients.
95 | #self.train_op = opt.minimize(total_loss, aggregation_method=tf.AggregationMethod.EXPERIMENTAL_ACCUMULATE_N)
96 | self.train_op = opt.minimize(total_loss)
97 |
98 | return self.train_op
99 |
100 | def load_data(self):
101 | # Grab the dataset from the internet, if necessary
102 | self.num_batches_per_epoch = self.dataset.num_examples_per_epoch() / self.batch_size
103 |
104 | def run(self, runstep=default_runstep, n_steps=1):
105 | self.load_data()
106 |
107 | with self.G.as_default():
108 | # Keep training until reach max iterations
109 | step = 1
110 | while step * self.batch_size < self.training_iters:
111 | if step > n_steps:
112 | return
113 |
114 | # TODO: switch to test
115 | batch_images, batch_labels = self.session.run([self.batch_images_queue, self.batch_labels_queue])
116 |
117 | print("Queued ImageNet batch.")
118 |
119 | if not self.forward_only:
120 | _, loss_value, acc = runstep(
121 | self.session,
122 | [self.train, self.loss, self.accuracy],
123 | feed_dict={self.images: batch_images, self._labels: batch_labels, self.keep_prob: self.dropout},
124 | )
125 |
126 | if step % self.display_step == 0:
127 | print("Iter " + str(step*self.batch_size) + ", Minibatch Loss= " + "{:.6f}".format(loss_value) + ", Training Accuracy= " + "{:.5f}".format(acc))
128 | else:
129 | _ = runstep(
130 | self.session,
131 | self.outputs,
132 | feed_dict={self.images: batch_images, self._labels: batch_labels, self.keep_prob: 1.},
133 | )
134 |
135 | step += 1
136 |
137 | #print "Testing Accuracy:", runstep(self.session, [self.accuracy], feed_dict={self.images: self.mnist.test.images[:256], self._labels: self.mnist.test.labels[:256], self.keep_prob: 1.})
138 |
--------------------------------------------------------------------------------
/fathom/nn.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | from abc import ABCMeta, abstractmethod, abstractproperty
4 | import tensorflow as tf
5 |
6 | class GenericModel(object, metaclass=ABCMeta):
7 | def __init__(self, device=None, init_options=None):
8 | self.device=device
9 |
10 | @abstractmethod
11 | def model(self):
12 | 'Return a reference to the native representation of the model.'
13 | pass
14 | def setup(self, setup_options=None):
15 | '(Optional) Prepare the model for running.'
16 | pass
17 | @abstractmethod
18 | def run(self, runstep=None, n_steps=1, *args, **kwargs):
19 | 'Run the model.'
20 | pass
21 | def teardown(self):
22 | '(Optional) Clean up after a model run.'
23 | pass
24 |
25 | def default_runstep(session, sink_ops, *options, **kw_options):
26 | return session.run(sink_ops, *options, **kw_options)
27 |
28 |
29 | class NeuralNetworkModel(GenericModel, metaclass=ABCMeta):
30 | forward_only = False
31 |
32 | def __init__(self, device=None, init_options=None):
33 | super(NeuralNetworkModel,self).__init__(device=device, init_options=init_options)
34 |
35 | self.G = tf.Graph()
36 | self.session = None
37 |
38 | # e.g., for batch_size
39 | self.init_options = init_options
40 |
41 | with self.G.device(device):
42 | with self.G.as_default():
43 | self.build()
44 |
45 | with self.G.as_default():
46 | self.init = tf.global_variables_initializer()
47 |
48 | @abstractmethod
49 | def load_data(self):
50 | """Load dataset (possibly downloading it)."""
51 | pass
52 |
53 | @abstractmethod
54 | def build_inputs(self):
55 | """Construct graph's input placeholders."""
56 | pass
57 |
58 | @abstractmethod
59 | def build_labels(self):
60 | """Construct graph's label placeholders."""
61 | pass
62 |
63 | @abstractproperty
64 | def inputs(self):
65 | pass
66 |
67 | @abstractproperty
68 | def labels(self):
69 | pass
70 |
71 | @abstractmethod
72 | def build_hyperparameters(self):
73 | """Set hard-coded hyperparameters."""
74 | pass
75 |
76 | @abstractproperty
77 | def outputs(self):
78 | """Network outputs before loss function."""
79 | pass
80 |
81 | @abstractproperty
82 | def loss(self):
83 | """Loss function."""
84 | pass
85 |
86 | @abstractproperty
87 | def train(self):
88 | """Training/optimization operation."""
89 | pass
90 |
91 | def build_evaluation(self):
92 | """Evaluation metrics (e.g., accuracy)."""
93 | self.correct_pred = tf.equal(tf.argmax(self.outputs, 1), tf.argmax(self.labels, 1))
94 | self.accuracy = tf.reduce_mean(tf.cast(self.correct_pred, tf.float32))
95 |
96 | def build(self):
97 | """Build computation graph."""
98 | with self.G.as_default():
99 | self.global_step = tf.Variable(0, trainable=False)
100 |
101 | self.build_hyperparameters()
102 |
103 | self.build_inputs()
104 | self.build_labels()
105 |
106 | self.build_inference(self.inputs)
107 |
108 | if not self.forward_only:
109 | self.build_loss(self.outputs, self.labels)
110 | self.build_train(self.loss_op)
111 |
112 | self.build_evaluation()
113 |
114 | @abstractmethod
115 | def build_inference(self, inputs):
116 | """Build inference.
117 |
118 | Args:
119 | inputs: Images, for example.
120 |
121 | Returns:
122 | Logits.
123 | """
124 | pass
125 |
126 | @abstractmethod
127 | def build_loss(self, outputs, labels):
128 | """Add loss to trainable variables.
129 | Args:
130 | outputs: Outputs from inference().
131 | labels: Labels from inputs. 1-D tensor of shape [batch_size].
132 |
133 | Returns:
134 | Loss tensor of type float.
135 | """
136 | pass
137 |
138 | @abstractmethod
139 | def build_train(self, total_loss, global_step):
140 | """Train model.
141 |
142 | Create optimizer and apply to all trainable variables.
143 |
144 | Args:
145 | total_loss: Total loss from loss().
146 | global_step: Integer Variable counting number of training steps processed.
147 |
148 | Returns:
149 | train_op: op for training.
150 | """
151 | pass
152 |
153 | def model(self):
154 | return self.G
155 |
156 | def setup(self, setup_options=None):
157 | """Make session and launch queue runners."""
158 | super(NeuralNetworkModel,self).setup(setup_options=setup_options)
159 | with self.G.as_default():
160 | # Start a new session and initialize the network
161 | if setup_options is not None:
162 | self.session = tf.Session(config=tf.ConfigProto(**setup_options))
163 | else:
164 | self.session = tf.Session()
165 |       # Create a coordinator and initialize variables
166 | self.coord = tf.train.Coordinator()
167 | self.session.run(self.init)
168 | # Start the input data loaders
169 | self.threads = tf.train.start_queue_runners(sess=self.session,coord=self.coord)
170 |
171 | def teardown(self):
172 | """Close session and join queue runners."""
173 | self.coord.request_stop()
174 | self.coord.join(self.threads, stop_grace_period_secs=10)
175 | if self.session is not None:
176 | self.session.close()
177 | self.session = None
178 |
179 |
--------------------------------------------------------------------------------
/fathom/vgg/vgg.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | from math import sqrt
4 | import tensorflow as tf
5 | from fathom.nn import default_runstep
6 | from fathom.imagenet import imagenet
7 |
8 | class VGG(imagenet.ImagenetModel):
9 | """VGG Network."""
10 | def build_hyperparameters(self):
11 | # TODO: put these into runstep options or somewhere else
12 | # Parameters
13 | self.learning_rate = 0.0001
14 | self.training_iters = 200000
15 | self.batch_size = 8
16 | if self.init_options:
17 | self.batch_size = self.init_options.get('batch_size', self.batch_size)
18 | self.display_step = 1
19 |
20 | if not self.forward_only:
21 | self.dropout = 0.8 # Dropout, probability to keep units
22 | else:
23 | self.dropout = 1.
24 |
25 | self.keep_prob = tf.placeholder(tf.float32) # dropout (keep probability)
26 |
27 | def build_inference(self, images):
28 | with self.G.as_default():
29 | # fix dimensions
30 | input_shape = images.get_shape().as_list()
31 | if len(input_shape) == 2:
32 | ndim = int(sqrt(input_shape[1]))
33 | if ndim * ndim != input_shape[1]:
34 | raise ValueError('input_shape should be square')
35 | images = tf.reshape(images, [-1, ndim, ndim, 1])
36 |
37 | # assume images shape is 224x224x3
38 |
39 | # block 1 -- outputs 112x112x64
40 | conv1_1 = conv_op(images, name="conv1_1", kh=3, kw=3, n_out=64, dh=1, dw=1)
41 | conv1_2 = conv_op(conv1_1, name="conv1_2", kh=3, kw=3, n_out=64, dh=1, dw=1)
42 | pool1 = mpool_op(conv1_2, name="pool1", kh=2, kw=2, dw=2, dh=2)
43 |
44 | # block 2 -- outputs 56x56x128
45 | conv2_1 = conv_op(pool1, name="conv2_1", kh=3, kw=3, n_out=128, dh=1, dw=1)
46 | conv2_2 = conv_op(conv2_1, name="conv2_2", kh=3, kw=3, n_out=128, dh=1, dw=1)
47 | pool2 = mpool_op(conv2_2, name="pool2", kh=2, kw=2, dh=2, dw=2)
48 |
49 | # TODO: VGG pooling in later layers is too aggressive for MNIST
50 | using_imagenet = True
51 | if using_imagenet:
52 | # block 3 -- outputs 28x28x256
53 | conv3_1 = conv_op(pool2, name="conv3_1", kh=3, kw=3, n_out=256, dh=1, dw=1)
54 | conv3_2 = conv_op(conv3_1, name="conv3_2", kh=3, kw=3, n_out=256, dh=1, dw=1)
55 | pool3 = mpool_op(conv3_2, name="pool3", kh=2, kw=2, dh=2, dw=2)
56 |
57 | # block 4 -- outputs 14x14x512
58 | conv4_1 = conv_op(pool3, name="conv4_1", kh=3, kw=3, n_out=512, dh=1, dw=1)
59 | conv4_2 = conv_op(conv4_1, name="conv4_2", kh=3, kw=3, n_out=512, dh=1, dw=1)
60 |         conv4_3 = conv_op(conv4_2, name="conv4_3", kh=3, kw=3, n_out=512, dh=1, dw=1)
61 | pool4 = mpool_op(conv4_3, name="pool4", kh=2, kw=2, dh=2, dw=2)
62 |
63 | # block 5 -- outputs 7x7x512
64 | conv5_1 = conv_op(pool4, name="conv5_1", kh=3, kw=3, n_out=512, dh=1, dw=1)
65 | conv5_2 = conv_op(conv5_1, name="conv5_2", kh=3, kw=3, n_out=512, dh=1, dw=1)
66 | conv5_3 = conv_op(conv5_2, name="conv5_3", kh=3, kw=3, n_out=512, dh=1, dw=1)
67 | pool5 = mpool_op(conv5_3, name="pool5", kh=2, kw=2, dw=2, dh=2)
68 |
69 | # flatten
70 | shp = pool5.get_shape().as_list() # pool2 if shrunk
71 | flattened_shape = shp[1] * shp[2] * shp[3]
72 | resh1 = tf.reshape(pool5, [self.batch_size, flattened_shape], name="resh1")
73 |
74 | # fully connected
75 | fc6 = fc_op(resh1, name="fc6", n_out=4096)
76 | fc6_drop = tf.nn.dropout(fc6, self.dropout, name="fc6_drop")
77 |
78 | fc7 = fc_op(fc6_drop, name="fc7", n_out=4096)
79 | fc7_drop = tf.nn.dropout(fc7, self.dropout, name="fc7_drop")
80 |
81 | fc8 = fc_op(fc7_drop, name="fc8", n_out=self.n_classes)
82 |
83 | self.logits = fc8
84 |
85 | return self.logits
86 |
87 | # crudely based on https://github.com/huyng/tensorflow-vgg
88 | # TODO: refactor these utility functions across convnet models to remove dependencies
89 | def conv_op(input_op, name, kw, kh, n_out, dw, dh):
90 | n_in = input_op.get_shape()[-1].value
91 |
92 | with tf.name_scope(name) as scope:
93 | kernel_init_val = tf.truncated_normal([kh, kw, n_in, n_out], dtype=tf.float32, stddev=0.1)
94 | kernel = tf.Variable(kernel_init_val, trainable=True, name='w')
95 | conv = tf.nn.conv2d(input_op, kernel, (1, dh, dw, 1), padding='SAME')
96 | bias_init_val = tf.constant(0.0, shape=[n_out], dtype=tf.float32)
97 | biases = tf.Variable(bias_init_val, trainable=True, name='b')
98 |     # add the bias; the conv output already has shape [batch, height, width, n_out]
99 |     z = tf.nn.bias_add(conv, biases)
100 | activation = tf.nn.relu(z, name=scope)
101 | return activation
102 |
103 | def fc_op(input_op, name, n_out):
104 | n_in = input_op.get_shape()[-1].value
105 |
106 | with tf.name_scope(name):
107 | kernel = tf.Variable(tf.truncated_normal([n_in, n_out], dtype=tf.float32, stddev=0.1), name='w')
108 | biases = tf.Variable(tf.constant(0.0, shape=[n_out], dtype=tf.float32), name='b')
109 | activation = tf.nn.relu_layer(input_op, kernel, biases, name=name)
110 | return activation
111 |
112 | def mpool_op(input_op, name, kh, kw, dh, dw):
113 | return tf.nn.max_pool(input_op,
114 | ksize=[1, kh, kw, 1],
115 | strides=[1, dh, dw, 1],
116 | padding='VALID',
117 | name=name)
118 |
119 | class VGGFwd(VGG):
120 | forward_only = True
121 |
122 | if __name__ == "__main__":
123 | m = VGG()
124 | m.setup()
125 | m.run(runstep=default_runstep, n_steps=10)
126 | m.teardown()
127 |
--------------------------------------------------------------------------------
/fathom/alexnet/alexnet.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import tensorflow as tf
3 |
4 | from fathom.imagenet import imagenet
5 | from fathom.nn import default_runstep
6 |
7 | def conv2d(name, l_input, w, b):
8 | return tf.nn.relu(tf.nn.bias_add(tf.nn.conv2d(l_input, w, strides=[1, 1, 1, 1], padding='SAME'),b), name=name)
9 |
10 | def max_pool(name, l_input, k):
11 | return tf.nn.max_pool(l_input, ksize=[1, k, k, 1], strides=[1, k, k, 1], padding='SAME', name=name)
12 |
13 | def norm(name, l_input, lsize=4):
14 | return tf.nn.lrn(l_input, lsize, bias=1.0, alpha=0.001 / 9.0, beta=0.75, name=name)
15 |
16 | class AlexNet(imagenet.ImagenetModel):
17 | """Based on Aymeric Damien's TensorFlow example of AlexNet."""
18 | def build_inference(self, images):
19 | with self.G.as_default():
20 | # conv1
21 | with tf.name_scope('conv1') as scope:
22 | kernel = tf.Variable(tf.truncated_normal([11, 11, 3, 64], dtype=tf.float32,
23 | stddev=1e-1), name='weights')
24 | conv = tf.nn.conv2d(images, kernel, [1, 4, 4, 1], padding='SAME')
25 | biases = tf.Variable(tf.constant(0.0, shape=[64], dtype=tf.float32),
26 | trainable=True, name='biases')
27 | bias = tf.nn.bias_add(conv, biases)
28 | conv1 = tf.nn.relu(bias, name=scope)
29 |
30 | # pool1
31 | pool1 = tf.nn.max_pool(conv1,
32 | ksize=[1, 3, 3, 1],
33 | strides=[1, 2, 2, 1],
34 | padding='VALID',
35 | name='pool1')
36 |
37 | # TODO: lrn1
38 | lsize = 4
39 | norm1 = tf.nn.lrn(pool1, lsize, bias=1.0, alpha=0.001 / 9.0, beta=0.75)
40 |
41 | # conv2
42 | with tf.name_scope('conv2') as scope:
43 | kernel = tf.Variable(tf.truncated_normal([5, 5, 64, 192], dtype=tf.float32,
44 | stddev=1e-1), name='weights')
45 | conv = tf.nn.conv2d(norm1, kernel, [1, 1, 1, 1], padding='SAME')
46 | biases = tf.Variable(tf.constant(0.0, shape=[192], dtype=tf.float32),
47 | trainable=True, name='biases')
48 | bias = tf.nn.bias_add(conv, biases)
49 | conv2 = tf.nn.relu(bias, name=scope)
50 |
51 | # pool2
52 | pool2 = tf.nn.max_pool(conv2,
53 | ksize=[1, 3, 3, 1],
54 | strides=[1, 2, 2, 1],
55 | padding='VALID',
56 | name='pool2')
57 |
58 | norm2 = tf.nn.lrn(pool2, lsize, bias=1.0, alpha=0.001 / 9.0, beta=0.75)
59 |
60 | # conv3
61 | with tf.name_scope('conv3') as scope:
62 | kernel = tf.Variable(tf.truncated_normal([3, 3, 192, 384],
63 | dtype=tf.float32,
64 | stddev=1e-1), name='weights')
65 | conv = tf.nn.conv2d(norm2, kernel, [1, 1, 1, 1], padding='SAME')
66 | biases = tf.Variable(tf.constant(0.0, shape=[384], dtype=tf.float32),
67 | trainable=True, name='biases')
68 | bias = tf.nn.bias_add(conv, biases)
69 | conv3 = tf.nn.relu(bias, name=scope)
70 |
71 | # conv4
72 | with tf.name_scope('conv4') as scope:
73 | kernel = tf.Variable(tf.truncated_normal([3, 3, 384, 256],
74 | dtype=tf.float32,
75 | stddev=1e-1), name='weights')
76 | conv = tf.nn.conv2d(conv3, kernel, [1, 1, 1, 1], padding='SAME')
77 | biases = tf.Variable(tf.constant(0.0, shape=[256], dtype=tf.float32),
78 | trainable=True, name='biases')
79 | bias = tf.nn.bias_add(conv, biases)
80 | conv4 = tf.nn.relu(bias, name=scope)
81 |
82 | # conv5
83 | with tf.name_scope('conv5') as scope:
84 | kernel = tf.Variable(tf.truncated_normal([3, 3, 256, 256],
85 | dtype=tf.float32,
86 | stddev=1e-1), name='weights')
87 | conv = tf.nn.conv2d(conv4, kernel, [1, 1, 1, 1], padding='SAME')
88 | biases = tf.Variable(tf.constant(0.0, shape=[256], dtype=tf.float32),
89 | trainable=True, name='biases')
90 | bias = tf.nn.bias_add(conv, biases)
91 | conv5 = tf.nn.relu(bias, name=scope)
92 |
93 | # pool5
94 | pool5 = tf.nn.max_pool(conv5,
95 | ksize=[1, 3, 3, 1],
96 | strides=[1, 2, 2, 1],
97 | padding='VALID',
98 | name='pool5')
99 |
100 | pool5_shape = pool5.get_shape().as_list()
101 | pool5_length = pool5_shape[1] * pool5_shape[2] * pool5_shape[3]
102 |
103 | wd1 = tf.Variable(tf.random_normal([pool5_length, 4096]))
104 | bd1 = tf.Variable(tf.random_normal([4096]))
105 |
106 | flattened_pool5 = tf.reshape(pool5, [self.batch_size, pool5_length])
107 | dense1 = tf.nn.relu(tf.nn.xw_plus_b(flattened_pool5, wd1, bd1), name='fc1')
108 |
109 | wd2 = tf.Variable(tf.random_normal([4096, 4096]))
110 | bd2 = tf.Variable(tf.random_normal([4096]))
111 | dense2 = tf.nn.relu(tf.nn.xw_plus_b(dense1, wd2, bd2), name='fc2')
112 |
113 | w_out = tf.Variable(tf.random_normal([4096, self.n_classes]))
114 | b_out = tf.Variable(tf.random_normal([self.n_classes]))
115 |
116 | self.logits = tf.nn.xw_plus_b(dense2, w_out, b_out)
117 |
118 | return self.logits
119 |
120 | def build_hyperparameters(self):
121 | self.learning_rate = 0.001
122 | self.training_iters = 200000
123 | self.batch_size = 64
124 | if self.init_options:
125 | self.batch_size = self.init_options.get('batch_size', self.batch_size)
126 | self.display_step = 1
127 |
128 | self.dropout = 0.8 # Dropout, probability to keep units
129 |
130 | # TODO: can this not be a placeholder?
131 | self.keep_prob = tf.placeholder(tf.float32) # dropout (keep probability)
132 |
133 | class AlexNetFwd(AlexNet):
134 | forward_only = True
135 |
136 | if __name__=='__main__':
137 | m = AlexNet()
138 | m.setup()
139 | m.run(runstep=default_runstep, n_steps=10)
140 | m.teardown()
141 |
142 |
--------------------------------------------------------------------------------
/fathom/imagenet/mnist.py:
--------------------------------------------------------------------------------
1 | """Functions for downloading and reading MNIST data.
2 |
3 | Original Author: Aymeric Damien
4 | https://github.com/aymericdamien/TensorFlow-Examples/
5 | """
6 |
7 | # TODO: clean up dataset code
8 |
9 |
10 | import gzip
11 | import os
12 | import urllib.request, urllib.parse, urllib.error
13 | import numpy
14 | SOURCE_URL = 'http://yann.lecun.com/exdb/mnist/'
15 | def maybe_download(filename, work_directory):
16 | """Download the data from Yann's website, unless it's already here."""
17 | if not os.path.exists(work_directory):
18 | os.mkdir(work_directory)
19 | filepath = os.path.join(work_directory, filename)
20 | if not os.path.exists(filepath):
21 | filepath, _ = urllib.request.urlretrieve(SOURCE_URL + filename, filepath)
22 | statinfo = os.stat(filepath)
23 |     print('Successfully downloaded', filename, statinfo.st_size, 'bytes.')
24 | return filepath
25 | def _read32(bytestream):
26 | dt = numpy.dtype(numpy.uint32).newbyteorder('>')
27 | return numpy.frombuffer(bytestream.read(4), dtype=dt)[0]
28 | def extract_images(filename):
29 | """Extract the images into a 4D uint8 numpy array [index, y, x, depth]."""
30 | print('Extracting', filename)
31 | with gzip.open(filename) as bytestream:
32 | magic = _read32(bytestream)
33 | if magic != 2051:
34 | raise ValueError(
35 | 'Invalid magic number %d in MNIST image file: %s' %
36 | (magic, filename))
37 | num_images = _read32(bytestream)
38 | rows = _read32(bytestream)
39 | cols = _read32(bytestream)
40 | buf = bytestream.read(rows * cols * num_images)
41 | data = numpy.frombuffer(buf, dtype=numpy.uint8)
42 | data = data.reshape(num_images, rows, cols, 1)
43 | return data
44 | def dense_to_one_hot(labels_dense, num_classes=10):
45 | """Convert class labels from scalars to one-hot vectors."""
46 | num_labels = labels_dense.shape[0]
47 | index_offset = numpy.arange(num_labels) * num_classes
48 | labels_one_hot = numpy.zeros((num_labels, num_classes))
49 | labels_one_hot.flat[index_offset + labels_dense.ravel()] = 1
50 | return labels_one_hot
51 | def extract_labels(filename, one_hot=False):
52 | """Extract the labels into a 1D uint8 numpy array [index]."""
53 | print('Extracting', filename)
54 | with gzip.open(filename) as bytestream:
55 | magic = _read32(bytestream)
56 | if magic != 2049:
57 | raise ValueError(
58 | 'Invalid magic number %d in MNIST label file: %s' %
59 | (magic, filename))
60 | num_items = _read32(bytestream)
61 | buf = bytestream.read(num_items)
62 | labels = numpy.frombuffer(buf, dtype=numpy.uint8)
63 | if one_hot:
64 | return dense_to_one_hot(labels)
65 | return labels
66 | class DataSet(object):
67 | def __init__(self, images, labels, fake_data=False):
68 | if fake_data:
69 | self._num_examples = 10000
70 | else:
71 | assert images.shape[0] == labels.shape[0], (
72 | "images.shape: %s labels.shape: %s" % (images.shape,
73 | labels.shape))
74 | self._num_examples = images.shape[0]
75 | # Convert shape from [num examples, rows, columns, depth]
76 | # to [num examples, rows*columns] (assuming depth == 1)
77 | assert images.shape[3] == 1
78 | images = images.reshape(images.shape[0],
79 | images.shape[1] * images.shape[2])
80 | # Convert from [0, 255] -> [0.0, 1.0].
81 | images = images.astype(numpy.float32)
82 | images = numpy.multiply(images, 1.0 / 255.0)
83 | self._images = images
84 | self._labels = labels
85 | self._epochs_completed = 0
86 | self._index_in_epoch = 0
87 | @property
88 | def images(self):
89 | return self._images
90 | @property
91 | def labels(self):
92 | return self._labels
93 | @property
94 | def num_examples(self):
95 | return self._num_examples
96 | @property
97 | def epochs_completed(self):
98 | return self._epochs_completed
99 | def next_batch(self, batch_size, fake_data=False):
100 | """Return the next `batch_size` examples from this data set."""
101 | if fake_data:
102 | fake_image = [1.0 for _ in range(784)]
103 | fake_label = 0
104 | return [fake_image for _ in range(batch_size)], [
105 | fake_label for _ in range(batch_size)]
106 | start = self._index_in_epoch
107 | self._index_in_epoch += batch_size
108 | if self._index_in_epoch > self._num_examples:
109 | # Finished epoch
110 | self._epochs_completed += 1
111 | # Shuffle the data
112 | perm = numpy.arange(self._num_examples)
113 | numpy.random.shuffle(perm)
114 | self._images = self._images[perm]
115 | self._labels = self._labels[perm]
116 | # Start next epoch
117 | start = 0
118 | self._index_in_epoch = batch_size
119 | assert batch_size <= self._num_examples
120 | end = self._index_in_epoch
121 | return self._images[start:end], self._labels[start:end]
122 | def read_data_sets(train_dir, fake_data=False, one_hot=False):
123 | class DataSets(object):
124 | pass
125 | data_sets = DataSets()
126 | if fake_data:
127 | data_sets.train = DataSet([], [], fake_data=True)
128 | data_sets.validation = DataSet([], [], fake_data=True)
129 | data_sets.test = DataSet([], [], fake_data=True)
130 | return data_sets
131 | TRAIN_IMAGES = 'train-images-idx3-ubyte.gz'
132 | TRAIN_LABELS = 'train-labels-idx1-ubyte.gz'
133 | TEST_IMAGES = 't10k-images-idx3-ubyte.gz'
134 | TEST_LABELS = 't10k-labels-idx1-ubyte.gz'
135 | VALIDATION_SIZE = 5000
136 | local_file = maybe_download(TRAIN_IMAGES, train_dir)
137 | train_images = extract_images(local_file)
138 | local_file = maybe_download(TRAIN_LABELS, train_dir)
139 | train_labels = extract_labels(local_file, one_hot=one_hot)
140 | local_file = maybe_download(TEST_IMAGES, train_dir)
141 | test_images = extract_images(local_file)
142 | local_file = maybe_download(TEST_LABELS, train_dir)
143 | test_labels = extract_labels(local_file, one_hot=one_hot)
144 | validation_images = train_images[:VALIDATION_SIZE]
145 | validation_labels = train_labels[:VALIDATION_SIZE]
146 | train_images = train_images[VALIDATION_SIZE:]
147 | train_labels = train_labels[VALIDATION_SIZE:]
148 | data_sets.train = DataSet(train_images, train_labels)
149 | data_sets.validation = DataSet(validation_images, validation_labels)
150 | data_sets.test = DataSet(test_images, test_labels)
151 | return data_sets
152 |
--------------------------------------------------------------------------------
/fathom/residual/residual.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | from math import sqrt
4 | from collections import namedtuple
5 | import tensorflow as tf
6 | from fathom.nn import default_runstep
7 | from fathom.imagenet import imagenet
8 |
9 | # Code heavily based on Parag Mital's TensorFlow tutorials.
10 | class Residual(imagenet.ImagenetModel):
11 | """Residual Network."""
12 | def build_hyperparameters(self):
13 | # Parameters
14 | self.learning_rate = 0.01
15 | self.training_iters = 200000
16 | self.batch_size = 16
17 | if self.init_options:
18 | self.batch_size = self.init_options.get('batch_size', self.batch_size)
19 | self.display_step = 1
20 |
21 | self.dropout = 0.8 # Dropout, probability to keep units
22 | self.keep_prob = tf.placeholder(tf.float32) # dropout (keep probability)
23 |
24 | def build_inference(self, images):
25 | with self.G.as_default():
26 | LayerBlock = namedtuple(
27 | 'LayerBlock', ['num_repeats', 'num_filters', 'bottleneck_size'])
28 | blocks = [
29 | LayerBlock(3, 128, 32),
30 | LayerBlock(3, 256, 64),
31 | LayerBlock(3, 512, 128),
32 | LayerBlock(3, 1024, 256)
33 | ]
34 |
35 | # %%
36 | input_shape = images.get_shape().as_list()
37 | if len(input_shape) == 2:
38 | ndim = int(sqrt(input_shape[1]))
39 | if ndim * ndim != input_shape[1]:
40 | raise ValueError('input_shape should be square')
41 | images = tf.reshape(images, [-1, ndim, ndim, 1])
42 |
43 | # %%
44 | # First convolution expands to 64 channels and downsamples
45 | net = conv2d(images, 64, k_h=7, k_w=7,
46 | name='conv1',
47 | activation=tf.nn.relu)
48 |
49 | # %%
50 | # Max pool and downsampling
51 | net = tf.nn.max_pool(
52 | net, [1, 3, 3, 1], strides=[1, 2, 2, 1], padding='SAME')
53 |
54 | # %%
55 | # Setup first chain of resnets
56 | net = conv2d(net, blocks[0].num_filters, k_h=1, k_w=1,
57 | stride_h=1, stride_w=1, padding='VALID', name='conv2')
58 |
59 | # %%
60 | # Loop through all res blocks
61 | for block_i, block in enumerate(blocks):
62 | for repeat_i in range(block.num_repeats):
63 | name = 'block_%d/repeat_%d' % (block_i, repeat_i)
64 | conv = conv2d(net, block.bottleneck_size, k_h=1, k_w=1,
65 | padding='VALID', stride_h=1, stride_w=1,
66 | activation=tf.nn.relu,
67 | name=name + '/conv_in')
68 |
69 | conv = conv2d(conv, block.bottleneck_size, k_h=3, k_w=3,
70 | padding='SAME', stride_h=1, stride_w=1,
71 | activation=tf.nn.relu,
72 | name=name + '/conv_bottleneck')
73 |
74 | conv = conv2d(conv, block.num_filters, k_h=1, k_w=1,
75 | padding='VALID', stride_h=1, stride_w=1,
76 | activation=tf.nn.relu,
77 | name=name + '/conv_out')
78 |
79 | net = conv + net
80 |
81 | try:
82 | # upscale to the next block size
83 | next_block = blocks[block_i + 1]
84 | net = conv2d(net, next_block.num_filters, k_h=1, k_w=1,
85 | padding='SAME', stride_h=1, stride_w=1, bias=False,
86 | name='block_%d/conv_upscale' % block_i)
87 | except IndexError:
88 | pass
89 |
90 | # %%
91 | net = tf.nn.avg_pool(net,
92 | ksize=[1, net.get_shape().as_list()[1],
93 | net.get_shape().as_list()[2], 1],
94 | strides=[1, 1, 1, 1], padding='VALID')
95 | net = tf.reshape(
96 | net,
97 | [-1, net.get_shape().as_list()[1] *
98 | net.get_shape().as_list()[2] *
99 | net.get_shape().as_list()[3]])
100 |
101 | self.logits = linear(net, self.n_classes, activation=tf.identity)
102 |
103 | # %%
104 | return self.logits
105 |
106 | def conv2d(x, n_filters,
107 | k_h=5, k_w=5,
108 | stride_h=2, stride_w=2,
109 | stddev=0.02,
110 | activation=lambda x: x,
111 | bias=True,
112 | padding='SAME',
113 | name="Conv2D"):
114 | """2D Convolution with options for kernel size, stride, and init deviation.
115 | Parameters
116 | ----------
117 | x : Tensor
118 | Input tensor to convolve.
119 | n_filters : int
120 | Number of filters to apply.
121 | k_h : int, optional
122 | Kernel height.
123 | k_w : int, optional
124 | Kernel width.
125 | stride_h : int, optional
126 | Stride in rows.
127 | stride_w : int, optional
128 | Stride in cols.
129 | stddev : float, optional
130 | Initialization's standard deviation.
131 | activation : arguments, optional
132 | Function which applies a nonlinearity
133 | padding : str, optional
134 | 'SAME' or 'VALID'
135 | name : str, optional
136 | Variable scope to use.
137 | Returns
138 | -------
139 | x : Tensor
140 | Convolved input.
141 | """
142 | with tf.variable_scope(name):
143 | w = tf.get_variable(
144 | 'w', [k_h, k_w, x.get_shape()[-1], n_filters],
145 | initializer=tf.truncated_normal_initializer(stddev=stddev))
146 | conv = tf.nn.conv2d(x, w, strides=[1, stride_h, stride_w, 1], padding=padding)
147 | if bias:
148 | b = tf.get_variable(
149 | 'b', [n_filters],
150 | initializer=tf.truncated_normal_initializer(stddev=stddev))
151 |
152 | conv = conv + b
153 | return activation(conv)
154 |
155 | def linear(x, n_units, scope=None, stddev=0.02,
156 | activation=lambda x: x):
157 | """Fully-connected network.
158 | Parameters
159 | ----------
160 | x : Tensor
161 | Input tensor to the network.
162 | n_units : int
163 | Number of units to connect to.
164 | scope : str, optional
165 | Variable scope to use.
166 | stddev : float, optional
167 | Initialization's standard deviation.
168 | activation : arguments, optional
169 | Function which applies a nonlinearity
170 | Returns
171 | -------
172 | x : Tensor
173 | Fully-connected output.
174 | """
175 | shape = x.get_shape().as_list()
176 |
177 | with tf.variable_scope(scope or "Linear"):
178 | matrix = tf.get_variable("Matrix", [shape[1], n_units], tf.float32,
179 | tf.random_normal_initializer(stddev=stddev))
180 | return activation(tf.matmul(x, matrix))
181 |
182 | class ResidualFwd(Residual):
183 | forward_only = True
184 |
185 | if __name__ == "__main__":
186 | m = Residual()
187 | m.setup()
188 | m.run(runstep=default_runstep, n_steps=10)
189 | m.teardown()
190 |
--------------------------------------------------------------------------------
/fathom/autoenc/autoenc.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | from math import sqrt
4 |
5 | import tensorflow as tf
6 | import numpy as np
7 | import sklearn.preprocessing as prep
8 |
9 | from fathom.nn import NeuralNetworkModel, default_runstep
10 | import fathom.imagenet.mnist as input_data
11 |
12 | # TODO: create an unsupervised parent class
13 |
14 | def standard_scale(X_train, X_test):
15 | preprocessor = prep.StandardScaler().fit(X_train)
16 | X_train = preprocessor.transform(X_train)
17 | X_test = preprocessor.transform(X_test)
18 | return X_train, X_test
19 |
20 | # heavily based on tensorflow.models.autoencoder
21 | class AutoencBase(NeuralNetworkModel):
22 | """Basic Autoencoder (denoising optional)."""
23 | def load_data(self):
24 | # Grab the dataset from the internet, if necessary
25 | self.mnist = input_data.read_data_sets("/tmp/data/", one_hot=True)
26 | self.X_train, self.X_test = standard_scale(self.mnist.train.images, self.mnist.test.images)
27 |
28 | def build_hyperparameters(self):
29 | # Parameters
30 | self.learning_rate = 0.001
31 | self.batch_size = 128
32 | if self.init_options:
33 | self.batch_size = self.init_options.get('batch_size', self.batch_size)
34 | self.display_step = 1
35 |
36 | # Network Parameters
37 | self.n_hidden = 200
38 |
39 | # TODO: remove this data-specific stuff
40 | self.n_input = 784 # MNIST data input (img shape: 28*28)
41 |
42 | if not self.forward_only:
43 | self.scale = tf.placeholder(tf.float32)
44 | #self.keep_prob = tf.placeholder(tf.float32) # dropout (keep probability)
45 |
46 | def build_inputs(self):
47 | # tf Graph input
48 | self.xs = tf.placeholder(tf.float32, [None, self.n_input])
49 |
50 | @property
51 | def inputs(self):
52 | return self.xs
53 |
54 | @property
55 | def outputs(self):
56 | return self.reconstruction
57 |
58 | # TODO: remove labels methods upon creating unsupervised parent class
59 | def build_labels(self):
60 | # inputs are the ground truth
61 | pass
62 |
63 | @property
64 | def labels(self):
65 | # inputs are the ground truth
66 | return self.inputs
67 |
68 | def run(self, runstep=None, n_steps=1):
69 | self.load_data()
70 |
71 | with self.G.as_default():
72 | # %% We'll train in minibatches and report accuracy:
73 | self.epochs = 20
74 | self.display_step = 1
75 |
76 | if self.forward_only:
77 | self.epochs = 1
78 |
79 | for epoch in range(self.epochs):
80 | # TODO: re-enable options and metadata, which slow down the run
81 |
82 | total_batch = self.mnist.train.num_examples // self.batch_size
83 |
84 | avg_cost = 0
85 | for batch_i in range(total_batch):
86 | if batch_i >= n_steps:
87 | break
88 | #batch_xs = self.mnist.train.next_batch(self.batch_size)
89 | batch_xs = get_random_block_from_data(self.X_train, self.batch_size)
90 |
91 | # TODO: summary nodes
92 |
93 | if not self.forward_only:
94 | # train on batch
95 | _, loss_value = runstep(
96 | self.session,
97 | [self.train, self.loss],
98 | feed_dict={self.xs: batch_xs, self.scale: self.training_scale},
99 | #options=run_options, run_metadata=values
100 | )
101 | else:
102 | # run forward on train batch
103 | _ = runstep(
104 | self.session,
105 | self.outputs,
106 | feed_dict={self.xs: batch_xs}
107 | )
108 |
109 | if not self.forward_only:
110 | avg_cost += loss_value / self.mnist.train.num_examples * self.batch_size
111 |
112 | if epoch % self.display_step == 0:
113 | print('epoch:', epoch, 'cost:', avg_cost)
114 |
115 | print("Total cost:", self.calc_total_cost(self.X_test))
116 |
117 | def noisy_input(self, inputs, scale, dist=tf.random_normal):
118 | """Add scaled noise to input for denoising autoencoder."""
119 | with self.G.as_default():
120 | return inputs + scale * dist((self.n_input,))
121 |
122 | def build_inference(self, inputs, transfer_function=tf.nn.softplus, scale=0.1, denoising=True):
123 | with self.G.as_default():
124 | self.transfer = transfer_function
125 |
126 | self.training_scale = scale
127 |
128 | network_weights = self._initialize_weights()
129 | self.weights = network_weights
130 |
131 | if denoising and not self.forward_only:
132 | # add white noise to the input so the autoencoder learns to reconstruct from noise
133 | self.hidden = self.transfer(
134 | tf.matmul(self.noisy_input(self.xs, self.scale), self.weights['w1']) + self.weights['b1'])
135 | else:
136 | # learn to reconstruct the input alone
137 | self.hidden = self.transfer(tf.add(tf.matmul(self.xs, self.weights['w1']), self.weights['b1']))
138 |
139 | self.reconstruction = tf.add(tf.matmul(self.hidden, self.weights['w2']), self.weights['b2'])
140 |
141 | # for an autoencoder, the cost/loss is not just part of training
142 | self.build_loss(self.inputs, self.reconstruction)
143 |
144 | return self.reconstruction
145 |
146 | def build_loss(self, inputs, reconstruction):
147 | with self.G.as_default():
148 | self.loss_op = 0.5 * tf.reduce_sum(tf.pow(tf.subtract(reconstruction, inputs), 2.0))
149 | return self.loss_op
150 |
151 | @property
152 | def loss(self):
153 | return self.loss_op
154 |
155 | def build_train(self, total_loss):
156 | with self.G.as_default():
157 | opt = tf.train.AdamOptimizer()
158 |
159 | # Compute and apply gradients.
160 | self.train_op = opt.minimize(total_loss)#, global_step)
161 |
162 | return self.train_op
163 |
164 | @property
165 | def train(self):
166 | return self.train_op
167 |
168 | def _initialize_weights(self):
169 | all_weights = dict()
170 | all_weights['w1'] = tf.Variable(xavier_init(self.n_input, self.n_hidden))
171 | all_weights['b1'] = tf.Variable(tf.zeros([self.n_hidden], dtype=tf.float32))
172 | all_weights['w2'] = tf.Variable(tf.zeros([self.n_hidden, self.n_input], dtype=tf.float32))
173 | all_weights['b2'] = tf.Variable(tf.zeros([self.n_input], dtype=tf.float32))
174 | return all_weights
175 |
176 | def calc_total_cost(self, X):
177 | return self.session.run(self.loss, feed_dict = {self.xs: X, self.scale: self.training_scale})
178 |
179 | def transform(self, X):
180 | return self.session.run(self.hidden, feed_dict={self.xs: X, self.scale: self.training_scale})
181 |
182 | def generate(self, hidden = None):
183 | if hidden is None:
184 | hidden = np.random.normal(size=(1, self.n_hidden))  # sample a hidden code of the right shape
185 | return self.session.run(self.reconstruction, feed_dict={self.hidden: hidden})
186 |
187 | def reconstruct(self, X):
188 | return self.session.run(self.reconstruction, feed_dict={self.xs: X, self.scale: self.training_scale})
189 |
190 | def xavier_init(fan_in, fan_out, constant = 1):
191 | low = -constant * sqrt(6.0 / (fan_in + fan_out))
192 | high = constant * sqrt(6.0 / (fan_in + fan_out))
193 | return tf.random_uniform((fan_in, fan_out),
194 | minval = low, maxval = high,
195 | dtype = tf.float32)
196 |
197 | def get_random_block_from_data(data, batch_size):
198 | start_index = np.random.randint(0, len(data) - batch_size)
199 | return data[start_index:(start_index + batch_size)]
200 |
201 | class AutoencBaseFwd(AutoencBase):
202 | forward_only = True
203 |
204 | if __name__ == "__main__":
205 | m = AutoencBase()
206 | m.setup()
207 | m.run(runstep=default_runstep)
208 | m.teardown()
209 |
--------------------------------------------------------------------------------
/docs/quickstart.md:
--------------------------------------------------------------------------------
1 | # Installing Prerequisites
2 |
3 | Fathom requires a fair number of other software packages. TensorFlow is the obvious dependency, but there are also a number of support libraries, mostly used for data processing and ingest. Deep learning models operate on real data, so many of them have to do a substantial amount of work to turn raw inputs into a form they can process efficiently.
4 |
5 | ## TensorFlow
6 |
7 | - Python 2.6+
8 | - TensorFlow 1.x+ (the artifact for the paper required [TensorFlow 0.8.0rc0](https://github.com/tensorflow/tensorflow/releases/tag/v0.8.0rc0))
9 |
10 | For TensorFlow, you can either download a pre-built binary or build from source. The latter is more involved, but allows more flexibility in configuration (e.g., you can pass specific options to the underlying math libraries, which can affect performance).
11 |
12 | To build from source, you'll also need Bazel, Google's build system. Instructions can be found in the TensorFlow documentation.
13 |
14 | The TensorFlow API is changing rapidly, so it is possible for Fathom to break in small ways on untested versions of TensorFlow. These issues tend to involve package imports and renaming rather than fundamental differences, so feel free to submit a pull request if you fix one on your own.
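   |
   | If you're not sure which TensorFlow version you have, you can check it from Python:
   |
   | ```python
   | import tensorflow as tf
   |
   | # The paper artifact targeted 0.8.0rc0; later 1.x releases mostly work,
   | # modulo the small import/renaming issues mentioned above.
   | print(tf.__version__)
   | ```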
15 |
16 | ## Supporting libraries
17 |
18 | Fathom needs several other Python packages as well, mostly for pre-processing inputs. For all of these, you have your choice of installation method:
19 |
20 | - `apt-get` (or your favorite Linux distribution's package manager): a quick route, but be careful of versioning; distributions sometimes lag a fair ways behind in version numbers.
21 | - `pip`: the preferred package installer for Python.
22 | - `conda`: if you're using an Anaconda distribution of Python, this is probably your best bet for numpy, scipy, and scikit-learn. You'll need to use `pip` for librosa and tqdm, though (as Continuum doesn't support these packages).
23 |
24 | You'll want to install the following packages; a sample `pip` command is shown after the list. (You may have several of them installed already, and you shouldn't need to re-install them; Fathom doesn't use any fancy features.)
25 |
26 | - numpy (most models)
27 | - scipy (for scikit-learn)
28 | - scikit-learn ([MemNet](/models/#memnet), [Speech](/models/#speech), [Autoenc](/models/#autoenc))
29 | - six ([Seq2Seq](/models/#seq2seq))
30 | - librosa ([Speech](/models/#speech))
31 |
32 | - h5py* ([Speech](/models/#speech))
33 |
34 | *For h5py, you'll also need libhdf5, which is the C++ backend for interfacing with HDF5-formatted files. This is usually available as a Linux package, but [building from source](https://support.hdfgroup.org/downloads/index.html) is also fine. Any recent version should work. In Ubuntu, the package you're looking for is `libhdf5-dev`.
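   |
   | As one concrete route (a sketch, assuming a plain `pip`-based environment), the packages above, plus tqdm (used by the speech preprocessing scripts for progress bars), can be installed in one shot:
   |
   | ```sh
   | pip install numpy scipy scikit-learn six librosa h5py tqdm
   | ```
   |
   | Packages that are already installed are left alone.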
35 |
36 | ## Atari emulation
37 |
38 | [DeepQ](/models/#deepq) requires a bit more support than the other models, largely because it interacts directly with a running Atari emulator. Consequently, you'll need both the emulator itself and OpenCV to run it.
39 |
40 | The [Arcade Learning Environment (ALE)](http://www.arcadelearningenvironment.org/) is a clean, two-way interface between machine learning models and an Atari 2600 emulator. Installation instructions can be found in the [ALE Manual](https://github.com/mgbellemare/Arcade-Learning-Environment/raw/master/doc/manual/manual.pdf), but boil down to two steps: building the ALE C++ backend, and installing the python wrapper.
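   |
   | As a rough sketch of those two steps (the flags and paths here are illustrative; treat the ALE manual as authoritative):
   |
   | ```sh
   | git clone https://github.com/mgbellemare/Arcade-Learning-Environment.git
   | cd Arcade-Learning-Environment
   | cmake -DUSE_SDL=ON . && make
   | pip install .
   | ```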
41 |
42 |
43 | [OpenCV](http://opencv.org/) is a collection of image processing and computational geometry functions designed to support computer vision. You'll need both a 2.x version of the backend library and the Python interface wrapper. Many Linux distributions have a package for both (Ubuntu's are `libopencv-dev` and `python-opencv`), but you can also [build from source](http://docs.opencv.org/2.4.13/doc/tutorials/introduction/linux_install/linux_install.html) and then use `pip` to install the `opencv-python` wrapper.
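   |
   | On Ubuntu, for example, either route looks roughly like this (package names differ on other distributions):
   |
   | ```sh
   | # distribution packages: backend library plus python bindings
   | sudo apt-get install libopencv-dev python-opencv
   |
   | # or, after building the backend from source, install the python wrapper via pip
   | pip install opencv-python
   | ```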
44 |
45 | # Alternative: Quickstart via Docker
46 |
47 | If you don't need accurate performance numbers right away, we also provide a pre-built [Docker image](https://hub.docker.com/r/rdadolf/fathom/) to make it easy to get familiar with the Fathom workloads.
48 |
49 | If you're not familiar with Docker, you can think of it as a lightweight virtualization layer, similar to a VM but at a higher level of abstraction. Installation instructions can be found on the [docker website](https://www.docker.com/). To run the Fathom image interactively, use this:
50 |
51 | ```sh
52 | docker run -it rdadolf/fathom
53 | ```
54 |
55 | The image will be downloaded automatically from Docker Hub and launched, and you'll be given a shell prompt with the environment already set up.
56 |
57 | # Downloading Data
58 |
59 | *Documentation in progress*
60 |
61 | Fathom does not come with datasets suitable for training. This is a combination of size (realistic training sets are often massive) and licensing (an oft-repeated mantra is that good data is more valuable than a good model).
62 |
63 | Regardless, the inputs Fathom is designed for are standard and widely available:
64 |
65 | - [ImageNet](http://www.image-net.org/download-images) - requires registration, but downloads are free for non-commercial purposes.
66 | - [WMT15](http://www.statmt.org/wmt15/translation-task.html) - freely available online, and automatically downloaded by Fathom
67 | - [bAbI](https://research.facebook.com/research/babi/) - freely available online
68 | - [MNIST](http://yann.lecun.com/exdb/mnist/) - freely available online, and automatically downloaded by Fathom.
69 | - [TIMIT](https://catalog.ldc.upenn.edu/ldc93s1) - requires membership of the Linguistic Data Consortium (this is not free, but it is widely available in the research community).
70 | - Atari "Breakout" ROM - Technically not freely available. In practice, it is [available online](https://www.google.com/search?q=atari+breakout+rom). You can also legally obtain this by dumping the memory of an Atari 2600 running a copy of Breakout you bought.
71 |
72 | We eventually want to provide synthetic datasets that allow users to run Fathom out of the box without requiring the above downloads.
73 |
74 | # Running the Workloads
75 |
76 | Fathom is a Python library with command-line shims. To use Fathom, you'll need to tell your Python installation where to find it. The easiest way is to adjust your `PYTHONPATH` environment variable:
77 |
78 | ```sh
79 | $ git clone https://github.com/rdadolf/fathom.git
80 | $ export PYTHONPATH=`pwd`/fathom
81 | ```
82 |
83 | Once you've done that, you can either run the models directly (using the command-line shims):
84 |
85 | ```sh
86 | $ cd fathom
87 | $ ./fathom/seq2seq/seq2seq.py
88 | ```
89 |
90 | or you can use Fathom as a Python library directly in your scripts:
91 |
92 | ```python
93 | from fathom import Seq2seq
94 | model = Seq2seq()
95 | model.setup()
96 | model.run()
97 | ```
98 |
99 | ## ImageNet
100 |
101 | The ImageNet workflow is finicky to train, and the parameters and optimizations we have included do not reflect the state of the art (e.g., batch normalization is missing). Several users have reported issues running the training flow out of the box, and we are working on resolving them.
102 |
103 | If you do not want to download and set up ImageNet, then you can switch to using MNIST as provided in `fathom/imagenet/mnist.py`. Some of the models (e.g., VGG) may require modification because their convolutional kernels compress the smaller MNIST images too much.
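   |
   | As a minimal sketch (assuming `PYTHONPATH` is set up as described above), the MNIST helper can be exercised directly:
   |
   | ```python
   | from fathom.imagenet import mnist
   |
   | # downloads the MNIST archives into /tmp/data/ on first use
   | data = mnist.read_data_sets("/tmp/data/", one_hot=True)
   | images, labels = data.train.next_batch(128)
   | print(images.shape, labels.shape)  # (128, 784) and (128, 10)
   | ```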
104 |
105 | ## DeepQ
106 |
107 | Note: [DeepQ](/models/#deepq) currently looks for its ROMs relative to Fathom's root directory. In practice, this will cause problems if you don't run in that directory. We are working on a more general configuration interface, but in the meantime, you should feel free to modify the `ROM_PATH` variable in `fathom/deepq/emulator.py`.
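   |
   | Until then, the simplest workaround is to launch DeepQ from the repository root, assuming its shim follows the same layout as the other models (e.g. `./fathom/deepq/deepq.py`):
   |
   | ```sh
   | cd fathom
   | ./fathom/deepq/deepq.py
   | ```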
108 |
109 |
--------------------------------------------------------------------------------
/fathom/memnet/memnet.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | """Dominique Luna's implementation of End-to-End Memory Networks, refactored."""
4 |
5 | from functools import reduce
6 | from itertools import chain
7 |
8 | import tensorflow as tf
9 | import numpy as np
10 |
11 | from sklearn import cross_validation
12 | from fathom.nn import NeuralNetworkModel, default_runstep
13 | from .data_utils import load_task, vectorize_data
14 |
15 | data_dir = "/data/babi/tasks_1-20_v1-2/en/"
16 | task_id = 1
17 |
18 | class MemNet(NeuralNetworkModel):
19 | def build_inference(self, inputs):
20 | with self.G.as_default():
21 | self.encoding_op = tf.constant(self.encoding(self.sentence_size, self.embedding_size), name="encoding")
22 |
23 | # variables
24 | #with tf.variable_scope(self.name):
25 | nil_word_slot = tf.zeros([1, self.embedding_size])
26 | A = tf.concat(axis=0, values=[ nil_word_slot, self.initializer([self.vocab_size-1, self.embedding_size]) ])
27 | B = tf.concat(axis=0, values=[ nil_word_slot, self.initializer([self.vocab_size-1, self.embedding_size]) ])
28 | self.A = tf.Variable(A, name="A")
29 | self.B = tf.Variable(B, name="B")
30 |
31 | self.TA = tf.Variable(self.initializer([self.memory_size, self.embedding_size]), name='TA')
32 |
33 | self.H = tf.Variable(self.initializer([self.embedding_size, self.embedding_size]), name="H")
34 | self.W = tf.Variable(self.initializer([self.embedding_size, self.vocab_size]), name="W")
35 |
36 | #with tf.variable_scope(self.name):
37 | q_emb = tf.nn.embedding_lookup(self.B, self.queries)
38 | u_0 = tf.reduce_sum(q_emb * self.encoding_op, 1)
39 | u = [u_0]
40 | m_emb = tf.nn.embedding_lookup(self.A, self.stories)
41 | m = tf.reduce_sum(m_emb * self.encoding_op, 2) + self.TA
42 |
43 | # hop
44 | for hop_number in range(self.hops):
45 | with tf.name_scope('Hop_'+str(hop_number)):
46 | u_temp = tf.transpose(tf.expand_dims(u[-1], -1), [0, 2, 1])
47 | dotted = tf.reduce_sum(m * u_temp, 2)
48 |
49 | # Calculate probabilities
50 | probs = tf.nn.softmax(dotted)
51 |
52 | probs_temp = tf.transpose(tf.expand_dims(probs, -1), [0, 2, 1])
53 | c_temp = tf.transpose(m, [0, 2, 1])
54 | o_k = tf.reduce_sum(c_temp * probs_temp, 2)
55 |
56 | u_k = tf.matmul(u[-1], self.H) + o_k
57 |
58 | # nonlinearity
59 | if self.nonlin:
60 | u_k = self.nonlin(u_k)
61 |
62 | u.append(u_k)
63 |
64 | self.nil_vars = set([self.A.name, self.B.name])
65 |
66 | self._outputs = tf.matmul(u_k, self.W)
67 |
68 | return self._outputs
69 |
70 | @property
71 | def outputs(self):
72 | return self._outputs
73 |
74 | def build_loss(self, logits, labels):
75 | with self.G.as_default():
76 | with tf.name_scope('loss'):
77 | # Define loss
78 | # TODO: does this labels have unexpected state?
79 | self.loss_op = tf.reduce_sum(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=tf.cast(labels, tf.float32)))
80 | return self.loss_op
81 |
82 | @property
83 | def loss(self):
84 | return self.loss_op
85 |
86 | def build_train(self, total_loss):
87 | with self.G.as_default():
88 | self.opt = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
89 |
90 | # can't use opt.minimize because we need to clip the gradients
91 | grads_and_vars = self.opt.compute_gradients(self.loss)
92 | grads_and_vars = [(tf.clip_by_norm(g, self.max_grad_norm), v) for g,v in grads_and_vars]
93 | grads_and_vars = [(add_gradient_noise(g), v) for g,v in grads_and_vars]
94 | nil_grads_and_vars = []
95 | for g, v in grads_and_vars:
96 | if v.name in self.nil_vars:
97 | nil_grads_and_vars.append((zero_nil_slot(g), v))
98 | else:
99 | nil_grads_and_vars.append((g, v))
100 |
101 | self.train_op = self.opt.apply_gradients(nil_grads_and_vars, name="train_op")
102 |
103 | return self.train_op
104 | @property
105 | def train(self):
106 | return self.train_op
107 |
108 | def load_data(self):
109 | # single babi task
110 | # TODO: refactor all this running elsewhere
111 | # task data
112 | train, test = load_task(data_dir, task_id)
113 |
114 | vocab = sorted(reduce(lambda x, y: x | y, (set(list(chain.from_iterable(s)) + q + a) for s, q, a in train + test)))
115 | word_idx = dict((c, i + 1) for i, c in enumerate(vocab))
116 |
117 | self.memory_size = 50
118 |
119 | self.max_story_size = max(list(map(len, (s for s, _, _ in train + test))))
120 | self.mean_story_size = int(np.mean(list(map(len, (s for s, _, _ in train + test)))))
121 | self.sentence_size = max(list(map(len, chain.from_iterable(s for s, _, _ in train + test))))
122 | self.query_size = max(list(map(len, (q for _, q, _ in train + test))))
123 | self.memory_size = min(self.memory_size, self.max_story_size)
124 | self.vocab_size = len(word_idx) + 1 # +1 for nil word
125 | self.sentence_size = max(self.query_size, self.sentence_size) # for the position
126 |
127 | print("Longest sentence length", self.sentence_size)
128 | print("Longest story length", self.max_story_size)
129 | print("Average story length", self.mean_story_size)
130 |
131 | # train/validation/test sets
132 | self.S, self.Q, self.A = vectorize_data(train, word_idx, self.sentence_size, self.memory_size)
133 | self.trainS, self.valS, self.trainQ, self.valQ, self.trainA, self.valA = cross_validation.train_test_split(self.S, self.Q, self.A, test_size=.1) # TODO: randomstate
134 | self.testS, self.testQ, self.testA = vectorize_data(test, word_idx, self.sentence_size, self.memory_size)
135 |
136 | print(self.testS[0])
137 |
138 | print("Training set shape", self.trainS.shape)
139 |
140 | # params
141 | self.n_train = self.trainS.shape[0]
142 | self.n_test = self.testS.shape[0]
143 | self.n_val = self.valS.shape[0]
144 |
145 | print("Training Size", self.n_train)
146 | print("Validation Size", self.n_val)
147 | print("Testing Size", self.n_test)
148 |
149 | def build_hyperparameters(self):
150 | with self.G.as_default():
151 | # TODO: put these into runstep options or somewhere else
152 | # Parameters
153 | self.learning_rate = 0.01
154 | self.batch_size = 32
155 | if self.init_options:
156 | self.batch_size = self.init_options.get('batch_size', self.batch_size)
157 | self.embedding_size = 20
158 | self.hops = 3
159 | self.max_grad_norm = 40.0
160 | self.nonlin = None
161 | self.encoding = position_encoding
162 | self.display_step = 10
163 |
164 | def build_inputs(self):
165 | self.load_data() # TODO: get static numbers for the things that currently require loading and move this to run
166 |
167 | with self.G.as_default():
168 | # inputs
169 | self.stories = tf.placeholder(tf.int32, [None, self.memory_size, self.sentence_size], name="stories")
170 | self.queries = tf.placeholder(tf.int32, [None, self.sentence_size], name="queries")
171 |
172 | self.initializer = tf.random_normal_initializer(stddev=0.1)
173 |
174 | @property
175 | def inputs(self):
176 | return self.stories, self.queries
177 |
178 | def build_labels(self):
179 | with self.G.as_default():
180 | self.answers = tf.placeholder(tf.int32, [None, self.vocab_size], name="answers")
181 |
182 | @property
183 | def labels(self):
184 | return self.answers
185 |
186 | def run(self, runstep=None, n_steps=1):
187 | # load babi data
188 | # vocab, memory, sentence sizes set here
189 | # TODO: get static data size numbers and don't load in inputs anymore
190 | #self.load_data()
191 | #tf.set_random_seed(random_state)
192 |
193 | start = 0
194 | assert self.batch_size <= self.n_train
218 | if end > self.n_train:
219 | start,end = 0,self.batch_size
220 | else:
221 | start,end = end,end+self.batch_size
222 |
223 | acc = self.session.run(
224 | self.accuracy,
225 | feed_dict={self.stories: self.testS, self.queries: self.testQ, self.answers: self.testA}
226 | )
227 |
228 | print("Test accuracy: {:.5f}".format(acc))
229 |
230 | def position_encoding(sentence_size, embedding_size):
231 | """
232 | Position Encoding described in section 4.1 [1]
233 | """
234 | encoding = np.ones((embedding_size, sentence_size), dtype=np.float32)
235 | ls = sentence_size+1
236 | le = embedding_size+1
237 | for i in range(1, le):
238 | for j in range(1, ls):
239 | encoding[i-1, j-1] = (i - (le-1)/2) * (j - (ls-1)/2)
240 | encoding = 1 + 4 * encoding / embedding_size / sentence_size
241 | return np.transpose(encoding)
242 |
243 | def zero_nil_slot(t, name=None):
244 | """
245 | Overwrites the nil_slot (first row) of the input Tensor with zeros.
246 | The nil_slot is a dummy slot and should not be trained and influence
247 | the training algorithm.
248 | """
249 | with tf.name_scope(values=[t], name=name, default_name="zero_nil_slot") as name:
250 | t = tf.convert_to_tensor(t, name="t")
251 | s = tf.shape(t)[1]
252 | z = tf.zeros(tf.stack([1, s]))
253 | return tf.concat(axis=0, values=[z, tf.slice(t, [1, 0], [-1, -1])], name=name)
254 |
255 | def add_gradient_noise(t, stddev=1e-3, name=None):
256 | """
257 | Adds gradient noise as described in http://arxiv.org/abs/1511.06807 [2].
258 | The input Tensor `t` should be a gradient.
259 | The output will be `t` + gaussian noise.
260 | 0.001 was said to be a good fixed value for memory networks [2].
261 | """
262 | with tf.name_scope(values=[t, stddev], name=name, default_name="add_gradient_noise") as name:
263 | t = tf.convert_to_tensor(t, name="t")
264 | gn = tf.random_normal(tf.shape(t), stddev=stddev)
265 | return tf.add(t, gn, name=name)
266 |
267 | class MemNetFwd(MemNet):
268 | forward_only = True
269 |
270 | if __name__=='__main__':
271 | m = MemNet()
272 | m.setup()
273 | m.run(runstep=default_runstep, n_steps=100)
274 | m.teardown()
275 |
276 |
--------------------------------------------------------------------------------
/fathom/speech/speech.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import numpy as np
4 | import tensorflow as tf
5 |
6 | #from tensorflow.models.rnn import rnn, rnn_cell
7 | from tensorflow.python.ops import functional_ops
8 | from tensorflow.python.ops import variable_scope as vs
9 | from tensorflow.contrib.rnn.python.ops.rnn_cell import _linear
10 |
11 | from fathom.nn import NeuralNetworkModel, default_runstep
12 |
13 | from .preproc import load_timit, timit_hdf5_filepath
14 | from .phoneme import index2phoneme_dict
15 |
16 |
17 | def clipped_relu(inputs, clip=20):
18 | """Similar to tf.nn.relu6, but can clip at 20 as in Deep Speech."""
19 | return tf.minimum(tf.nn.relu(inputs), clip)
20 |
21 |
22 | class ClippedReluRNNCell(tf.contrib.rnn.RNNCell):
23 | """Basic RNN cell with clipped ReLU rather than tanh activation."""
24 |
25 | def __init__(self, num_units, input_size=None):
26 | self._num_units = num_units
27 |
28 | @property
29 | def state_size(self):
30 | return self._num_units
31 |
32 | @property
33 | def output_size(self):
34 | return self._num_units
35 |
36 | def __call__(self, inputs, state, scope=None):
37 | """Basic RNN: output = new_state = clipped_relu(W * input + U * state + B)."""
38 | with vs.variable_scope(scope or type(self).__name__):
39 | output = clipped_relu(_linear([inputs, state], self._num_units, True))
40 | return output, output
41 |
42 |
43 | # TODO: show label error rate
44 | # TODO: avoid labels and blank off-by-one error due to padding zeros
45 | class Speech(NeuralNetworkModel):
46 | """RNN for speech recognition."""
47 | def __init__(self, device=None, init_options=None):
48 | super(Speech,self).__init__(device=device, init_options=init_options)
49 |
50 | #def inference(self, inputs, n_hidden=2048):
51 | def build_inference(self, inputs, n_hidden=1024):
52 | with self.G.as_default():
53 | self.n_hidden = n_hidden
54 |
55 | # Architecture of Deep Speech [Hannun et al. 2014]
56 | outputs_1 = self.mlp_layer(inputs, self.n_coeffs, self.n_hidden)
57 | outputs_2 = self.mlp_layer(outputs_1, self.n_hidden, self.n_hidden)
58 | outputs_3 = self.mlp_layer(outputs_2, self.n_hidden, self.n_hidden)
59 | outputs_4 = self.bidirectional_layer(outputs_3, n_input=self.n_hidden, n_hidden=self.n_hidden, n_output=self.n_hidden)
60 | outputs_5 = self.mlp_layer(outputs_3, self.n_hidden, self.n_labels)  # NOTE: fed from outputs_3; the bidirectional outputs_4 is not consumed downstream
61 |
62 | self._outputs = outputs_5
63 |
64 | # transpose in preparation for CTC loss
65 | self.logits_t = tf.transpose(self._outputs, perm=[1,0,2])
66 |
67 | return outputs_5
68 |
69 | @property
70 | def outputs(self):
71 | return self._outputs
72 |
73 | @property
74 | def loss(self):
75 | return self.loss_op
76 |
77 | def build_loss(self, logits, labels):
78 | with self.G.as_default():
79 | # NOTE: CTC does the softmax for us, according to the code
80 |
81 | # CTC loss requires sparse labels
82 | self.sparse_labels = self.ctc_label_dense_to_sparse(self.labels, self.seq_lens)
83 |
84 | # CTC
85 | self.loss_op = tf.nn.ctc_loss(
86 | inputs=self.logits_t,
87 | labels=self.sparse_labels,
88 | sequence_length=self.seq_lens
89 | )
90 |
91 | return self.loss_op
92 |
93 | def build_train(self, loss):
94 | # TODO: buckets
95 | with self.G.as_default():
96 | self.train_op = tf.train.GradientDescentOptimizer(learning_rate=0.01).minimize(loss)
97 | return self.train_op
98 |
99 | @property
100 | def train(self):
101 | return self.train_op
102 |
103 | def mlp_layer(self, inputs, n_input, n_output):
104 | with self.G.as_default():
105 | # layer sees inputs as (batch_size, max_time, n_input)
106 | W = tf.Variable(tf.zeros([n_input, n_output]))
107 | b = tf.Variable(tf.zeros([n_output]))
108 |
109 | W_batch_multiples = tf.constant([self.batch_size, 1, 1], dtype=tf.int32)
110 | W_batch = tf.tile(tf.expand_dims(W, 0), W_batch_multiples)
111 |
112 | # TODO: is tiling a bias vector over batch and frames correct?
113 | b_batch_multiples = tf.constant([self.batch_size, self.max_frames, 1], dtype=tf.int32)
114 | b_batch = tf.tile(tf.expand_dims(tf.expand_dims(b, 0), 0), b_batch_multiples)
115 |
116 | # TODO: change batch_matmul to an averaging reshape so that batching happens and dimensions are easier
117 | outputs = tf.add(tf.matmul(inputs, W_batch), b_batch)
118 |
119 | return clipped_relu(outputs)
120 |
121 | def bidirectional_layer(self, inputs, n_input, n_hidden, n_output):
122 | """Bidirectional RNN layer."""
123 | with self.G.as_default():
124 | fw_cell = ClippedReluRNNCell(n_hidden)
125 | bw_cell = ClippedReluRNNCell(n_hidden)
126 |
127 | # input shape: (batch_size, max_time, n_input)
128 | inputs = tf.transpose(inputs, perm=[1, 0, 2]) # permute max_time and batch_size
129 | inputs = tf.reshape(inputs, [-1, n_input]) # (max_time*batch_size, n_input)
130 |
131 | inputs = tf.split(axis=0, num_or_size_splits=self.max_frames, value=inputs) # max_time * (batch_size, n_hidden)
132 |
133 | # optional initial states
134 | istate_fw = tf.placeholder("float", [None, n_hidden])
135 | istate_bw = tf.placeholder("float", [None, n_hidden])
136 |
137 | # TODO: support both tanh (default) and clipped_relu
138 | outputs, _, _ = tf.contrib.rnn.static_bidirectional_rnn(fw_cell, bw_cell, inputs, initial_state_fw=istate_fw, initial_state_bw=istate_bw)
139 |
140 | # TODO: is this the right output?
141 | return outputs[-1]
142 |
143 | def ctc_label_dense_to_sparse( self, labels, label_lengths ):
144 | """Mike Henry's implementation, with some minor modifications."""
145 | with self.G.as_default():
146 | label_shape = tf.shape( labels )
147 | num_batches_tns = tf.stack( [label_shape[0]] )
148 | max_num_labels_tns = tf.stack( [label_shape[1]] )
149 |
150 | def range_less_than(previous_state, current_input):
151 | return tf.expand_dims( tf.range( label_shape[1] ), 0 ) < current_input
152 |
153 | init = tf.cast( tf.fill( max_num_labels_tns, 0 ), tf.bool )
154 | init = tf.expand_dims( init, 0 )
155 | dense_mask = functional_ops.scan(range_less_than, label_lengths , initializer=init, parallel_iterations=1)
156 | dense_mask = dense_mask[ :, 0, : ]
157 |
158 | label_array = tf.reshape( tf.tile( tf.range( 0, label_shape[1] ), num_batches_tns ), label_shape )
159 | label_ind = tf.boolean_mask( label_array, dense_mask )
160 |
161 | batch_array = tf.transpose( tf.reshape( tf.tile( tf.range( 0, label_shape[0] ), max_num_labels_tns ), tf.reverse( label_shape,[0]) ) )
162 | batch_ind = tf.boolean_mask( batch_array, dense_mask )
163 |
164 | indices = tf.transpose( tf.reshape( tf.concat( axis=0, values=[batch_ind, label_ind] ), [2,-1] ) )
165 | vals_sparse = tf.gather_nd( labels, indices )
166 | return tf.SparseTensor( tf.to_int64(indices), vals_sparse, tf.to_int64( label_shape ) )
167 |
168 | def build_hyperparameters(self):
169 | self.n_labels = 61 + 1 # add blank
170 | self.max_frames = 1566 # TODO: compute dynamically
171 | self.max_labels = 75
172 | self.n_coeffs = 26
173 | self.batch_size = 32
174 | if self.init_options:
175 | self.batch_size = self.init_options.get('batch_size', self.batch_size)
176 |
177 | def build_inputs(self):
178 | with self.G.as_default():
179 | # NOTE: ctc_loss requires a transpose
180 | # tf.transpose(inputs,perm=[1,0,2])
181 | self._inputs = tf.placeholder(tf.float32, [None, self.max_frames, self.n_coeffs], name="inputs")
182 |
183 | @property
184 | def inputs(self):
185 | return self._inputs
186 |
187 | def build_labels(self):
188 | with self.G.as_default():
189 | self._labels = tf.placeholder(tf.int32, [None, self.max_labels], name="labels")
190 | self.seq_lens = tf.placeholder(tf.int32, [None], name="seq_lens")
191 |
192 | @property
193 | def labels(self):
194 | return self._labels
195 |
196 | def build(self):
197 | super(Speech, self).build()
198 |
199 | with self.G.as_default():
200 | self.decode_op = self.decoding()
201 |
202 | def load_data(self):
203 | self.train_spectrograms, self.train_labels, self.train_seq_lens = load_timit(timit_hdf5_filepath, train=True)
204 | # TODO: load test
205 |
206 | def get_random_batch(self):
207 | """Get random batch from np.arrays (not tf.train.shuffle_batch)."""
208 | n_examples = self.train_spectrograms.shape[0]
209 | random_sample = np.random.randint(n_examples, size=self.batch_size)
210 | return self.train_spectrograms[random_sample, :, :], self.train_labels[random_sample, :], self.train_seq_lens[random_sample]
211 |
212 | def decoding(self):
213 | """Predict labels from learned sequence model."""
214 | # TODO: label error rate on validation set
215 | decoded, _ = tf.nn.ctc_greedy_decoder(self.logits_t, self.seq_lens)
216 | sparse_decode_op = decoded[0] # single-element list
217 | self.decode_op = tf.sparse_to_dense(sparse_decode_op.indices, sparse_decode_op.dense_shape, sparse_decode_op.values)
218 | return self.decode_op
219 |
220 | def run(self, runstep=None, n_steps=1, *args, **kwargs):
221 | print("Loading spectrogram features...")
222 | self.load_data()
223 |
224 | with self.G.as_default():
225 | print('Starting run...')
226 | for _ in range(n_steps):
227 | spectrogram_batch, label_batch, seq_len_batch = self.get_random_batch()
228 |
229 | if not self.forward_only:
230 | _, _ = runstep(self.session,
231 | [self.train_op, self.loss_op],
232 | feed_dict={self.inputs: spectrogram_batch, self.labels: label_batch, self.seq_lens: seq_len_batch})
233 | else:
234 | # run forward-only on train batch
235 | _ = runstep(self.session,
236 | self.outputs,
237 | feed_dict={self.inputs: spectrogram_batch, self.labels: label_batch, self.seq_lens: seq_len_batch})
238 |
239 | # decode the same batch, for debugging
240 | decoded = self.session.run(self.decode_op,
241 | feed_dict={self.inputs: spectrogram_batch, self.labels: label_batch, self.seq_lens: seq_len_batch})
242 |
243 | # print some decoded examples
244 | if False:
245 | print(' '.join(self.labels2phonemes(decoded[0])))
246 | # TODO: fix dtypes in dataset (labels are accidentally floats right now)
247 | print(' '.join(self.labels2phonemes(np.array(label_batch[0,:], dtype=np.int32))))
248 |
249 | def labels2phonemes(self, decoded_labels):
250 | """Convert a list of label indices to a list of corresponding phonemes."""
251 | return [index2phoneme_dict[label] for label in decoded_labels]
252 |
253 | class SpeechFwd(Speech):
254 | forward_only = True
255 |
256 | if __name__=='__main__':
257 | m = Speech()
258 | m.setup()
259 | m.run(runstep=default_runstep, n_steps=10)
260 | m.teardown()
261 |
--------------------------------------------------------------------------------
/fathom/speech/preproc.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | """
3 | Convert TIMIT audio files into spectral coefficients.
4 | """
5 |
6 | import logging
7 | import os
8 | import fnmatch
9 |
10 | import numpy as np
11 | import h5py
12 | import librosa
13 | import sklearn.preprocessing
14 | from tqdm import tqdm  # progress bars in load_transcriptions below
15 | from .phoneme import timit_phonemes, phoneme2index_list, phoneme2index_dict
16 |
17 | # global config: load from previous saved dataset if True, else recompute
18 | load_features = False
19 |
20 | # TODO: configurable path to /data/speech/timit/
21 | timit_dir = '/data/speech/timit/TIMIT/'
22 | timit_hdf5_filepath = '/data/speech/timit/timit.hdf5'
23 |
24 | train_name, test_name = 'train', 'test'
25 | train_dir = os.path.join(timit_dir, train_name.upper())
26 | test_dir = os.path.join(timit_dir, test_name.upper())
27 |
28 |
29 | # simple logging
30 | logger = logging.getLogger('TIMIT')
31 | logger.setLevel(logging.INFO)
32 |
33 | ch = logging.StreamHandler()
34 | ch.setLevel(logging.DEBUG)
35 | formatter = logging.Formatter('%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p')
36 | ch.setFormatter(formatter)
37 | logger.addHandler(ch)
38 |
39 |
40 | def recursive_glob_ext(dirpath, ext):
41 | """Recursively find files with an extension in a TIMIT directory."""
42 | return [os.path.splitext(os.path.join(dirpath, filename))[0] # remove extension
43 | for dirpath, _, files in os.walk(dirpath)
44 | for filename in fnmatch.filter(files, '*.{}'.format(ext))]
45 |
46 |
47 | def mfcc_features(filename):
48 | """Preprocessing per CTC paper.
49 |
50 | (These are not the simpler linear spectrogram features alone as in Deep
51 | Speech).
52 |
53 | Properties:
54 | - 10ms frames with 5ms overlap
55 | - 12 MFCCs with 26 filter banks
56 | - replace first MFCC with energy (TODO: log-energy)
57 | - add first-order derivatives for all of the above
58 | - total: 26 coefficients
59 | """
60 | d, sr = librosa.load(filename)
61 |
62 | frame_length_seconds = 0.010
63 | frame_overlap_seconds = 0.005
64 |
65 | mfccs = librosa.feature.mfcc(d, sr, n_mfcc=1+12, n_fft=int(frame_length_seconds*sr), hop_length=int(frame_overlap_seconds*sr))
66 |
67 | # energy (TODO: log?)
68 | energy = librosa.feature.rmse(d, n_fft=int(frame_length_seconds*sr), hop_length=int(frame_overlap_seconds*sr))
69 |
70 | mfccs[0] = energy # replace first MFCC with energy, per convention
71 |
72 | deltas = librosa.feature.delta(mfccs, order=1)
73 | mfccs_plus_deltas = np.vstack([mfccs, deltas])
74 |
75 | coeffs = sklearn.preprocessing.scale(mfccs_plus_deltas, axis=1)
76 |
77 | return coeffs
78 |
79 |
80 | def dirpath2dataset(dirpath):
81 | """Convert a TIMIT dirpath to a dataset.
82 |
83 | The filename alone is not unique.
84 |
85 | e.g., TIMIT/TRAIN/DR8/MMPM0/SX251.WAV => MMPM0/SX251.WAV
86 | """
87 | if not '/' in dirpath:
88 | raise Exception("not a valid TIMIT dirpath")
89 |
90 | dataset_name = '/'.join(dirpath.split('/')[-2:])
91 | return dataset_name
92 |
93 |
94 | def phoneme_transcription(phoneme_filename):
95 | phoneme_column = -1
96 | # we can discard the first two columns, which provide the time alignment
97 | transcription = [line.split()[phoneme_column].strip() for line in open(phoneme_filename)]
98 | return transcription
99 |
100 |
101 | def verify_phonemes(timit_phoneme_set, transcription_phoneme_set):
102 | """Make sure every pre-specified phoneme was seen in data, and the converse."""
103 | for phoneme in transcription_phoneme_set:
104 | if phoneme not in timit_phoneme_set:
105 | logger.error(phoneme + ' not in TIMIT phonemes')
106 |
107 | for phoneme in timit_phoneme_set:
108 | if phoneme not in transcription_phoneme_set:
109 | logger.error(phoneme + ' not in transcribed phonemes')
110 |
111 |
112 | def compute_spectrograms(audio_filenames):
113 | """Extract spectrogram features from each audio file."""
114 | features_list = []
115 | audio_ext = ".WAV"
116 |
117 | for audio_basename in audio_filenames:
118 | # recompute spectrogram features
119 | # FIXME: on interrupt, kill the thread which librosa launches via audioread
120 | feature_vector = mfcc_features(audio_basename + audio_ext)
121 | features_list.append(feature_vector)
122 |
123 | return features_list
124 |
125 |
126 | def load_precomputed_spectrograms(filepath):
127 | """Load precomputed spectrogram features to save time."""
128 | features_list = []
129 | # TODO: this HDF5 group structure is outdated, recompute and save a new one
130 | with h5py.File(filepath, 'r') as hf:
131 | for g in hf['utterances']:
132 | for dataset in hf['utterances'][g]:
133 | data = np.array(hf['utterances'][g][dataset])
134 | features_list.append(data)
135 |
136 | return features_list
137 |
138 |
139 | def load_timit(filepath, train=True):
140 | # TODO: load test also
141 | with h5py.File(filepath, 'r') as hf:
142 | train_spectrograms = np.array(hf['timit']['train']['spectrograms'])
143 | train_labels = np.array(hf['timit']['train']['labels'])
144 | train_seq_lens = np.array(hf['timit']['train']['seq_lens'])
145 |
146 | return train_spectrograms, train_labels, train_seq_lens
147 |
148 |
149 | def save_feature_dataset(audio_filenames, spectrograms, seq_lens, phoneme2index_list, labels, filepath, overwrite=False):
150 | """Save computed features for TIMIT.
151 |
152 | Args:
153 | - maps from subset kinds 'train' and 'test' to corresponding data:
154 | - audio_filenames: list of basepaths to TIMIT examples
155 | - spectrograms: np.array((n_examples, max_frames, n_coeffs))
156 | - n_examples: number of TIMIT examples (e.g., train=4206)
157 | - max_frames: the most frames in any example
158 | - n_coeffs: number of spectrogram features (e.g., 26 with 12 MFCCs, one
159 | energy, and their 13 deltas)
160 | - seq_lens: number of labels in each target sequence (<= max_labels)
161 | - labels: np.array((n_examples, max_labels))
162 | - max_labels: the most labels in any example (e.g., train=75)
163 | - phoneme2index_list: a map from phoneme strings (e.g., 'sh') to indices,
164 | ordered as in TIMIT PHONCODE.DOC
165 | """
166 | if overwrite:
167 | file_mode = 'w'
168 | else:
169 | file_mode = 'w-' # fail if file exists
170 |
171 | with h5py.File(filepath, file_mode) as hf:
172 | timit = hf.create_group('timit')
173 |
174 | train_name = 'train'
175 | test_name = 'test'
176 |
177 | train = timit.create_group(train_name)
178 | test = timit.create_group(test_name)
179 |
180 | for subset_kind, subset_dataset in [(train_name, train), (test_name, test)]:
181 | # (n_examples,)
182 | subset_dataset.create_dataset('example_paths', dtype="S100", data=np.array(audio_filenames[subset_kind]))
183 |
184 | # (n_examples, max_frames, n_coeffs)
185 | subset_dataset.create_dataset('spectrograms', data=spectrograms[subset_kind])
186 |
187 | # (n_examples,)
188 | subset_dataset.create_dataset('seq_lens', data=seq_lens[subset_kind])
189 |
190 | # (n_examples, max_labels)
191 | label_dataset = subset_dataset.create_dataset('labels', data=labels[subset_kind])
192 |
193 | # store phoneme <-> index mapping in HDF5 attributes to avoid numpy structured arrays
194 | # indices are per order in TIMIT phoncode.doc
195 | for phoneme, index in phoneme2index_list:
196 | label_dataset.attrs[phoneme] = index
197 |
198 | # NOTE: because we don't use '1' and '2' as TIMIT phonemes, there
199 | # shouldn't be any collisions with the indices '1' and '2' when we put
200 | # both into the same dict as strings
201 | label_dataset.attrs[str(index)] = phoneme
202 |
203 |
204 | def index_labels(phoneme2index_dict, timit_transcriptions, max_labels):
205 | """Convert TIMIT transcriptions to integer np.array of indices."""
206 | labels = np.empty((len(timit_transcriptions), max_labels))
207 | seq_lens = np.empty((len(timit_transcriptions),))
208 | for i, transcription in enumerate(timit_transcriptions):
209 | index_transcription = [phoneme2index_dict[phoneme] for phoneme in transcription]
210 | labels[i,:len(transcription)] = index_transcription
211 | seq_lens[i] = len(index_transcription)
212 |
213 | return labels, seq_lens
214 |
215 |
216 | def build_spectrogram_array(features_list, n_examples, max_frames, n_coeffs):
217 | """Convert list of ragged spectrograms to np.array with list of lens."""
218 | spectrograms = np.empty((n_examples, max_frames, n_coeffs))
219 |
220 | for i, feature_vector in enumerate(features_list):
221 | example_frames = feature_vector.shape[1]
222 | spectrograms[i,:example_frames,:] = feature_vector.T
223 |
224 | return spectrograms
225 |
226 |
227 | def load_transcriptions(audio_filenames):
228 | """Load list of phoneme transcriptions.
229 |
230 | Each phoneme transcription is a list of phonemes without time alignments.
231 | """
232 | phoneme_ext = ".PHN"
233 | transcriptions = []
234 | for audio_basename in tqdm(audio_filenames):
235 | # obtain list of phonemes, discarding time-alignment
236 | tr = phoneme_transcription(audio_basename + phoneme_ext)
237 | transcriptions.append(tr)
238 |
239 | return transcriptions
240 |
241 |
242 | def phoneme_set(transcriptions):
243 | """Reduce list of lists of phonemes to a set of phonemes."""
244 | transcription_phonemes = set()
245 | for transcription in transcriptions:
246 | for phoneme in transcription:
247 | transcription_phonemes.add(phoneme)
248 |
249 | return transcription_phonemes
250 |
251 |
252 | if __name__ == "__main__":
253 | logger.info("Starting to preprocess TIMIT audio data.")
254 | logger.info("Walking TIMIT data directory...")
255 |
256 | audio_filenames = {}
257 | spectrograms = {}
258 | seq_lens = {}
259 | labels = {}
260 |
261 | for subset_kind, subset_dir in [(train_name, train_dir), (test_name, test_dir)]:
262 | subset_audio_filenames = recursive_glob_ext(subset_dir, ext="WAV")
263 |
264 | logger.info("Loading phoneme transcriptions for {}...".format(subset_kind))
265 | subset_transcriptions = load_transcriptions(subset_audio_filenames)
266 |
267 | # sanity check
268 | verify_phonemes(set(timit_phonemes), phoneme_set(subset_transcriptions))
269 |
270 | subset_features_list = []
271 | if load_features:
272 | logger.info("Loading precomputed spectrograms for {}...".format(subset_kind))
273 | features_list = load_precomputed_spectrograms(filepath='/data/speech/timit/mfcc-timit.hdf5')
274 | else:
275 | logger.info("Computing spectrograms for {}...".format(subset_kind))
276 | subset_features_list = compute_spectrograms(subset_audio_filenames)
277 |
278 | # compute sizes for np.arrays
279 | n_examples = len(subset_features_list)
280 | max_frames = max(feature_vector.shape[1] for feature_vector in subset_features_list)
281 | n_coeffs = subset_features_list[0].shape[0] # same for all
282 | max_labels = max(len(transcription) for transcription in subset_transcriptions)
283 |
284 | logger.info("Building label array by indexing labels from transcriptions for {}...".format(subset_kind))
285 | subset_labels, subset_seq_lens = index_labels(phoneme2index_dict, subset_transcriptions, max_labels)
286 |
287 | logger.info("Building spectrogram array for {}...".format(subset_kind))
288 | subset_spectrograms = build_spectrogram_array(subset_features_list, n_examples, max_frames, n_coeffs)
289 |
290 | # store for later saving
291 | audio_filenames[subset_kind] = subset_audio_filenames
292 | spectrograms[subset_kind] = subset_spectrograms
293 | labels[subset_kind] = subset_labels
294 | seq_lens[subset_kind] = subset_seq_lens
295 |
296 | logger.info("Finished preprocessing {}.".format(subset_kind))
297 |
298 | logger.info("Saving HDF5 train/test dataset...")
299 | save_feature_dataset(audio_filenames, spectrograms, seq_lens, phoneme2index_list, labels, filepath=timit_hdf5_filepath)
300 |
301 | logger.info("Finished.")
302 |
303 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "{}"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright 2016-2017, the President and Fellows of Harvard College
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/fathom/seq2seq/data_utils.py:
--------------------------------------------------------------------------------
1 | # Copyright 2015 Google Inc. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 |
16 | """Utilities for downloading data from WMT, tokenizing, vocabularies."""
17 |
18 |
19 |
20 |
21 | import gzip
22 | import os
23 | import re
24 | import tarfile
25 |
26 | from six.moves import urllib
27 |
28 | from tensorflow.python.platform import gfile
29 |
30 | # Special vocabulary symbols - we always put them at the start.
31 | _PAD = b"_PAD"
32 | _GO = b"_GO"
33 | _EOS = b"_EOS"
34 | _UNK = b"_UNK"
35 | _START_VOCAB = [_PAD, _GO, _EOS, _UNK]
36 |
37 | PAD_ID = 0
38 | GO_ID = 1
39 | EOS_ID = 2
40 | UNK_ID = 3
41 |
42 | # Regular expressions used to tokenize.
43 | _WORD_SPLIT = re.compile(b"([.,!?\"':;)(])")
44 | _DIGIT_RE = re.compile(br"\d")
45 |
46 | # URLs for WMT data.
47 | _WMT_ENFR_TRAIN_URL = "http://www.statmt.org/wmt10/training-giga-fren.tar"
48 | _WMT_ENFR_DEV_URL = "http://www.statmt.org/wmt15/dev-v2.tgz"
49 |
50 |
51 | def maybe_download(directory, filename, url):
52 | """Download filename from url unless it's already in directory."""
53 | if not os.path.exists(directory):
54 | print("Creating directory %s" % directory)
55 | os.makedirs(directory)
56 | filepath = os.path.join(directory, filename)
57 | if not os.path.exists(filepath):
58 | print("Downloading %s to %s" % (url, filepath))
59 | filepath, _ = urllib.request.urlretrieve(url, filepath)
60 | statinfo = os.stat(filepath)
61 | print("Succesfully downloaded", filename, statinfo.st_size, "bytes")
62 | return filepath
63 |
64 |
65 | def gunzip_file(gz_path, new_path):
66 | """Unzips from gz_path into new_path."""
67 | print("Unpacking %s to %s" % (gz_path, new_path))
68 | with gzip.open(gz_path, "rb") as gz_file:
69 | with open(new_path, "wb") as new_file:
70 | for line in gz_file:
71 | new_file.write(line)
72 |
73 |
74 | def get_wmt_enfr_train_set(directory):
75 | """Download the WMT en-fr training corpus to directory unless it's there."""
76 | train_path = os.path.join(directory, "giga-fren.release2.fixed")
77 | if not (gfile.Exists(train_path +".fr") and gfile.Exists(train_path +".en")):
78 | corpus_file = maybe_download(directory, "training-giga-fren.tar",
79 | _WMT_ENFR_TRAIN_URL)
80 | print("Extracting tar file %s" % corpus_file)
81 | with tarfile.open(corpus_file, "r") as corpus_tar:
82 | corpus_tar.extractall(directory)
83 | gunzip_file(train_path + ".fr.gz", train_path + ".fr")
84 | gunzip_file(train_path + ".en.gz", train_path + ".en")
85 | return train_path
86 |
87 |
88 | def get_wmt_enfr_dev_set(directory):
89 | """Download the WMT en-fr training corpus to directory unless it's there."""
90 | dev_name = "newstest2013"
91 | dev_path = os.path.join(directory, dev_name)
92 | if not (gfile.Exists(dev_path + ".fr") and gfile.Exists(dev_path + ".en")):
93 | dev_file = maybe_download(directory, "dev-v2.tgz", _WMT_ENFR_DEV_URL)
94 | print("Extracting tgz file %s" % dev_file)
95 | with tarfile.open(dev_file, "r:gz") as dev_tar:
96 | fr_dev_file = dev_tar.getmember("dev/" + dev_name + ".fr")
97 | en_dev_file = dev_tar.getmember("dev/" + dev_name + ".en")
98 | fr_dev_file.name = dev_name + ".fr" # Extract without "dev/" prefix.
99 | en_dev_file.name = dev_name + ".en"
100 | dev_tar.extract(fr_dev_file, directory)
101 | dev_tar.extract(en_dev_file, directory)
102 | return dev_path
103 |
104 |
105 | def basic_tokenizer(sentence):
106 | """Very basic tokenizer: split the sentence into a list of tokens."""
107 | words = []
108 | for space_separated_fragment in sentence.strip().split():
109 | words.extend(re.split(_WORD_SPLIT, space_separated_fragment))
110 | return [w for w in words if w]
111 |
112 |
113 | def create_vocabulary(vocabulary_path, data_path, max_vocabulary_size,
114 | tokenizer=None, normalize_digits=True):
115 | """Create vocabulary file (if it does not exist yet) from data file.
116 |
117 | Data file is assumed to contain one sentence per line. Each sentence is
118 | tokenized and digits are normalized (if normalize_digits is set).
119 | Vocabulary contains the most-frequent tokens up to max_vocabulary_size.
120 | We write it to vocabulary_path in a one-token-per-line format, so that the
121 | token on the first line gets id=0, the token on the second line gets id=1, and so on.
122 |
123 | Args:
124 | vocabulary_path: path where the vocabulary will be created.
125 | data_path: data file that will be used to create vocabulary.
126 | max_vocabulary_size: limit on the size of the created vocabulary.
127 | tokenizer: a function to use to tokenize each data sentence;
128 | if None, basic_tokenizer will be used.
129 | normalize_digits: Boolean; if true, all digits are replaced by 0s.
130 | """
131 | if not gfile.Exists(vocabulary_path):
132 | print("Creating vocabulary %s from data %s" % (vocabulary_path, data_path))
133 | vocab = {}
134 | with gfile.GFile(data_path, mode="rb") as f:
135 | counter = 0
136 | for line in f:
137 | counter += 1
138 | if counter % 100000 == 0:
139 | print(" processing line %d" % counter)
140 | tokens = tokenizer(line) if tokenizer else basic_tokenizer(line)
141 | for w in tokens:
142 | word = re.sub(_DIGIT_RE, b"0", w) if normalize_digits else w
143 | if word in vocab:
144 | vocab[word] += 1
145 | else:
146 | vocab[word] = 1
147 | vocab_list = _START_VOCAB + sorted(vocab, key=vocab.get, reverse=True)
148 | if len(vocab_list) > max_vocabulary_size:
149 | vocab_list = vocab_list[:max_vocabulary_size]
150 | with gfile.GFile(vocabulary_path, mode="wb") as vocab_file:
151 | for w in vocab_list:
152 | vocab_file.write(w + b"\n")
153 |
154 |
155 | def initialize_vocabulary(vocabulary_path):
156 | """Initialize vocabulary from file.
157 |
158 | We assume the vocabulary is stored one-item-per-line, so a file:
159 | dog
160 | cat
161 | will result in a vocabulary {"dog": 0, "cat": 1}, and this function will
162 | also return the reversed-vocabulary ["dog", "cat"].
163 |
164 | Args:
165 | vocabulary_path: path to the file containing the vocabulary.
166 |
167 | Returns:
168 | a pair: the vocabulary (a dictionary mapping string to integers), and
169 | the reversed vocabulary (a list, which reverses the vocabulary mapping).
170 |
171 | Raises:
172 | ValueError: if the provided vocabulary_path does not exist.
173 | """
174 | if gfile.Exists(vocabulary_path):
175 | rev_vocab = []
176 | with gfile.GFile(vocabulary_path, mode="rb") as f:
177 | rev_vocab.extend(f.readlines())
178 | rev_vocab = [line.strip() for line in rev_vocab]
179 | vocab = dict([(x, y) for (y, x) in enumerate(rev_vocab)])
180 | return vocab, rev_vocab
181 | else:
182 | raise ValueError("Vocabulary file %s not found." % vocabulary_path)
183 |
184 |
185 | def sentence_to_token_ids(sentence, vocabulary,
186 | tokenizer=None, normalize_digits=True):
187 | """Convert a string to list of integers representing token-ids.
188 |
189 | For example, a sentence "I have a dog" may become tokenized into
190 | ["I", "have", "a", "dog"] and with vocabulary {"I": 1, "have": 2,
191 | "a": 4, "dog": 7"} this function will return [1, 2, 4, 7].
192 |
193 | Args:
194 | sentence: the sentence in bytes format to convert to token-ids.
195 | vocabulary: a dictionary mapping tokens to integers.
196 | tokenizer: a function to use to tokenize each sentence;
197 | if None, basic_tokenizer will be used.
198 | normalize_digits: Boolean; if true, all digits are replaced by 0s.
199 |
200 | Returns:
201 | a list of integers, the token-ids for the sentence.
202 | """
203 |
204 | if tokenizer:
205 | words = tokenizer(sentence)
206 | else:
207 | words = basic_tokenizer(sentence)
208 | if not normalize_digits:
209 | return [vocabulary.get(w, UNK_ID) for w in words]
210 | # Replace digits with 0 before looking words up in the vocabulary.
211 | return [vocabulary.get(re.sub(_DIGIT_RE, b"0", w), UNK_ID) for w in words]
212 |
213 |
214 | def data_to_token_ids(data_path, target_path, vocabulary_path,
215 | tokenizer=None, normalize_digits=True):
216 | """Tokenize data file and turn into token-ids using given vocabulary file.
217 |
218 | This function loads data line-by-line from data_path, calls the above
219 | sentence_to_token_ids, and saves the result to target_path. See comment
220 | for sentence_to_token_ids on the details of token-ids format.
221 |
222 | Args:
223 | data_path: path to the data file in one-sentence-per-line format.
224 | target_path: path where the file with token-ids will be created.
225 | vocabulary_path: path to the vocabulary file.
226 | tokenizer: a function to use to tokenize each sentence;
227 | if None, basic_tokenizer will be used.
228 | normalize_digits: Boolean; if true, all digits are replaced by 0s.
229 | """
230 | if not gfile.Exists(target_path):
231 | print("Tokenizing data in %s" % data_path)
232 | vocab, _ = initialize_vocabulary(vocabulary_path)
233 | with gfile.GFile(data_path, mode="rb") as data_file:
234 | with gfile.GFile(target_path, mode="w") as tokens_file:
235 | counter = 0
236 | for line in data_file:
237 | counter += 1
238 | if counter % 100000 == 0:
239 | print(" tokenizing line %d" % counter)
240 | token_ids = sentence_to_token_ids(line, vocab, tokenizer,
241 | normalize_digits)
242 | tokens_file.write(" ".join([str(tok) for tok in token_ids]) + "\n")
243 |
244 |
245 | def prepare_wmt_data(data_dir, en_vocabulary_size, fr_vocabulary_size, tokenizer=None):
246 | """Get WMT data into data_dir, create vocabularies and tokenize data.
247 |
248 | Args:
249 | data_dir: directory in which the data sets will be stored.
250 | en_vocabulary_size: size of the English vocabulary to create and use.
251 | fr_vocabulary_size: size of the French vocabulary to create and use.
252 | tokenizer: a function to use to tokenize each data sentence;
253 | if None, basic_tokenizer will be used.
254 |
255 | Returns:
256 | A tuple of 6 elements:
257 | (1) path to the token-ids for English training data-set,
258 | (2) path to the token-ids for French training data-set,
259 | (3) path to the token-ids for English development data-set,
260 | (4) path to the token-ids for French development data-set,
261 | (5) path to the English vocabulary file,
262 | (6) path to the French vocabulary file.
263 | """
264 | # Get wmt data to the specified directory.
265 | train_path = get_wmt_enfr_train_set(data_dir)
266 | dev_path = get_wmt_enfr_dev_set(data_dir)
267 |
268 | # Create vocabularies of the appropriate sizes.
269 | fr_vocab_path = os.path.join(data_dir, "vocab%d.fr" % fr_vocabulary_size)
270 | en_vocab_path = os.path.join(data_dir, "vocab%d.en" % en_vocabulary_size)
271 | create_vocabulary(fr_vocab_path, train_path + ".fr", fr_vocabulary_size, tokenizer)
272 | create_vocabulary(en_vocab_path, train_path + ".en", en_vocabulary_size, tokenizer)
273 |
274 | # Create token ids for the training data.
275 | fr_train_ids_path = train_path + (".ids%d.fr" % fr_vocabulary_size)
276 | en_train_ids_path = train_path + (".ids%d.en" % en_vocabulary_size)
277 | data_to_token_ids(train_path + ".fr", fr_train_ids_path, fr_vocab_path, tokenizer)
278 | data_to_token_ids(train_path + ".en", en_train_ids_path, en_vocab_path, tokenizer)
279 |
280 | # Create token ids for the development data.
281 | fr_dev_ids_path = dev_path + (".ids%d.fr" % fr_vocabulary_size)
282 | en_dev_ids_path = dev_path + (".ids%d.en" % en_vocabulary_size)
283 | data_to_token_ids(dev_path + ".fr", fr_dev_ids_path, fr_vocab_path, tokenizer)
284 | data_to_token_ids(dev_path + ".en", en_dev_ids_path, en_vocab_path, tokenizer)
285 |
286 | return (en_train_ids_path, fr_train_ids_path,
287 | en_dev_ids_path, fr_dev_ids_path,
288 | en_vocab_path, fr_vocab_path)
289 |
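Taken together, the functions above form a small text pipeline: download the WMT corpora, build fixed-size vocabularies, and convert sentences to token ids. A minimal usage sketch follows, assuming the package layout of this repository and hypothetical file paths; the printed ids depend entirely on the corpus.

from fathom.seq2seq import data_utils

# Hypothetical paths; any plain-text file with one sentence per line will do.
data_utils.create_vocabulary("/tmp/vocab40000.en", "/tmp/corpus.en", 40000)
vocab, rev_vocab = data_utils.initialize_vocabulary("/tmp/vocab40000.en")

# Sentences are handled as bytes, matching the b"..." constants above;
# digits are replaced by 0 before lookup, and unknown words map to UNK_ID.
ids = data_utils.sentence_to_token_ids(b"I have 2 dogs .", vocab)
print(ids)
print([rev_vocab[i] for i in ids])
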
--------------------------------------------------------------------------------
/fathom/deepq/deepq.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # NOTE: Based on Tejas Kulkarni's implementation
3 | # (https://github.com/mrkulk/deepQN_tensorflow).
4 | import time
5 | import datetime
6 | import sys  # used by the checkpoint-save logging below
7 | import tensorflow as tf
8 | import numpy as np
9 | import cv2
10 |
11 | from fathom.nn import GenericModel, default_runstep
12 |
13 | from .database import *
14 | from .emulator import *
15 |
16 | # TODO: clean up this file
17 | nature_params = {
18 | 'game': 'breakout',
19 | 'window_name': "NNModel: Deep Q-Learning for Atari",
20 | 'frameskip': 1,
21 | 'visualize' : False,
22 | 'network_type':'nips',
23 | 'ckpt_file':None,
24 | 'steps_per_epoch': 50000,
25 | 'num_epochs': 100,
26 | 'eval_freq':50000,
27 | 'steps_per_eval':10000,
28 | 'copy_freq' : 10000,
29 | 'disp_freq':10000,
30 | 'save_interval':10000,
31 | 'db_size': 1000000,
32 | 'batch': 32,
33 | 'num_act': 0,
34 | 'input_dims' : [210, 160, 3],
35 | 'input_dims_proc' : [84, 84, 4],
36 | 'learning_interval': 1,
37 | 'eps': 1.0,
38 | 'eps_step':1000000,
39 | 'eps_min' : 0.1,
40 | 'eps_eval' : 0.05,
41 | 'discount': 0.95,
42 | 'lr': 0.0002,
43 | 'rms_decay':0.99,
44 | 'rms_eps':1e-6,
45 | 'train_start':100, # default: 100
46 | 'img_scale':255.0,
47 | 'clip_delta' : 0, #nature : 1
48 | 'gpu_fraction' : 0.25,
49 | 'batch_accumulator':'mean',
50 | 'record_eval' : True,
51 | 'only_eval' : 'n'
52 | }
53 |
54 | nature_params['steps_per_epoch']= 200000
55 | nature_params['eval_freq'] = 100000
56 | nature_params['steps_per_eval'] = 10000
57 | nature_params['copy_freq'] = 10000
58 | nature_params['disp_freq'] = 20000
59 | nature_params['save_interval'] = 20000
60 | #nature_params['learning_interval'] = 1
61 | nature_params['discount'] = 0.99
62 | nature_params['lr'] = 0.00025
63 | nature_params['rms_decay'] = 0.95
64 | nature_params['rms_eps']=0.01
65 | nature_params['clip_delta'] = 1.0
66 | #nature_params['train_start']=50000
67 | nature_params['batch_accumulator'] = 'sum'
68 | nature_params['eps_step'] = 1000000
69 | nature_params['num_epochs'] = 250
70 | nature_params['batch'] = 32
71 |
72 | # NOTE: the Fathom model interface is implemented by the DeepQ class below,
73 | # which combines this Q-network with a separate target network; this class is a single net.
74 | class DeepQNetNature(object):
75 | """Q-learning network which approximates action-value and action-value targets."""
76 | def __init__(self, params, parent_graph):
77 | self.G = parent_graph
78 | self.build(params)
79 |
80 | def build(self, params):
81 | with self.G.as_default():
82 | self.network_type = 'nature'
83 | self.params = params
84 | self.network_name = "deepqnet"
85 | self.x = tf.placeholder('float32',[None,84,84,4],name=self.network_name + '_x')
86 | self.q_t = tf.placeholder('float32',[None],name=self.network_name + '_q_t')
87 | self.actions = tf.placeholder("float32", [None, params['num_act']],name=self.network_name + '_actions')
88 | self.rewards = tf.placeholder("float32", [None],name=self.network_name + '_rewards')
89 | self.terminals = tf.placeholder("float32", [None],name=self.network_name + '_terminals')
90 |
91 | #conv1
92 | layer_name = 'conv1' ; size = 8 ; channels = 4 ; filters = 32 ; stride = 4
93 | self.w1 = tf.Variable(tf.random_normal([size,size,channels,filters], stddev=0.01),name=self.network_name + '_'+layer_name+'_weights')
94 | self.b1 = tf.Variable(tf.constant(0.1, shape=[filters]),name=self.network_name + '_'+layer_name+'_biases')
95 | self.c1 = tf.nn.conv2d(self.x, self.w1, strides=[1, stride, stride, 1], padding='VALID',name=self.network_name + '_'+layer_name+'_convs')
96 | self.o1 = tf.nn.relu(tf.add(self.c1,self.b1),name=self.network_name + '_'+layer_name+'_activations')
97 | #self.n1 = tf.nn.lrn(self.o1, 4, bias=1.0, alpha=0.001 / 9.0, beta=0.75)
98 |
99 | #conv2
100 | layer_name = 'conv2' ; size = 4 ; channels = 32 ; filters = 64 ; stride = 2
101 | self.w2 = tf.Variable(tf.random_normal([size,size,channels,filters], stddev=0.01),name=self.network_name + '_'+layer_name+'_weights')
102 | self.b2 = tf.Variable(tf.constant(0.1, shape=[filters]),name=self.network_name + '_'+layer_name+'_biases')
103 | self.c2 = tf.nn.conv2d(self.o1, self.w2, strides=[1, stride, stride, 1], padding='VALID',name=self.network_name + '_'+layer_name+'_convs')
104 | self.o2 = tf.nn.relu(tf.add(self.c2,self.b2),name=self.network_name + '_'+layer_name+'_activations')
105 | #self.n2 = tf.nn.lrn(self.o2, 4, bias=1.0, alpha=0.001 / 9.0, beta=0.75)
106 |
107 | #conv3
108 | layer_name = 'conv3' ; size = 3 ; channels = 64 ; filters = 64 ; stride = 1
109 | self.w3 = tf.Variable(tf.random_normal([size,size,channels,filters], stddev=0.01),name=self.network_name + '_'+layer_name+'_weights')
110 | self.b3 = tf.Variable(tf.constant(0.1, shape=[filters]),name=self.network_name + '_'+layer_name+'_biases')
111 | self.c3 = tf.nn.conv2d(self.o2, self.w3, strides=[1, stride, stride, 1], padding='VALID',name=self.network_name + '_'+layer_name+'_convs')
112 | self.o3 = tf.nn.relu(tf.add(self.c3,self.b3),name=self.network_name + '_'+layer_name+'_activations')
113 | #self.n2 = tf.nn.lrn(self.o2, 4, bias=1.0, alpha=0.001 / 9.0, beta=0.75)
114 |
115 | #flat
116 | o3_shape = self.o3.get_shape().as_list()
117 |
118 | #fc4
119 | layer_name = 'fc4' ; hiddens = 512 ; dim = o3_shape[1]*o3_shape[2]*o3_shape[3]
120 | self.o3_flat = tf.reshape(self.o3, [-1,dim],name=self.network_name + '_'+layer_name+'_input_flat')
121 | self.w4 = tf.Variable(tf.random_normal([dim,hiddens], stddev=0.01),name=self.network_name + '_'+layer_name+'_weights')
122 | self.b4 = tf.Variable(tf.constant(0.1, shape=[hiddens]),name=self.network_name + '_'+layer_name+'_biases')
123 | self.ip4 = tf.add(tf.matmul(self.o3_flat,self.w4),self.b4,name=self.network_name + '_'+layer_name+'_ips')
124 | self.o4 = tf.nn.relu(self.ip4,name=self.network_name + '_'+layer_name+'_activations')
125 |
126 | #fc5
127 | layer_name = 'fc5' ; hiddens = params['num_act'] ; dim = 512
128 | self.w5 = tf.Variable(tf.random_normal([dim,hiddens], stddev=0.01),name=self.network_name + '_'+layer_name+'_weights')
129 | self.b5 = tf.Variable(tf.constant(0.1, shape=[hiddens]),name=self.network_name + '_'+layer_name+'_biases')
130 | self.y = tf.add(tf.matmul(self.o4,self.w5),self.b5,name=self.network_name + '_'+layer_name+'_outputs')
131 |
132 | #Q,Cost,Optimizer
133 | self.discount = tf.constant(self.params['discount'])
134 | self.yj = tf.add(self.rewards, tf.multiply(1.0-self.terminals, tf.multiply(self.discount, self.q_t)))
135 | self.Qxa = tf.multiply(self.y,self.actions)
136 | self.Q_pred = tf.reduce_max(self.Qxa, axis=1)
137 | #self.yjr = tf.reshape(self.yj,(-1,1))
138 | #self.yjtile = tf.concat(1,[self.yjr,self.yjr,self.yjr,self.yjr])
139 | #self.yjax = tf.mul(self.yjtile,self.actions)
140 |
141 | #half = tf.constant(0.5)
142 | self.diff = tf.subtract(self.yj, self.Q_pred)
143 | if self.params['clip_delta'] > 0 :
144 | self.quadratic_part = tf.minimum(tf.abs(self.diff), tf.constant(self.params['clip_delta']))
145 | self.linear_part = tf.subtract(tf.abs(self.diff),self.quadratic_part)
146 | self.diff_square = 0.5 * tf.pow(self.quadratic_part,2) + self.params['clip_delta']*self.linear_part
147 |
148 | else:
149 | self.diff_square = tf.multiply(tf.constant(0.5),tf.pow(self.diff, 2))
150 | # add optimization
151 |
152 | self.loss()
153 | self.train()
154 |
155 | def loss(self):
156 | with self.G.as_default():
157 | if self.params['batch_accumulator'] == 'sum':
158 | self.cost = tf.reduce_sum(self.diff_square)
159 | else:
160 | self.cost = tf.reduce_mean(self.diff_square)
161 |
162 | def train(self):
163 | with self.G.as_default():
164 | self.global_step = tf.Variable(0, name='global_step', trainable=False)
165 | self.rmsprop = tf.train.RMSPropOptimizer(self.params['lr'],self.params['rms_decay'],0.0,self.params['rms_eps']).minimize(self.cost,global_step=self.global_step)
166 | return self.rmsprop
167 |
168 | class DeepQ(GenericModel):
169 | """Deep Q-Learning."""
170 | forward_only = False
171 |
172 | def __init__(self, device=None, init_options=None, game=nature_params['game']):
173 | super(DeepQ,self).__init__(device=device, init_options=init_options)
174 | assert game in ["breakout", "space_invaders", "seaquest"]
175 |
176 | self.G = tf.Graph()
177 |
178 | # NOTE: moved tf.Graph construction to setup
179 | self.params = nature_params
180 |
181 | self.DB = database(self.params)
182 | self.engine = emulator(rom_name='{}.bin'.format(game), vis=self.params['visualize'], frameskip=self.params['frameskip'], windowname=self.params['window_name'])
183 | #self.engine = emulator(rom_name='{}.bin'.format(game), vis=self.params['visualize'], frameskip=self.params['frameskip'], windowname=self.params['window_name'])
184 | self.params['num_act'] = len(self.engine.legal_actions)
185 |
186 | with self.G.device(device):
187 | self.build_inference()
188 |
189 | def build_inference(self):
190 | with self.G.as_default():
191 | print('Building QNet and targetnet...')
192 | self.qnet = DeepQNetNature(self.params, self.G)
193 | self.targetnet = DeepQNetNature(self.params, self.G)
194 | saver_dict = {'qw1':self.qnet.w1,'qb1':self.qnet.b1,
195 | 'qw2':self.qnet.w2,'qb2':self.qnet.b2,
196 | 'qw3':self.qnet.w3,'qb3':self.qnet.b3,
197 | 'qw4':self.qnet.w4,'qb4':self.qnet.b4,
198 | 'qw5':self.qnet.w5,'qb5':self.qnet.b5,
199 | 'tw1':self.targetnet.w1,'tb1':self.targetnet.b1,
200 | 'tw2':self.targetnet.w2,'tb2':self.targetnet.b2,
201 | 'tw3':self.targetnet.w3,'tb3':self.targetnet.b3,
202 | 'tw4':self.targetnet.w4,'tb4':self.targetnet.b4,
203 | 'tw5':self.targetnet.w5,'tb5':self.targetnet.b5,
204 | 'step':self.qnet.global_step}
205 |
206 | print("#ops", len(self.G.get_operations()))
207 |
208 | self.saver = tf.train.Saver(saver_dict)
209 | #self.saver = tf.train.Saver()
210 |
211 | self.cp_ops = [
212 | self.targetnet.w1.assign(self.qnet.w1),self.targetnet.b1.assign(self.qnet.b1),
213 | self.targetnet.w2.assign(self.qnet.w2),self.targetnet.b2.assign(self.qnet.b2),
214 | self.targetnet.w3.assign(self.qnet.w3),self.targetnet.b3.assign(self.qnet.b3),
215 | self.targetnet.w4.assign(self.qnet.w4),self.targetnet.b4.assign(self.qnet.b4),
216 | self.targetnet.w5.assign(self.qnet.w5),self.targetnet.b5.assign(self.qnet.b5)]
217 |
218 | if self.params['ckpt_file'] is not None:
219 | print('loading checkpoint : ' + self.params['ckpt_file'])
220 | self.saver.restore(self.sess,self.params['ckpt_file'])
221 | temp_train_cnt = self.sess.run(self.qnet.global_step)
222 | temp_step = temp_train_cnt * self.params['learning_interval']
223 | print('Continue from')
224 | print(' -> Steps : ' + str(temp_step))
225 | print(' -> Minibatch update : ' + str(temp_train_cnt))
226 |
227 | def model(self):
228 | return self.G
229 |
230 | def setup(self, setup_options=None):
231 | super(DeepQ,self).setup(setup_options=setup_options)
232 | with self.G.as_default():
233 | if setup_options is None:
234 | self.setup_config = tf.ConfigProto(gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=self.params['gpu_fraction']))
235 | else:
236 | self.setup_config = tf.ConfigProto(**setup_options)
237 | self.setup_config.gpu_options.per_process_gpu_memory_fraction=self.params['gpu_fraction']
238 |
239 | self.sess = tf.Session(config=self.setup_config)
240 | self.init = tf.global_variables_initializer()
241 | self.sess.run(self.init)
242 | self.sess.run(self.cp_ops)
243 |
244 | self.reset_game()
245 | self.step = 0
246 | self.reset_statistics('all')
247 | self.train_cnt = self.sess.run(self.qnet.global_step)
248 |
249 | def reset_game(self):
250 | self.state_proc = np.zeros((84,84,4)); self.action = -1; self.terminal = False; self.reward = 0
251 | self.state = self.engine.newGame()
252 | self.state_resized = cv2.resize(self.state,(84,110))
253 | self.state_gray = cv2.cvtColor(self.state_resized, cv2.COLOR_BGR2GRAY)
254 | self.state_gray_old = None
255 | self.state_proc[:,:,3] = self.state_gray[26:110,:]/self.params['img_scale']
256 |
257 | def reset_statistics(self, mode):
258 | if mode == 'all':
259 | self.epi_reward_train = 0
260 | self.epi_Q_train = 0
261 | self.num_epi_train = 0
262 | self.total_reward_train = 0
263 | self.total_Q_train = 0
264 | self.total_cost_train = 0
265 | self.steps_train = 0
266 | self.train_cnt_for_disp = 0
267 | self.step_eval = 0
268 | self.epi_reward_eval = 0
269 | self.epi_Q_eval = 0
270 | self.num_epi_eval = 0
271 | self.total_reward_eval = 0
272 | self.total_Q_eval = 0
273 |
274 | def select_action(self, st, runstep=None):
275 | with self.G.as_default():
276 | if np.random.rand() > self.params['eps']:
277 | #greedy with random tie-breaking
278 | if not self.forward_only:
279 | Q_pred = self.sess.run(self.qnet.y, feed_dict = {self.qnet.x: np.reshape(st, (1,84,84,4))})[0]
280 | else:
281 | Q_pred = runstep(self.sess, self.qnet.y, feed_dict = {self.qnet.x: np.reshape(st, (1,84,84,4))})[0]
282 |
283 | a_winner = np.argwhere(Q_pred == np.amax(Q_pred))
284 | if len(a_winner) > 1:
285 | act_idx = a_winner[np.random.randint(0, len(a_winner))][0]
286 | return act_idx,self.engine.legal_actions[act_idx], np.amax(Q_pred)
287 | else:
288 | act_idx = a_winner[0][0]
289 | return act_idx,self.engine.legal_actions[act_idx], np.amax(Q_pred)
290 | else:
291 | #random
292 | act_idx = np.random.randint(0,len(self.engine.legal_actions))
293 | if not self.forward_only:
294 | Q_pred = self.sess.run(self.qnet.y, feed_dict = {self.qnet.x: np.reshape(st, (1,84,84,4))})[0]
295 | else:
296 | Q_pred = runstep(self.sess, self.qnet.y, feed_dict = {self.qnet.x: np.reshape(st, (1,84,84,4))})[0]
297 | return act_idx,self.engine.legal_actions[act_idx], Q_pred[act_idx]
298 |
299 | def get_onehot(self,actions):
300 | actions_onehot = np.zeros((self.params['batch'], self.params['num_act']))
301 |
302 | for i in range(self.params['batch']):
303 | actions_onehot[i,int(actions[i])] = 1
304 | return actions_onehot
305 |
306 | def run(self, runstep=default_runstep, n_steps=1):
307 | self.s = time.time()
308 | print(self.params)
309 | print('Start training!')
310 | print('Collecting replay memory for ' + str(self.params['train_start']) + ' steps')
311 |
312 | with self.G.as_default():
313 | while self.step < (self.params['steps_per_epoch'] * self.params['num_epochs'] * self.params['learning_interval'] + self.params['train_start']):
314 | if not self.forward_only:
315 | if self.step >= n_steps:
316 | return
317 | if self.DB.get_size() >= self.params['train_start'] : self.step += 1 ; self.steps_train += 1
318 | else:
319 | if self.step_eval >= n_steps:
320 | return
321 | self.step_eval += 1
322 | if self.state_gray_old is not None and not self.forward_only:
323 | self.DB.insert(self.state_gray_old[26:110,:],self.reward_scaled,self.action_idx,self.terminal)
324 |
325 | if not self.forward_only and self.params['copy_freq'] > 0 and self.step % self.params['copy_freq'] == 0 and self.DB.get_size() > self.params['train_start']:
326 | print('&&& Copying Qnet to targetnet\n')
327 | self.sess.run(self.cp_ops)
328 |
329 | if not self.forward_only and self.step % self.params['learning_interval'] == 0 and self.DB.get_size() > self.params['train_start'] :
330 | bat_s,bat_a,bat_t,bat_n,bat_r = self.DB.get_batches()
331 | bat_a = self.get_onehot(bat_a)
332 |
333 | if self.params['copy_freq'] > 0 :
334 | feed_dict={self.targetnet.x: bat_n}
335 | q_t = self.sess.run(self.targetnet.y,feed_dict=feed_dict)
336 | else:
337 | feed_dict={self.qnet.x: bat_n}
338 | q_t = self.sess.run(self.qnet.y,feed_dict=feed_dict)
339 |
340 | q_t = np.amax(q_t,axis=1)
341 |
342 | feed_dict={self.qnet.x: bat_s, self.qnet.q_t: q_t, self.qnet.actions: bat_a, self.qnet.terminals:bat_t, self.qnet.rewards: bat_r}
343 |
344 | # NOTE: we only runstep the Qnet
345 | _,self.train_cnt,self.cost = runstep(self.sess, [self.qnet.rmsprop,self.qnet.global_step,self.qnet.cost],feed_dict=feed_dict)
346 |
347 | self.total_cost_train += np.sqrt(self.cost)
348 | self.train_cnt_for_disp += 1
349 |
350 | if not self.forward_only:
351 | self.params['eps'] = max(self.params['eps_min'],1.0 - float(self.train_cnt * self.params['learning_interval'])/float(self.params['eps_step']))
352 | else:
353 | self.params['eps'] = 0.05
354 |
355 | if self.DB.get_size() > self.params['train_start'] and self.step % self.params['save_interval'] == 0 and not self.forward_only:
356 | save_idx = self.train_cnt
357 | self.saver.save(self.sess,'ckpt/model_'+self.params['network_type']+'_'+str(save_idx))
358 | sys.stdout.write('$$$ Model saved : %s\n\n' % ('ckpt/model_'+self.params['network_type']+'_'+str(save_idx)))
359 | sys.stdout.flush()
360 |
361 | if not self.forward_only and self.step > 0 and self.step % self.params['eval_freq'] == 0 and self.DB.get_size() > self.params['train_start']:
362 | self.reset_game()
363 | if self.step % self.params['steps_per_epoch'] == 0 : self.reset_statistics('all')
364 | else: self.reset_statistics('eval')
365 | self.forward_only = True
366 | #TODO : add video recording
367 | continue
368 | if not self.forward_only and self.step > 0 and self.step % self.params['steps_per_epoch'] == 0 and self.DB.get_size() > self.params['train_start']:
369 | self.reset_game()
370 | self.reset_statistics('all')
371 | #self.forward_only = True
372 | continue
373 |
374 | if self.forward_only and self.step_eval >= self.params['steps_per_eval'] :
375 | self.reset_game()
376 | self.reset_statistics('eval')
377 | self.forward_only = False
378 | continue
379 |
380 | if self.terminal:
381 | self.reset_game()
382 | if not self.forward_only:
383 | self.num_epi_train += 1
384 | self.total_reward_train += self.epi_reward_train
385 | self.epi_reward_train = 0
386 | else:
387 | self.num_epi_eval += 1
388 | self.total_reward_eval += self.epi_reward_eval
389 | self.epi_reward_eval = 0
390 | continue
391 |
392 | self.action_idx,self.action, self.maxQ = self.select_action(self.state_proc, runstep=runstep)
393 | self.state, self.reward, self.terminal = self.engine.next(self.action)
394 | self.reward_scaled = self.reward // max(1,abs(self.reward))
395 | if not self.forward_only : self.epi_reward_train += self.reward ; self.total_Q_train += self.maxQ
396 | else : self.epi_reward_eval += self.reward ; self.total_Q_eval += self.maxQ
397 |
398 | self.state_gray_old = np.copy(self.state_gray)
399 | self.state_proc[:,:,0:3] = self.state_proc[:,:,1:4]
400 | self.state_resized = cv2.resize(self.state,(84,110))
401 | self.state_gray = cv2.cvtColor(self.state_resized, cv2.COLOR_BGR2GRAY)
402 | self.state_proc[:,:,3] = self.state_gray[26:110,:]/self.params['img_scale']
403 |
404 | print("Finished step {0} ({1})".format(self.step_eval, datetime.datetime.now()))
405 |
406 | @property
407 | def loss(self):
408 | return self.qnet.cost
409 |
410 | @property
411 | def train(self):
412 | return self.qnet.rmsprop
413 |
414 | @property
415 | def labels(self):
416 | return
417 |
418 | @property
419 | def inputs(self):
420 | return self.qnet.x, self.qnet.q_t, self.qnet.actions, self.qnet.rewards, self.qnet.terminals
421 |
422 | @property
423 | def outputs(self):
424 | return self.qnet.y # just outputs, not predictions
425 |
426 | def teardown(self):
427 | if self.sess is not None:
428 | self.sess.close()
429 | self.sess = None
430 |
431 | class DeepQFwd(DeepQ):
432 | forward_only = True
433 |
434 | if __name__=='__main__':
435 | m = DeepQ()
436 | m.setup()
437 | m.run(runstep=default_runstep, n_steps=100)
438 | m.teardown()
439 |
440 |
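When clip_delta > 0, the cost above is the Huber-style clipped TD error from the Nature DQN setup: quadratic for small errors, linear beyond clip_delta, so outliers do not dominate the gradient. Below is a standalone NumPy sketch of that per-sample loss, an illustration of the formula in the clip_delta branch rather than code from the model.

import numpy as np

def clipped_td_loss(diff, clip_delta=1.0):
    # Quadratic inside [-clip_delta, clip_delta], linear outside -- mirrors
    # quadratic_part / linear_part / diff_square in DeepQNetNature.build().
    quadratic = np.minimum(np.abs(diff), clip_delta)
    linear = np.abs(diff) - quadratic
    return 0.5 * quadratic ** 2 + clip_delta * linear

td_errors = np.array([0.2, -0.7, 3.0])
print(clipped_td_loss(td_errors))  # summed when batch_accumulator == 'sum', averaged otherwise
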
--------------------------------------------------------------------------------
/fathom/imagenet/image_processing.py:
--------------------------------------------------------------------------------
1 | # Copyright 2016 Google Inc. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 | """Read and preprocess image data.
16 |
17 | Image processing occurs on a single image at a time. Images are read and
18 | preprocessed in parallel across multiple threads. The resulting images
19 | are concatenated together to form a single batch for training or evaluation.
20 |
21 | -- Provide processed image data for a network:
22 | inputs: Construct batches of evaluation examples of images.
23 | distorted_inputs: Construct batches of training examples of images.
24 | batch_inputs: Construct batches of training or evaluation examples of images.
25 |
26 | -- Data processing:
27 | parse_example_proto: Parses an Example proto containing a training example
28 | of an image.
29 |
30 | -- Image decoding:
31 | decode_jpeg: Decode a JPEG encoded string into a 3-D float32 Tensor.
32 |
33 | -- Image preprocessing:
34 | image_preprocessing: Decode and preprocess one image for evaluation or training
35 | distort_image: Distort one image for training a network.
36 | eval_image: Prepare one image for evaluation.
37 | distort_color: Distort the color in one image for training.
38 | """
39 |
40 |
41 |
42 |
43 |
44 | import tensorflow as tf
45 |
46 | FLAGS = tf.app.flags.FLAGS
47 |
48 | tf.app.flags.DEFINE_integer('batch_size', 32,
49 | """Number of images to process in a batch.""")
50 | tf.app.flags.DEFINE_integer('image_size', 224,
51 | """Provide square images of this size.""")
52 | tf.app.flags.DEFINE_integer('num_preprocess_threads', 4,
53 | """Number of preprocessing threads per tower. """
54 | """Please make this a multiple of 4.""")
55 |
56 | # Images are preprocessed asynchronously using multiple threads specified by
57 | # --num_preprocess_threads and the resulting processed images are stored in a
58 | # random shuffling queue. The shuffling queue dequeues --batch_size images
59 | # for processing on a given Inception tower. A larger shuffling queue guarantees
60 | # better mixing across examples within a batch and results in slightly higher
61 | # predictive performance in a trained model. Empirically,
62 | # --input_queue_memory_factor=16 works well. A value of 16 implies a queue size
63 | # of 1024*16 images. Assuming RGB 299x299 images, this implies a queue size of
64 | # 16GB. If the machine is memory limited, then decrease this factor to
65 | # decrease the CPU memory footprint, accordingly.
66 | tf.app.flags.DEFINE_integer('input_queue_memory_factor', 1,
67 | """Size of the queue of preprocessed images. """
68 | """Default is ideal but try smaller values, e.g. """
69 | """4, 2 or 1, if host memory is constrained. See """
70 | """comments in code for more details.""")
71 |
72 |
73 | def inputs(dataset, batch_size=None, num_preprocess_threads=None):
74 | """Generate batches of ImageNet images for evaluation.
75 |
76 | Use this function as the inputs for evaluating a network.
77 |
78 | Note that some (minimal) image preprocessing occurs during evaluation
79 | including central cropping and resizing of the image to fit the network.
80 |
81 | Args:
82 | dataset: instance of Dataset class specifying the dataset.
83 | batch_size: integer, number of examples in batch
84 | num_preprocess_threads: integer, total number of preprocessing threads but
85 | None defaults to FLAGS.num_preprocess_threads.
86 |
87 | Returns:
88 | images: Images. 4D tensor of size [batch_size, FLAGS.image_size,
89 | image_size, 3].
90 | labels: 1-D integer Tensor of [FLAGS.batch_size].
91 | """
92 | if not batch_size:
93 | batch_size = FLAGS.batch_size
94 |
95 | # Force all input processing onto CPU in order to reserve the GPU for
96 | # the forward inference and back-propagation.
97 | with tf.device('/cpu:0'):
98 | images, labels = batch_inputs(
99 | dataset, batch_size, train=False,
100 | num_preprocess_threads=num_preprocess_threads)
101 |
102 | return images, labels
103 |
104 |
105 | def distorted_inputs(dataset, batch_size=None, num_preprocess_threads=None):
106 | """Generate batches of distorted versions of ImageNet images.
107 |
108 | Use this function as the inputs for training a network.
109 |
110 | Distorting images provides a useful technique for augmenting the data
111 | set during training in order to make the network invariant to aspects
112 | of the image that do not affect the label.
113 |
114 | Args:
115 | dataset: instance of Dataset class specifying the dataset.
116 | batch_size: integer, number of examples in batch
117 | num_preprocess_threads: integer, total number of preprocessing threads but
118 | None defaults to FLAGS.num_preprocess_threads.
119 |
120 | Returns:
121 | images: Images. 4D tensor of size [batch_size, FLAGS.image_size,
122 | FLAGS.image_size, 3].
123 | labels: 1-D integer Tensor of [batch_size].
124 | """
125 | if not batch_size:
126 | batch_size = FLAGS.batch_size
127 |
128 | # Force all input processing onto CPU in order to reserve the GPU for
129 | # the forward inference and back-propagation.
130 | with tf.device('/cpu:0'):
131 | images, labels = batch_inputs(
132 | dataset, batch_size, train=True,
133 | num_preprocess_threads=num_preprocess_threads)
134 | return images, labels
135 |
136 |
137 | def decode_jpeg(image_buffer, scope=None):
138 | """Decode a JPEG string into one 3-D float image Tensor.
139 |
140 | Args:
141 | image_buffer: scalar string Tensor.
142 | scope: Optional scope for op_scope.
143 | Returns:
144 | 3-D float Tensor with values ranging from [0, 1).
145 | """
146 | with tf.name_scope(values=[image_buffer], name=scope, default_name='decode_jpeg'):
147 | # Decode the string as an RGB JPEG.
148 | # Note that the resulting image contains an unknown height and width
149 | # that is set dynamically by decode_jpeg. In other words, the height
150 | # and width of the image are unknown at compile-time.
151 | image = tf.image.decode_jpeg(image_buffer, channels=3)
152 |
153 | # After this point, all image pixels reside in [0,1)
154 | # until the very end, when they're rescaled to (-1, 1). The various
155 | # adjust_* ops all require this range for dtype float.
156 | image = tf.image.convert_image_dtype(image, dtype=tf.float32)
157 | return image
158 |
159 |
160 | def distort_color(image, thread_id=0, scope=None):
161 | """Distort the color of the image.
162 |
163 | Each color distortion is non-commutative and thus ordering of the color ops
164 | matters. Ideally we would randomly permute the ordering of the color ops.
165 | Rather than adding that level of complication, we select a distinct ordering
166 | of color ops for each preprocessing thread.
167 |
168 | Args:
169 | image: Tensor containing single image.
170 | thread_id: preprocessing thread ID.
171 | scope: Optional scope for op_scope.
172 | Returns:
173 | color-distorted image
174 | """
175 | with tf.name_scope(values=[image], name=scope, default_name='distort_color'):
176 | color_ordering = thread_id % 2
177 |
178 | if color_ordering == 0:
179 | image = tf.image.random_brightness(image, max_delta=32. / 255.)
180 | image = tf.image.random_saturation(image, lower=0.5, upper=1.5)
181 | image = tf.image.random_hue(image, max_delta=0.2)
182 | image = tf.image.random_contrast(image, lower=0.5, upper=1.5)
183 | elif color_ordering == 1:
184 | image = tf.image.random_brightness(image, max_delta=32. / 255.)
185 | image = tf.image.random_contrast(image, lower=0.5, upper=1.5)
186 | image = tf.image.random_saturation(image, lower=0.5, upper=1.5)
187 | image = tf.image.random_hue(image, max_delta=0.2)
188 |
189 | # The random_* ops do not necessarily clamp.
190 | image = tf.clip_by_value(image, 0.0, 1.0)
191 | return image
192 |
193 |
194 | def distort_image(image, height, width, bbox, thread_id=0, scope=None):
195 | """Distort one image for training a network.
196 |
197 | Distorting images provides a useful technique for augmenting the data
198 | set during training in order to make the network invariant to aspects
199 | of the image that do not affect the label.
200 |
201 | Args:
202 | image: 3-D float Tensor of image
203 | height: integer
204 | width: integer
205 | bbox: 3-D float Tensor of bounding boxes arranged [1, num_boxes, coords]
206 | where each coordinate is [0, 1) and the coordinates are arranged
207 | as [ymin, xmin, ymax, xmax].
208 | thread_id: integer indicating the preprocessing thread.
209 | scope: Optional scope for op_scope.
210 | Returns:
211 | 3-D float Tensor of distorted image used for training.
212 | """
213 | with tf.name_scope(values=[image, height, width, bbox], name=scope, default_name='distort_image'):
214 | # Each bounding box has shape [1, num_boxes, box coords] and
215 | # the coordinates are ordered [ymin, xmin, ymax, xmax].
216 |
217 | # Display the bounding box in the first thread only.
218 | if not thread_id:
219 | image_with_box = tf.image.draw_bounding_boxes(tf.expand_dims(image, 0),
220 | bbox)
221 | tf.summary.image('image_with_bounding_boxes', image_with_box)
222 |
223 | # A large fraction of image datasets contain a human-annotated bounding
224 | # box delineating the region of the image containing the object of interest.
225 | # We choose to create a new bounding box for the object which is a randomly
226 | # distorted version of the human-annotated bounding box that obeys an allowed
227 | # range of aspect ratios, sizes and overlap with the human-annotated
228 | # bounding box. If no box is supplied, then we assume the bounding box is
229 | # the entire image.
230 | sample_distorted_bounding_box = tf.image.sample_distorted_bounding_box(
231 | tf.shape(image),
232 | bounding_boxes=bbox,
233 | min_object_covered=0.1,
234 | aspect_ratio_range=[0.75, 1.33],
235 | area_range=[0.05, 1.0],
236 | max_attempts=100,
237 | use_image_if_no_bounding_boxes=True)
238 | bbox_begin, bbox_size, distort_bbox = sample_distorted_bounding_box
239 | if not thread_id:
240 | image_with_distorted_box = tf.image.draw_bounding_boxes(
241 | tf.expand_dims(image, 0), distort_bbox)
242 | tf.summary.image('images_with_distorted_bounding_box',
243 | image_with_distorted_box)
244 |
245 | # Crop the image to the specified bounding box.
246 | distorted_image = tf.slice(image, bbox_begin, bbox_size)
247 |
248 | # This resizing operation may distort the images because the aspect
249 | # ratio is not respected. We select a resize method in a round robin
250 | # fashion based on the thread number.
251 | # Note that ResizeMethod contains 4 enumerated resizing methods.
252 | resize_method = thread_id % 4
253 | distorted_image = tf.image.resize_images(distorted_image, [height, width],
254 | resize_method)
255 | # Restore the shape since the dynamic slice based upon the bbox_size loses
256 | # the third dimension.
257 | distorted_image.set_shape([height, width, 3])
258 | if not thread_id:
259 | tf.summary.image('cropped_resized_image',
260 | tf.expand_dims(distorted_image, 0))
261 |
262 | # Randomly flip the image horizontally.
263 | distorted_image = tf.image.random_flip_left_right(distorted_image)
264 |
265 | # Randomly distort the colors.
266 | distorted_image = distort_color(distorted_image, thread_id)
267 |
268 | if not thread_id:
269 | tf.summary.image('final_distorted_image',
270 | tf.expand_dims(distorted_image, 0))
271 | return distorted_image
272 |
273 |
274 | def eval_image(image, height, width, scope=None):
275 | """Prepare one image for evaluation.
276 |
277 | Args:
278 | image: 3-D float Tensor
279 | height: integer
280 | width: integer
281 | scope: Optional scope for op_scope.
282 | Returns:
283 | 3-D float Tensor of prepared image.
284 | """
285 | with tf.name_scope(values=[image, height, width], name=scope, default_name='eval_image'):
286 | # Crop the central region of the image with an area containing 87.5% of
287 | # the original image.
288 | image = tf.image.central_crop(image, central_fraction=0.875)
289 |
290 | # Resize the image to the original height and width.
291 | image = tf.expand_dims(image, 0)
292 | image = tf.image.resize_bilinear(image, [height, width],
293 | align_corners=False)
294 | image = tf.squeeze(image, [0])
295 | return image
296 |
297 |
298 | def image_preprocessing(image_buffer, bbox, train, thread_id=0):
299 | """Decode and preprocess one image for evaluation or training.
300 |
301 | Args:
302 | image_buffer: JPEG encoded string Tensor
303 | bbox: 3-D float Tensor of bounding boxes arranged [1, num_boxes, coords]
304 | where each coordinate is [0, 1) and the coordinates are arranged as
305 | [ymin, xmin, ymax, xmax].
306 | train: boolean
307 | thread_id: integer indicating preprocessing thread
308 |
309 | Returns:
310 | 3-D float Tensor containing an appropriately scaled image
311 |
312 | Raises:
313 | ValueError: if user does not provide bounding box
314 | """
315 | if bbox is None:
316 | raise ValueError('Please supply a bounding box.')
317 |
318 | image = decode_jpeg(image_buffer)
319 | height = FLAGS.image_size
320 | width = FLAGS.image_size
321 |
322 | if train:
323 | image = distort_image(image, height, width, bbox, thread_id)
324 | else:
325 | image = eval_image(image, height, width)
326 |
327 | # Finally, rescale to [-1,1] instead of [0, 1)
328 | image = tf.subtract(image, 0.5)
329 | image = tf.multiply(image, 2.0)
330 | return image
331 |
332 |
333 | def parse_example_proto(example_serialized):
334 | """Parses an Example proto containing a training example of an image.
335 |
336 | The output of the build_image_data.py image preprocessing script is a dataset
337 | containing serialized Example protocol buffers. Each Example proto contains
338 | the following fields:
339 |
340 | image/height: 462
341 | image/width: 581
342 | image/colorspace: 'RGB'
343 | image/channels: 3
344 | image/class/label: 615
345 | image/class/synset: 'n03623198'
346 | image/class/text: 'knee pad'
347 | image/object/bbox/xmin: 0.1
348 | image/object/bbox/xmax: 0.9
349 | image/object/bbox/ymin: 0.2
350 | image/object/bbox/ymax: 0.6
351 | image/object/bbox/label: 615
352 | image/format: 'JPEG'
353 | image/filename: 'ILSVRC2012_val_00041207.JPEG'
354 | image/encoded:
355 |
356 | Args:
357 | example_serialized: scalar Tensor tf.string containing a serialized
358 | Example protocol buffer.
359 |
360 | Returns:
361 | image_buffer: Tensor tf.string containing the contents of a JPEG file.
362 | label: Tensor tf.int32 containing the label.
363 | bbox: 3-D float Tensor of bounding boxes arranged [1, num_boxes, coords]
364 | where each coordinate is [0, 1) and the coordinates are arranged as
365 | [ymin, xmin, ymax, xmax].
366 | text: Tensor tf.string containing the human-readable label.
367 | """
368 | # Dense features in Example proto.
369 | feature_map = {
370 | 'image/encoded': tf.FixedLenFeature([], dtype=tf.string,
371 | default_value=''),
372 | 'image/class/label': tf.FixedLenFeature([1], dtype=tf.int64,
373 | default_value=-1),
374 | 'image/class/text': tf.FixedLenFeature([], dtype=tf.string,
375 | default_value=''),
376 | }
377 | sparse_float32 = tf.VarLenFeature(dtype=tf.float32)
378 | # Sparse features in Example proto.
379 | feature_map.update(
380 | {k: sparse_float32 for k in ['image/object/bbox/xmin',
381 | 'image/object/bbox/ymin',
382 | 'image/object/bbox/xmax',
383 | 'image/object/bbox/ymax']})
384 |
385 | features = tf.parse_single_example(example_serialized, feature_map)
386 | label = tf.cast(features['image/class/label'], dtype=tf.int32)
387 |
388 | xmin = tf.expand_dims(features['image/object/bbox/xmin'].values, 0)
389 | ymin = tf.expand_dims(features['image/object/bbox/ymin'].values, 0)
390 | xmax = tf.expand_dims(features['image/object/bbox/xmax'].values, 0)
391 | ymax = tf.expand_dims(features['image/object/bbox/ymax'].values, 0)
392 |
393 | # Note that we impose a (y, x) ordering because the tf.image bounding-box ops expect [ymin, xmin, ymax, xmax].
394 | bbox = tf.concat(axis=0, values=[ymin, xmin, ymax, xmax])
395 |
396 | # Force the variable number of bounding boxes into the shape
397 | # [1, num_boxes, coords].
398 | bbox = tf.expand_dims(bbox, 0)
399 | bbox = tf.transpose(bbox, [0, 2, 1])
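  # Shape walk-through: with N boxes, each coordinate tensor is [1, N], the
  # concat yields [4, N], expand_dims yields [1, 4, N], and the transpose
  # produces the documented [1, num_boxes, 4] layout (e.g. 2 boxes: [1, 2, 4]).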
400 |
401 | return features['image/encoded'], label, bbox, features['image/class/text']
402 |
403 |
404 | def batch_inputs(dataset, batch_size, train, num_preprocess_threads=None):
405 | """Contruct batches of training or evaluation examples from the image dataset.
406 |
407 | Args:
408 | dataset: instance of Dataset class specifying the dataset.
409 | See dataset.py for details.
410 | batch_size: integer
411 | train: boolean
412 | num_preprocess_threads: integer, total number of preprocessing threads
413 |
414 | Returns:
415 | images: 4-D float Tensor of a batch of images
416 | labels: 1-D integer Tensor of [batch_size].
417 |
418 | Raises:
419 | ValueError: if data is not found
420 | """
421 | with tf.name_scope('batch_processing'):
422 | data_files = dataset.data_files()
423 | if data_files is None:
424 | raise ValueError('No data files found for this dataset')
425 | filename_queue = tf.train.string_input_producer(data_files, capacity=16)
426 |
427 | if num_preprocess_threads is None:
428 | num_preprocess_threads = FLAGS.num_preprocess_threads
429 |
430 | if num_preprocess_threads % 4:
431 | raise ValueError('Please make num_preprocess_threads a multiple '
432 |                        'of 4 (%d %% 4 != 0).' % num_preprocess_threads)
433 | # Create a subgraph with its own reader (but sharing the
434 | # filename_queue) for each preprocessing thread.
435 | images_and_labels = []
436 | for thread_id in range(num_preprocess_threads):
437 | reader = dataset.reader()
438 | _, example_serialized = reader.read(filename_queue)
439 |
440 | # Parse a serialized Example proto to extract the image and metadata.
441 | image_buffer, label_index, bbox, _ = parse_example_proto(
442 | example_serialized)
443 | image = image_preprocessing(image_buffer, bbox, train, thread_id)
444 | images_and_labels.append([image, label_index])
445 |
446 | # Approximate number of examples per shard.
447 | examples_per_shard = 1024
448 | # Size the random shuffle queue to balance between good global
449 | # mixing (more examples) and memory use (fewer examples).
450 |     # 1 image uses 299*299*3*4 bytes ~= 1.07MB
451 |     # The default input_queue_memory_factor is 16, implying a shuffling queue
452 |     # size of examples_per_shard * 16 * 1.07MB ~= 17.6GB
453 | min_queue_examples = examples_per_shard * FLAGS.input_queue_memory_factor
454 |
455 | # Create a queue that produces the examples in batches after shuffling.
456 | if train:
457 | images, label_index_batch = tf.train.shuffle_batch_join(
458 | images_and_labels,
459 | batch_size=batch_size,
460 | capacity=min_queue_examples + 3 * batch_size,
461 | min_after_dequeue=min_queue_examples)
462 | else:
463 | images, label_index_batch = tf.train.batch_join(
464 | images_and_labels,
465 | batch_size=batch_size,
466 | capacity=min_queue_examples + 3 * batch_size)
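    # shuffle_batch_join draws examples from a randomized queue for training;
    # batch_join combines the per-thread outputs without shuffling for evaluation.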
467 |
468 | # Reshape images into these desired dimensions.
469 | height = FLAGS.image_size
470 | width = FLAGS.image_size
471 | depth = 3
472 |
473 | images = tf.cast(images, tf.float32)
474 | images = tf.reshape(images, shape=[batch_size, height, width, depth])
475 |
476 | # Display the training images in the visualizer.
477 | tf.summary.image('images', images)
478 |
479 | return images, tf.reshape(label_index_batch, [batch_size])
480 |
--------------------------------------------------------------------------------
/fathom/seq2seq/seq2seq.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import math
3 | import random
4 | import sys
5 | import time
6 |
7 | import tensorflow as tf
8 | import numpy as np
9 |
10 | from fathom.nn import NeuralNetworkModel, default_runstep
11 |
12 | from . import data_utils
13 |
14 | class Seq2Seq(NeuralNetworkModel):
15 | """Based on TensorFlow example of sequence-to-sequence translation."""
16 | def build_inputs(self):
17 | # Feeds for inputs.
18 | self.encoder_inputs = []
19 | self.decoder_inputs = []
20 | for i in range(self.buckets[-1][0]): # Last bucket is the biggest one.
21 | self.encoder_inputs.append(tf.placeholder(tf.int32, shape=[None],
22 | name="encoder{0}".format(i)))
23 | for i in range(self.buckets[-1][1] + 1):
24 | self.decoder_inputs.append(tf.placeholder(tf.int32, shape=[None],
25 | name="decoder{0}".format(i)))
26 |
27 | @property
28 | def inputs(self):
29 | return self.encoder_inputs, self.decoder_inputs
30 |
31 | @property
32 | def labels(self):
33 | return self.target_weights
34 |
35 | def build_labels(self):
36 | # Our targets are decoder inputs shifted by one.
37 | self.targets = [self.decoder_inputs[i + 1]
38 | for i in range(len(self.decoder_inputs) - 1)]
39 |
40 | self.target_weights = []
41 | for i in range(self.buckets[-1][1] + 1):
42 | self.target_weights.append(tf.placeholder(tf.float32, shape=[None],
43 | name="weight{0}".format(i)))
44 |
45 | def build_evaluation(self):
46 | pass
47 |
48 | def build_inference(self, xs):
49 | with self.G.as_default():
50 | # If we use sampled softmax, we need an output projection.
51 | output_projection = None
52 | softmax_loss_function = None
53 |       # Sampled softmax only makes sense if we sample fewer classes than the vocabulary size.
54 | num_samples = self.num_samples
55 | if num_samples > 0 and num_samples < self.target_vocab_size:
56 | w = tf.get_variable("proj_w", [self.size, self.target_vocab_size])
57 | w_t = tf.transpose(w)
58 | b = tf.get_variable("proj_b", [self.target_vocab_size])
59 | output_projection = (w, b)
60 |
61 | def sampled_loss(labels, logits):
62 | labels = tf.reshape(labels, [-1, 1])
63 | # We need to compute the sampled_softmax_loss using 32bit floats to
64 | # avoid numerical instabilities.
65 | local_w_t = tf.cast(w_t, tf.float32)
66 | local_b = tf.cast(b, tf.float32)
67 | local_inputs = tf.cast(logits, tf.float32)
68 | return tf.nn.sampled_softmax_loss(
69 | weights=local_w_t,
70 | biases=local_b,
71 | labels=labels,
72 | inputs=local_inputs,
73 | num_sampled=num_samples,
74 | num_classes=self.target_vocab_size)
75 | softmax_loss_function = sampled_loss
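        # With num_samples=512 and a 40,000-word target vocabulary, the sampled
        # softmax computes the loss over a small subset of output classes per
        # step instead of the full output layer.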
76 |
77 | # Create the internal multi-layer cell for our RNN.
78 | def single_cell():
79 | if self.use_lstm:
80 | return tf.contrib.rnn.BasicLSTMCell(self.size, reuse=tf.get_variable_scope().reuse)
81 | else:
82 | return tf.contrib.rnn.GRUCell(self.size, reuse=tf.get_variable_scope().reuse)
83 |
84 | # The seq2seq function: we use embedding for the input and attention.
85 | def seq2seq_f(encoder_inputs, decoder_inputs, do_decode):
86 | if self.num_layers > 1:
87 |           cell = tf.contrib.rnn.MultiRNNCell([single_cell() for _ in range(self.num_layers)])
88 | else:
89 | cell = single_cell()
90 | return tf.contrib.legacy_seq2seq.embedding_attention_seq2seq(
91 | encoder_inputs, decoder_inputs, cell,
92 | num_encoder_symbols=self.source_vocab_size,
93 | num_decoder_symbols=self.target_vocab_size,
94 | embedding_size=self.size,
95 | output_projection=output_projection,
96 | feed_previous=do_decode)
97 |
98 | # Training outputs and losses.
99 | if self.forward_only:
100 | self._outputs, self.losses = tf.contrib.legacy_seq2seq.model_with_buckets(
101 | self.encoder_inputs, self.decoder_inputs, self.targets,
102 | self.target_weights, self.buckets, lambda x, y: seq2seq_f(x, y, True),
103 | softmax_loss_function=softmax_loss_function)
104 | # If we use output projection, we need to project outputs for decoding.
105 | if output_projection is not None:
106 | for b in range(len(self.buckets)):
107 | self._outputs[b] = [
108 | tf.matmul(output, output_projection[0]) + output_projection[1]
109 | for output in self._outputs[b]
110 | ]
111 | else:
112 | self._outputs, self.losses = tf.contrib.legacy_seq2seq.model_with_buckets(
113 | self.encoder_inputs, self.decoder_inputs, self.targets,
114 | self.target_weights, self.buckets,
115 | lambda x, y: seq2seq_f(x, y, False),
116 | softmax_loss_function=softmax_loss_function)
117 |
118 | return self._outputs
119 |
120 | @property
121 | def loss(self):
122 | return self.losses
123 |
124 | @property
125 | def train(self):
126 | return self.updates
127 |
128 | def build_loss(self, logits, labels):
129 | with self.G.as_default():
130 | # TODO: how to handle this in seq2seq? refactoring needed
131 | self.loss_op = self.losses
132 | return self.losses
133 |
134 | def build_train(self, losses):
135 | # TODO: modify total_loss to handle buckets
136 | self.updates = None
137 | with self.G.as_default():
138 | # Gradients and SGD update operation for training the model.
139 | params = tf.trainable_variables()
140 | if not self.forward_only:
141 | self.gradient_norms = []
142 | self.updates = []
143 | self.opt = tf.train.GradientDescentOptimizer(self.learning_rate)
144 | for b in range(len(self.buckets)):
145 | gradients = tf.gradients(self.losses[b], params)
146 | clipped_gradients, norm = tf.clip_by_global_norm(gradients,
147 | self.max_gradient_norm)
148 | self.gradient_norms.append(norm)
149 | self.updates.append(self.opt.apply_gradients(
150 | list(zip(clipped_gradients, params)), global_step=self.global_step))
151 |
152 | return self.updates # note: this is per-bucket
153 |
154 | def load_data(self):
155 | # TODO: make configurable
156 | self.data_dir = "/data/WMT15/"
157 |
158 | print("Preparing WMT data in %s" % self.data_dir)
159 | en_train, fr_train, en_dev, fr_dev, _, _ = data_utils.prepare_wmt_data(
160 | self.data_dir, self.en_vocab_size, self.fr_vocab_size)
161 |
162 | # Read data into buckets and compute their sizes.
163 | print("Reading development and training data (limit: %d)."
164 | % self.max_train_data_size)
165 | self.dev_set = self.read_data(en_dev, fr_dev)
166 | self.train_set = self.read_data(en_train, fr_train, self.max_train_data_size)
167 | train_bucket_sizes = [len(self.train_set[b]) for b in range(len(self._buckets))]
168 | train_total_size = float(sum(train_bucket_sizes))
169 |
170 |     # A bucket scale is a list of increasing numbers from 0 to 1 that we'll use
171 |     # to select a bucket. The length of [scale[i], scale[i+1]] is proportional
172 |     # to the size of the i-th training bucket, as used later.
173 | self.train_buckets_scale = [sum(train_bucket_sizes[:i + 1]) / train_total_size
174 | for i in range(len(train_bucket_sizes))]
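    # For example, bucket sizes [10, 20, 30, 40] (total 100) would give a
    # cumulative scale of [0.1, 0.3, 0.6, 1.0], so larger buckets are sampled
    # proportionally more often in run().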
175 |
176 | def read_data(self, source_path, target_path, max_size=None):
177 | """Read data from source and target files and put into buckets.
178 |
179 | Args:
180 | source_path: path to the files with token-ids for the source language.
181 | target_path: path to the file with token-ids for the target language;
182 | it must be aligned with the source file: n-th line contains the desired
183 | output for n-th line from the source_path.
184 |       max_size: maximum number of lines to read, all others will be ignored;
185 | if 0 or None, data files will be read completely (no limit).
186 |
187 | Returns:
188 | data_set: a list of length len(_buckets); data_set[n] contains a list of
189 | (source, target) pairs read from the provided data files that fit
190 | into the n-th bucket, i.e., such that len(source) < _buckets[n][0] and
191 | len(target) < _buckets[n][1]; source and target are lists of token-ids.
192 | """
193 | data_set = [[] for _ in self._buckets]
194 | with tf.gfile.GFile(source_path, mode="r") as source_file:
195 | with tf.gfile.GFile(target_path, mode="r") as target_file:
196 | source, target = source_file.readline(), target_file.readline()
197 | counter = 0
198 | while source and target and (not max_size or counter < max_size):
199 | counter += 1
200 | if counter % 100000 == 0:
201 | print(" reading data line %d" % counter)
202 | sys.stdout.flush()
203 | source_ids = [int(x) for x in source.split()]
204 | target_ids = [int(x) for x in target.split()]
205 | target_ids.append(data_utils.EOS_ID)
206 | for bucket_id, (source_size, target_size) in enumerate(self._buckets):
207 | if len(source_ids) < source_size and len(target_ids) < target_size:
208 | data_set[bucket_id].append([source_ids, target_ids])
209 | break
210 | source, target = source_file.readline(), target_file.readline()
211 | return data_set
212 |
213 | @property
214 | def outputs(self):
215 | return self._outputs
216 |
217 | def build_hyperparameters(self):
218 | # data-specific
219 | self.en_vocab_size = 40000
220 | self.fr_vocab_size = 40000
221 | self.max_train_data_size = 1 # 0 is no limit
222 |
223 | # We use a number of buckets and pad to the closest one for efficiency.
224 | # See seq2seq_model.Seq2SeqModel for details of how they work.
225 | self._buckets = [(5, 10), (10, 15), (20, 25), (40, 50)]
226 | # Parameters
227 | self.source_vocab_size = self.en_vocab_size
228 | self.target_vocab_size = self.fr_vocab_size
229 | self.buckets = self._buckets # FIXME: better bucket names
230 | self.num_samples = 512
231 | self.size = 256
232 | self.num_layers = 3
233 | self.use_lstm = True # else GRU
234 |
235 | self.batch_size = 64
236 | if self.init_options:
237 | self.batch_size = self.init_options.get('batch_size', self.batch_size)
238 |
239 | self.display_step = 1
240 | self.global_step = tf.Variable(0, trainable=False)
241 | if not self.forward_only:
242 | self.learning_rate = tf.Variable(0.5, trainable=False)
243 | self.learning_rate_decay_factor = 0.99
244 | self.learning_rate_decay_op = self.learning_rate.assign(self.learning_rate * self.learning_rate_decay_factor)
245 | self.max_gradient_norm = 5.0
246 |
247 | def run(self, runstep=None, n_steps=1):
248 | # Grab the dataset from the internet, if necessary
249 | self.load_data()
250 |
251 | # This is the training loop.
252 | step_time, loss = 0.0, 0.0
253 | current_step = 0
254 | previous_losses = []
255 | while True:
256 | if current_step >= n_steps:
257 | return
258 | # Choose a bucket according to data distribution. We pick a random number
259 |       # in [0, 1) and use the corresponding interval in train_buckets_scale.
260 | random_number_01 = np.random.random_sample()
261 | bucket_id = min([i for i in range(len(self.train_buckets_scale))
262 | if self.train_buckets_scale[i] > random_number_01])
263 |
264 | # Get a batch and make a step.
265 | start_time = time.time()
266 | encoder_inputs, decoder_inputs, target_weights = self.get_batch(
267 | self.train_set, bucket_id)
268 | output_feeds, input_feeds = self.step_feeds(encoder_inputs, decoder_inputs,
269 | target_weights, bucket_id, self.forward_only)
270 |
271 | outputs = runstep(
272 | self.session,
273 | output_feeds,
274 | input_feeds,
275 | #options=run_options, run_metadata=values
276 | )
277 |
278 | # TODO: do this in a runstep
279 | if not self.forward_only:
280 | _, step_loss, _ = outputs[1], outputs[2], None # Gradient norm, loss, no outputs.
281 | else:
282 | _, step_loss, _ = None, outputs[0], outputs[1:] # No gradient norm, loss, outputs.
283 |
284 | step_time += (time.time() - start_time) / self.display_step
285 | loss += step_loss / self.display_step
286 | current_step += 1
287 |
288 | if not self.forward_only:
289 | # Once in a while, we save checkpoint, print statistics, and run evals.
290 | if current_step % self.display_step == 0:
291 | # Print statistics for the previous epoch.
292 | perplexity = math.exp(loss) if loss < 300 else float('inf')
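          # Perplexity is exp of the average per-step loss; losses >= 300 are
          # reported as infinite perplexity rather than an astronomically
          # large number.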
293 | with self.session.as_default():
294 | print("global step %d learning rate %.4f step-time %.2f perplexity "
295 | "%.2f" % (self.global_step.eval(), self.learning_rate.eval(),
296 | step_time, perplexity))
297 | # Decrease learning rate if no improvement was seen over last 3 times.
298 | if len(previous_losses) > 2 and loss > max(previous_losses[-3:]):
299 | self.session.run(self.learning_rate_decay_op)
300 | previous_losses.append(loss)
301 | # Save checkpoint and zero timer and loss.
302 | #checkpoint_path = os.path.join(self.train_dir, "translate.ckpt")
303 | #self.saver.save(sess, checkpoint_path, global_step=self.global_step)
304 | step_time, loss = 0.0, 0.0
305 | # Run evals on development set and print their perplexity.
306 | for bucket_id in range(len(self._buckets)):
307 | if len(self.dev_set[bucket_id]) == 0:
308 | print(" eval: empty bucket %d" % (bucket_id))
309 | continue
310 | encoder_inputs, decoder_inputs, target_weights = self.get_batch(
311 | self.dev_set, bucket_id)
312 | output_feeds, input_feeds = self.step_feeds(encoder_inputs, decoder_inputs,
313 | target_weights, bucket_id, True)
314 |
315 | outputs = self.session.run(
316 | output_feeds,
317 | input_feeds,
318 | #options=run_options, run_metadata=values
319 | )
320 |
321 | # TODO: do this in a runstep
322 | if not self.forward_only:
323 | _, eval_loss, _ = outputs[1], outputs[2], None # Gradient norm, loss, no outputs.
324 |
325 |           if False: # FIXME: remove this temporary guard
326 | eval_ppx = math.exp(eval_loss) if eval_loss < 300 else float('inf')
327 | print(" eval: bucket %d perplexity %.2f" % (bucket_id, eval_ppx))
328 | else:
329 | _, eval_loss, _ = None, outputs[0], outputs[1:] # No gradient norm, loss, outputs.
330 |
331 | sys.stdout.flush()
332 |
333 | def step_feeds(self, encoder_inputs, decoder_inputs, target_weights,
334 | bucket_id, forward_only):
335 | """Construct feeds for given inputs.
336 |
337 | Args:
338 | encoder_inputs: list of numpy int vectors to feed as encoder inputs.
339 | decoder_inputs: list of numpy int vectors to feed as decoder inputs.
340 | target_weights: list of numpy float vectors to feed as target weights.
341 | bucket_id: which bucket of the model to use.
342 | forward_only: whether to do the backward step or only forward.
343 |
344 | Returns:
345 | A triple consisting of gradient norm (or None if we did not do backward),
346 | average perplexity, and the outputs.
347 |
348 | Raises:
349 | ValueError: if length of encoder_inputs, decoder_inputs, or
350 | target_weights disagrees with bucket size for the specified bucket_id.
351 | """
352 | # Check if the sizes match.
353 | encoder_size, decoder_size = self.buckets[bucket_id]
354 | if len(encoder_inputs) != encoder_size:
355 | raise ValueError("Encoder length must be equal to the one in bucket,"
356 | " %d != %d." % (len(encoder_inputs), encoder_size))
357 | if len(decoder_inputs) != decoder_size:
358 | raise ValueError("Decoder length must be equal to the one in bucket,"
359 | " %d != %d." % (len(decoder_inputs), decoder_size))
360 | if len(target_weights) != decoder_size:
361 | raise ValueError("Weights length must be equal to the one in bucket,"
362 | " %d != %d." % (len(target_weights), decoder_size))
363 |
364 | # Input feed: encoder inputs, decoder inputs, target_weights, as provided.
365 | input_feed = {}
366 | for l in range(encoder_size):
367 | input_feed[self.encoder_inputs[l].name] = encoder_inputs[l]
368 | #print("encoder", len(encoder_inputs[l]), self.encoder_inputs[l].get_shape())
369 | for l in range(decoder_size):
370 | input_feed[self.decoder_inputs[l].name] = decoder_inputs[l]
371 | #print("decoder", len(decoder_inputs[l]), self.decoder_inputs[l].get_shape())
372 | input_feed[self.target_weights[l].name] = target_weights[l]
373 | #print("target", len(target_weights[l]), self.target_weights[l].get_shape())
374 |
375 | # Since our targets are decoder inputs shifted by one, we need one more.
376 | #last_target = self.decoder_inputs[decoder_size].name
377 | last_target = self.decoder_inputs[decoder_size]
378 | input_feed[last_target] = np.zeros([self.batch_size], dtype=np.int32)
379 |
380 | # Output feed: depends on whether we do a backward step or not.
381 | if not forward_only:
382 | output_feed = [self.updates[bucket_id], # Update Op that does SGD.
383 | self.gradient_norms[bucket_id], # Gradient norm.
384 | self.losses[bucket_id]] # Loss for this batch.
385 | else:
386 | output_feed = [self.losses[bucket_id]] # Loss for this batch.
387 | for l in range(decoder_size): # Output logits.
388 | output_feed.append(self._outputs[bucket_id][l])
389 |
390 | return output_feed, input_feed
391 |
392 | def get_batch(self, data, bucket_id):
393 | """Get a random batch of data from the specified bucket, prepare for step.
394 |
395 | To feed data in step(..) it must be a list of batch-major vectors, while
396 | data here contains single length-major cases. So the main logic of this
397 | function is to re-index data cases to be in the proper format for feeding.
398 |
399 | Args:
400 | data: a tuple of size len(self.buckets) in which each element contains
401 | lists of pairs of input and output data that we use to create a batch.
402 | bucket_id: integer, which bucket to get the batch for.
403 |
404 | Returns:
405 | The triple (encoder_inputs, decoder_inputs, target_weights) for
406 | the constructed batch that has the proper format to call step(...) later.
407 | """
408 | encoder_size, decoder_size = self.buckets[bucket_id]
409 | encoder_inputs, decoder_inputs = [], []
410 |
411 | # Get a random batch of encoder and decoder inputs from data,
412 | # pad them if needed, reverse encoder inputs and add GO to decoder.
413 | for _ in range(self.batch_size):
414 | encoder_input, decoder_input = random.choice(data[bucket_id])
415 |
416 | # Encoder inputs are padded and then reversed.
417 | encoder_pad = [data_utils.PAD_ID] * (encoder_size - len(encoder_input))
418 | encoder_inputs.append(list(reversed(encoder_input + encoder_pad)))
419 |
420 |       # Decoder inputs get an extra "GO" symbol and are then padded.
421 | decoder_pad_size = decoder_size - len(decoder_input) - 1
422 | decoder_inputs.append([data_utils.GO_ID] + decoder_input +
423 | [data_utils.PAD_ID] * decoder_pad_size)
424 |
425 | # Now we create batch-major vectors from the data selected above.
426 | batch_encoder_inputs, batch_decoder_inputs, batch_weights = [], [], []
427 |
428 | # Batch encoder inputs are just re-indexed encoder_inputs.
429 | for length_idx in range(encoder_size):
430 | batch_encoder_inputs.append(
431 | np.array([encoder_inputs[batch_idx][length_idx]
432 | for batch_idx in range(self.batch_size)], dtype=np.int32))
433 |
434 | # Batch decoder inputs are re-indexed decoder_inputs, we create weights.
435 | for length_idx in range(decoder_size):
436 | batch_decoder_inputs.append(
437 | np.array([decoder_inputs[batch_idx][length_idx]
438 | for batch_idx in range(self.batch_size)], dtype=np.int32))
439 |
440 | # Create target_weights to be 0 for targets that are padding.
441 | batch_weight = np.ones(self.batch_size, dtype=np.float32)
442 | for batch_idx in range(self.batch_size):
443 | # We set weight to 0 if the corresponding target is a PAD symbol.
444 | # The corresponding target is decoder_input shifted by 1 forward.
445 | if length_idx < decoder_size - 1:
446 | target = decoder_inputs[batch_idx][length_idx + 1]
447 | if length_idx == decoder_size - 1 or target == data_utils.PAD_ID:
448 | batch_weight[batch_idx] = 0.0
449 | batch_weights.append(batch_weight)
450 | return batch_encoder_inputs, batch_decoder_inputs, batch_weights
451 |
452 | class Seq2SeqFwd(Seq2Seq):
453 | forward_only = True
454 |
455 | if __name__ == '__main__':
456 | m = Seq2Seq()
457 | m.setup()
458 | m.run(runstep=default_runstep, n_steps=10)
459 | m.teardown()
460 |
461 |
--------------------------------------------------------------------------------