├── tensorflow
│   ├── SQuAD
│   │   ├── my
│   │   │   ├── __init__.py
│   │   │   ├── tensorflow
│   │   │   │   ├── __init__.py
│   │   │   │   ├── rnn.py
│   │   │   │   ├── general.py
│   │   │   │   └── nn.py
│   │   │   ├── utils.py
│   │   │   ├── corenlp_interface.py
│   │   │   ├── zip_save.py
│   │   │   └── nltk_utils.py
│   │   ├── tree
│   │   │   ├── __init__.py
│   │   │   ├── trainer.py
│   │   │   ├── graph_handler.py
│   │   │   ├── templates
│   │   │   │   └── visualizer.html
│   │   │   ├── cli.py
│   │   │   ├── visualizer.py
│   │   │   ├── read_data.py
│   │   │   ├── main.py
│   │   │   ├── test.ipynb
│   │   │   └── evaluator.py
│   │   ├── basic
│   │   │   ├── __init__.py
│   │   │   ├── run_single.sh
│   │   │   ├── run_ensemble.sh
│   │   │   ├── get_pr.py
│   │   │   ├── ensemble_fast.py
│   │   │   ├── templates
│   │   │   │   └── visualizer.html
│   │   │   ├── trainer.py
│   │   │   ├── graph_handler.py
│   │   │   ├── ensemble.py
│   │   │   ├── visualizer.py
│   │   │   └── cli.py
│   │   ├── basic_cnn
│   │   │   ├── __init__.py
│   │   │   ├── superhighway.py
│   │   │   ├── templates
│   │   │   │   └── visualizer.html
│   │   │   ├── graph_handler.py
│   │   │   ├── trainer.py
│   │   │   ├── visualizer.py
│   │   │   └── cli.py
│   │   ├── cnn_dm
│   │   │   ├── __init__.py
│   │   │   ├── evaluate.py
│   │   │   └── prepro.py
│   │   ├── squad
│   │   │   ├── __init__.py
│   │   │   ├── neg_squad.py
│   │   │   ├── evaluate-v1.1.py
│   │   │   ├── evaluate.py
│   │   │   ├── utils.py
│   │   │   ├── aug_squad.py
│   │   │   ├── prepro_aug.py
│   │   │   ├── eda_aug_dev.ipynb
│   │   │   └── eda_aug_train.ipynb
│   │   ├── requirements.txt
│   │   ├── .gitignore
│   │   ├── run_training.sh
│   │   ├── download.sh
│   │   └── README.md
│   └── CIFAR10
│       ├── README.md
│       ├── time_inference.py
│       ├── eval_checkpoints.py
│       └── resnet
│           ├── README.md
│           └── cifar_input.py
├── pytorch
│   ├── CIFAR10
│   │   ├── benchmark
│   │   │   ├── __init__.py
│   │   │   ├── cifar10
│   │   │   │   ├── __init__.py
│   │   │   │   ├── __main__.py
│   │   │   │   ├── models
│   │   │   │   │   └── densenet.py
│   │   │   │   ├── infer.py
│   │   │   │   └── results.py
│   │   │   ├── imagenet
│   │   │   │   └── __main__.py
│   │   │   └── utils.py
│   │   ├── .gitignore
│   │   ├── setup.py
│   │   └── README.md
│   └── .gitignore
├── .gitignore
└── README.md
/tensorflow/SQuAD/my/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tensorflow/SQuAD/tree/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/pytorch/CIFAR10/benchmark/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tensorflow/SQuAD/basic/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tensorflow/SQuAD/basic_cnn/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tensorflow/SQuAD/cnn_dm/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tensorflow/SQuAD/squad/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/pytorch/CIFAR10/benchmark/cifar10/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tensorflow/SQuAD/requirements.txt:
--------------------------------------------------------------------------------
1 | nltk
2 | tqdm
3 | jinja2
4 |
--------------------------------------------------------------------------------
/tensorflow/SQuAD/.gitignore:
--------------------------------------------------------------------------------
1 | out/
2 | data/
3 | */__pycache__/
4 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | __pycache__/
3 | .eggs/
4 | *.egg-info/
5 | .cache
6 |
--------------------------------------------------------------------------------
/tensorflow/SQuAD/my/tensorflow/__init__.py:
--------------------------------------------------------------------------------
1 | from my.tensorflow.general import *
--------------------------------------------------------------------------------
/pytorch/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | __pycache__/
3 | .eggs/
4 | *.egg-info/
5 | .cache
6 | data/
7 |
--------------------------------------------------------------------------------
/pytorch/CIFAR10/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | __pycache__/
3 | .eggs/
4 | *.egg-info/
5 | .cache
6 | data/
7 |
--------------------------------------------------------------------------------
/tensorflow/SQuAD/run_training.sh:
--------------------------------------------------------------------------------
1 | python3 -m basic.cli --mode train --noload --len_opt --cluster
2 |
--------------------------------------------------------------------------------
/pytorch/CIFAR10/benchmark/imagenet/__main__.py:
--------------------------------------------------------------------------------
1 | import click
2 |
3 | from benchmark.imagenet.train import train
4 |
5 |
6 | @click.group()
7 | def cli():
8 | pass
9 |
10 |
11 | cli.add_command(train, name='train')
12 |
13 | if __name__ == '__main__':
14 | cli()
15 |
--------------------------------------------------------------------------------
/pytorch/CIFAR10/benchmark/cifar10/__main__.py:
--------------------------------------------------------------------------------
1 | import click
2 |
3 | from benchmark.cifar10.train import train
4 | from benchmark.cifar10.infer import infer
5 |
6 |
7 | @click.group()
8 | def cli():
9 | pass
10 |
11 |
12 | cli.add_command(train, name='train')
13 | cli.add_command(infer, name='infer')
14 |
15 |
16 | if __name__ == '__main__':
17 | cli()
18 |
--------------------------------------------------------------------------------
/pytorch/CIFAR10/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup
2 |
3 | setup(
4 | name='benchmark',
5 | version='0.0.0',
6 | url='http://www.codycoleman.com',
7 | author='Cody Austun Coleman',
8 | author_email='cody.coleman@cs.stanford.edu',
9 | packages=['benchmark'],
10 | entry_points={
11 | 'console_scripts': [
12 | 'cifar10 = benchmark.cifar10.__main__:cli',
13 | 'imagenet = benchmark.imagenet.__main__:cli'
14 | ]
15 | },
16 | install_requires=[
17 | 'tqdm',
18 | 'torchvision',
19 | 'click',
20 | ]
21 | )
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # DAWNBench Models
2 |
3 | This repository contains implementations of various models listed on the DAWNBench leaderboard:
4 | - ResNet models for CIFAR10, implemented in TensorFlow, located at
5 | [`tensorflow/CIFAR10`](https://github.com/stanford-futuredata/dawn-bench-models/tree/master/tensorflow/CIFAR10)
6 | - ResNet models for CIFAR10, implemented in PyTorch, located at
7 | [`pytorch/CIFAR10`](https://github.com/stanford-futuredata/dawn-bench-models/tree/master/pytorch/CIFAR10)
8 | - BiDAF model for SQuAD, implemented in TensorFlow, located at
9 | [`tensorflow/SQuAD`](https://github.com/stanford-futuredata/dawn-bench-models/tree/master/tensorflow/SQuAD)
10 |
11 | You can email us at [dawn-benchmark@lists.stanford.edu](mailto:dawn-benchmark@lists.stanford.edu) with any
12 | questions.
13 |
--------------------------------------------------------------------------------
/tensorflow/SQuAD/download.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | DATA_DIR=$HOME/data
4 | mkdir -p "$DATA_DIR"
5 |
6 | # Download SQuAD
7 | SQUAD_DIR=$DATA_DIR/squad
8 | mkdir -p "$SQUAD_DIR"
9 | wget https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json -O $SQUAD_DIR/train-v1.1.json
10 | wget https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json -O $SQUAD_DIR/dev-v1.1.json
11 |
12 |
13 | # Download CNN and DailyMail
14 | # Download at: http://cs.nyu.edu/~kcho/DMQA/
15 |
16 |
17 | # Download GloVe
18 | GLOVE_DIR=$DATA_DIR/glove
19 | mkdir -p "$GLOVE_DIR"
20 | wget http://nlp.stanford.edu/data/glove.6B.zip -O $GLOVE_DIR/glove.6B.zip
21 | unzip $GLOVE_DIR/glove.6B.zip -d $GLOVE_DIR
22 |
23 | # Download NLTK (for tokenizer)
24 | # Make sure that nltk is installed!
25 | python3 -m nltk.downloader -d $HOME/nltk_data punkt
26 |
--------------------------------------------------------------------------------
/tensorflow/CIFAR10/README.md:
--------------------------------------------------------------------------------
1 | # ResNets on TensorFlow
2 |
3 | To train a ResNet, run:
4 |
5 | ```bash
6 | python3 resnet/resnet_main.py --train_data_path=cifar10/data_batch* --log_root=data/resnet20/log_root \
7 | --train_dir=data/resnet20/log_root/train --dataset='cifar10' --model=resnet20 \
8 | --num_gpus=1 --checkpoint_dir=data/resnet20/checkpoints --data_format=NCHW
9 | ```
10 |
11 | To evaluate the resulting checkpoints, run:
12 |
13 | ```bash
14 | python3 eval_checkpoints.py -i data/resnet20/checkpoints \
15 | -c "python3 resnet/resnet_main.py --mode=eval --eval_data_path=cifar10/test_batch.bin --eval_dir=data/resnet20/log_root/eval --dataset='cifar10' --model=resnet20 --num_gpus=1 --eval_batch_count=100 --eval_once=True --data_format=NCHW"
16 | ```
17 |
18 | Make sure to first follow the instructions in `resnet/README.md` to download the dataset and set up the expected directory layout.
19 |
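`eval_checkpoints.py` expects the checkpoints directory to contain zero-padded, numbered subdirectories (`00001`, `00002`, ...) plus a `times.log` file with one tab-separated `step: N` / `time: SECONDS` line per checkpoint. A minimal sketch of the parsing it performs (the example line is hypothetical):

```python
# Accumulate wall-clock time per checkpoint step, mirroring eval_checkpoints.py.
times, cum_time = {}, 0.0
with open("data/resnet20/checkpoints/times.log") as f:
    for line in f:  # e.g. "step: 1000\ttime: 37.2"
        step_field, time_field = line.strip().split("\t")
        cum_time += float(time_field.split(": ")[1])
        times[int(step_field.split(": ")[1])] = cum_time
```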
--------------------------------------------------------------------------------
/tensorflow/SQuAD/basic/run_single.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | source_path=$1
3 | target_path=$2
4 | inter_dir="inter_single"
5 | root_dir="save"
6 |
7 | parg=""
8 | marg=""
9 | if [ "$3" = "debug" ]
10 | then
11 | parg="-d"
12 | marg="--debug"
13 | fi
14 |
15 | # Preprocess data
16 | python3 -m squad.prepro --mode single --single_path $source_path $parg --target_dir $inter_dir --glove_dir .
17 |
18 | num=37
19 | load_path="$root_dir/$num/save"
20 | shared_path="$root_dir/$num/shared.json"
21 | eval_path="$inter_dir/eval.pklz"
22 | python3 -m basic.cli --data_dir $inter_dir --eval_path $eval_path --nodump_answer --load_path $load_path --shared_path $shared_path $marg --eval_num_batches 0 --mode forward --batch_size 1 --len_opt --cluster --cpu_opt --load_ema
23 |
24 | # Ensemble (for single run, just one input)
25 | python3 -m basic.ensemble --data_path $inter_dir/data_single.json --shared_path $inter_dir/shared_single.json -o $target_path $eval_path
26 |
27 |
28 |
--------------------------------------------------------------------------------
/tensorflow/SQuAD/basic/run_ensemble.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | source_path=$1
3 | target_path=$2
4 | inter_dir="inter_ensemble"
5 | root_dir="save"
6 |
7 | parg=""
8 | marg=""
9 | if [ "$3" = "debug" ]
10 | then
11 | parg="-d"
12 | marg="--debug"
13 | fi
14 |
15 | # Preprocess data
16 | python3 -m squad.prepro --mode single --single_path $source_path $parg --target_dir $inter_dir --glove_dir .
17 |
18 | eargs=""
19 | for num in 31 33 34 35 36 37 40 41 43 44 45 46; do
20 | load_path="$root_dir/$num/save"
21 | shared_path="$root_dir/$num/shared.json"
22 | eval_path="$inter_dir/eval-$num.pklz"
23 | eargs="$eargs $eval_path"
24 | python3 -m basic.cli --data_dir $inter_dir --eval_path $eval_path --nodump_answer --load_path $load_path --shared_path $shared_path $marg --eval_num_batches 0 --mode forward --batch_size 1 --len_opt --cluster --cpu_opt --load_ema &
25 | done
26 | wait
27 |
28 | # Ensemble
29 | python3 -m basic.ensemble --data_path $inter_dir/data_single.json --shared_path $inter_dir/shared_single.json -o $target_path $eargs
30 |
--------------------------------------------------------------------------------
/tensorflow/SQuAD/basic/get_pr.py:
--------------------------------------------------------------------------------
1 | import json
2 | import argparse
3 |
4 |
5 | def get_args():
6 | parser = argparse.ArgumentParser()
7 | parser.add_argument("path")
8 | parser.add_argument("-t", "--th", type=float, default=0.5)
9 | # TODO : put more args here
10 | return parser.parse_args()
11 |
12 |
13 | def get_pr(args):
14 | with open(args.path, 'r') as fp:
15 | answers = json.load(fp)
16 |
17 | na = answers['na']
18 |
19 | tp = sum(int(not id_.startswith("neg") and score < args.th) for id_, score in na.items())
20 | fp = sum(int(id_.startswith("neg") and score < args.th) for id_, score in na.items())
21 | tn = sum(int(id_.startswith("neg") and score >= args.th) for id_, score in na.items())
22 | fn = sum(int(not id_.startswith("neg") and score >= args.th) for id_, score in na.items())
23 |
24 | p = tp / (tp + fp)
25 | r = tp / (tp + fn)
26 | print("p={:.3f}, r={:.3f}".format(p, r))
27 |
28 |
29 | def main():
30 | args = get_args()
31 | get_pr(args)
32 |
33 | if __name__ == "__main__":
34 | main()
35 |
36 |
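# Expected input (illustrative values): the answer JSON carries an 'na' map from question id
# to a no-answer score, e.g. {"na": {"some_id": 0.12, "neg_some_id": 0.93}, ...}.
# Ids produced by squad/neg_squad.py are prefixed with "neg"; scores below the threshold (-t)
# are treated as "answered".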
--------------------------------------------------------------------------------
/tensorflow/SQuAD/cnn_dm/evaluate.py:
--------------------------------------------------------------------------------
1 | import json
2 | import os
3 | import sys
4 |
5 | root_dir = sys.argv[1]
6 | answer_path = sys.argv[2]
7 | file_names = os.listdir(root_dir)
8 |
9 | num_correct = 0
10 | num_wrong = 0
11 |
12 | with open(answer_path, 'r') as fh:
13 | id2answer_dict = json.load(fh)
14 |
15 | for file_name in file_names:
16 | if not file_name.endswith(".question"):
17 | continue
18 | with open(os.path.join(root_dir, file_name), 'r') as fh:
19 | url = fh.readline().strip()
20 | _ = fh.readline()
21 | para = fh.readline().strip()
22 | _ = fh.readline()
23 | ques = fh.readline().strip()
24 | _ = fh.readline()
25 | answer = fh.readline().strip()
26 | _ = fh.readline()
27 | if file_name in id2answer_dict:
28 | pred = id2answer_dict[file_name]
29 | if pred == answer:
30 | num_correct += 1
31 | else:
32 | num_wrong += 1
33 | else:
34 | num_wrong += 1
35 |
36 | total = num_correct + num_wrong
37 | acc = float(num_correct) / total
38 | print("{} = {} / {}".format(acc, num_correct, total))
--------------------------------------------------------------------------------
/tensorflow/SQuAD/basic/ensemble_fast.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import json
3 | from collections import Counter, defaultdict
4 | import re
5 |
6 | def key_func(pair):
7 | return pair[1]
8 |
9 |
10 | def get_func(vals, probs):
11 | counter = Counter(vals)
12 | # return max(zip(vals, probs), key=lambda pair: pair[1])[0]
13 | # return max(zip(vals, probs), key=lambda pair: pair[1] * counter[pair[0]] / len(counter) - 999 * (len(pair[0]) == 0) )[0]
14 | # return max(zip(vals, probs), key=lambda pair: pair[1] + 0.7 * counter[pair[0]] / len(counter) - 999 * (len(pair[0]) == 0) )[0]
15 | d = defaultdict(float)
16 | for val, prob in zip(vals, probs):
17 | d[val] += prob
18 | d[''] = 0
19 | return max(d.items(), key=lambda pair: pair[1])[0]
20 |
21 | third_path = sys.argv[1]
22 | other_paths = sys.argv[2:]
23 |
24 | others = [json.load(open(path, 'r')) for path in other_paths]
25 |
26 |
27 | c = {}
28 |
29 | assert min(map(len, others)) == max(map(len, others)), list(map(len, others))
30 |
31 | for key in others[0].keys():
32 | if key == 'scores':
33 | continue
34 | probs = [other['scores'][key] for other in others]
35 | vals = [other[key] for other in others]
36 | largest_val = get_func(vals, probs)
37 | c[key] = largest_val
38 |
39 | json.dump(c, open(third_path, 'w'))
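# Ensembling rule implemented above: for each question id, sum every candidate answer
# string's probability across the per-model prediction files, pin the empty answer's score
# to 0, and keep the candidate with the highest summed probability.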
--------------------------------------------------------------------------------
/tensorflow/SQuAD/tree/trainer.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 |
3 | from tree.model import Model
4 |
5 |
6 | class Trainer(object):
7 | def __init__(self, config, model):
8 | assert isinstance(model, Model)
9 | self.config = config
10 | self.model = model
11 | self.opt = tf.train.AdagradOptimizer(config.init_lr)
12 | self.loss = model.get_loss()
13 | self.var_list = model.get_var_list()
14 | self.global_step = model.get_global_step()
15 | self.ema_op = model.ema_op
16 | self.summary = model.summary
17 | self.grads = self.opt.compute_gradients(self.loss, var_list=self.var_list)
18 | opt_op = self.opt.apply_gradients(self.grads, global_step=self.global_step)
19 |
20 | # Define train op
21 | with tf.control_dependencies([opt_op]):
22 | self.train_op = tf.group(self.ema_op)
23 |
24 | def get_train_op(self):
25 | return self.train_op
26 |
27 | def step(self, sess, batch, get_summary=False):
28 | assert isinstance(sess, tf.Session)
29 | feed_dict = self.model.get_feed_dict(batch, True)
30 | if get_summary:
31 | loss, summary, train_op = \
32 | sess.run([self.loss, self.summary, self.train_op], feed_dict=feed_dict)
33 | else:
34 | loss, train_op = sess.run([self.loss, self.train_op], feed_dict=feed_dict)
35 | summary = None
36 | return loss, summary, train_op
37 |
--------------------------------------------------------------------------------
/tensorflow/SQuAD/my/utils.py:
--------------------------------------------------------------------------------
1 | import json
2 | from collections import deque
3 |
4 | import numpy as np
5 | from tqdm import tqdm
6 |
7 |
8 | def mytqdm(list_, desc="", show=True):
9 | if show:
10 | pbar = tqdm(list_)
11 | pbar.set_description(desc)
12 | return pbar
13 | return list_
14 |
15 |
16 | def json_pretty_dump(obj, fh):
17 | return json.dump(obj, fh, sort_keys=True, indent=2, separators=(',', ': '))
18 |
19 |
20 | def index(l, i):
21 | return index(l[i[0]], i[1:]) if len(i) > 1 else l[i[0]]
22 |
23 |
24 | def fill(l, shape, dtype=None):
25 | out = np.zeros(shape, dtype=dtype)
26 | stack = deque()
27 | stack.appendleft(((), l))
28 | while len(stack) > 0:
29 | indices, cur = stack.pop()
30 |         if len(indices) < out.ndim:
31 | for i, sub in enumerate(cur):
32 | stack.appendleft([indices + (i,), sub])
33 | else:
34 | out[indices] = cur
35 | return out
36 |
37 |
38 | def short_floats(o, precision):
39 | class ShortFloat(float):
40 | def __repr__(self):
41 | return '%.{}g'.format(precision) % self
42 |
43 | def _short_floats(obj):
44 | if isinstance(obj, float):
45 | return ShortFloat(obj)
46 | elif isinstance(obj, dict):
47 | return dict((k, _short_floats(v)) for k, v in obj.items())
48 | elif isinstance(obj, (list, tuple)):
49 | return tuple(map(_short_floats, obj))
50 | return obj
51 |
52 | return _short_floats(o)
53 |
54 |
55 | def argmax(x):
56 | return np.unravel_index(x.argmax(), x.shape)
57 |
58 |
59 |
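# Illustrative self-check (not part of the training pipeline): `fill` pads a ragged
# nested list into a fixed-shape array, and `short_floats` shortens float reprs so the
# eval JSON written by graph_handler.dump_eval stays small.
if __name__ == "__main__":
    print(fill([[1, 2], [3]], (2, 3), dtype='int32'))    # pads to [[1, 2, 0], [3, 0, 0]]
    print(short_floats({'f1': 0.123456, 'em': 0.5}, 3))  # {'f1': 0.123, 'em': 0.5}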
--------------------------------------------------------------------------------
/tensorflow/SQuAD/my/corenlp_interface.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | import requests
4 | import nltk
5 | import json
6 | import networkx as nx
7 | import time
8 |
9 |
10 | class CoreNLPInterface(object):
11 | def __init__(self, url, port):
12 | self._url = url
13 | self._port = port
14 |
15 | def get(self, type_, in_, num_max_requests=100):
16 | in_ = in_.encode("utf-8")
17 | url = "http://{}:{}/{}".format(self._url, self._port, type_)
18 | out = None
19 | for _ in range(num_max_requests):
20 | try:
21 | r = requests.post(url, data=in_)
22 | out = r.content.decode('utf-8')
23 | if out == 'error':
24 | out = None
25 | break
26 |             except requests.exceptions.RequestException:
27 | time.sleep(1)
28 | return out
29 |
30 | def split_doc(self, doc):
31 | out = self.get("doc", doc)
32 | return out if out is None else json.loads(out)
33 |
34 | def split_sent(self, sent):
35 | out = self.get("sent", sent)
36 | return out if out is None else json.loads(out)
37 |
38 | def get_dep(self, sent):
39 | out = self.get("dep", sent)
40 | return out if out is None else json.loads(out)
41 |
42 | def get_const(self, sent):
43 | out = self.get("const", sent)
44 | return out
45 |
46 | def get_const_tree(self, sent):
47 | out = self.get_const(sent)
48 | return out if out is None else nltk.tree.Tree.fromstring(out)
49 |
50 | @staticmethod
51 | def dep2tree(dep):
52 | tree = nx.DiGraph()
53 |         for dep_node, i, gov, j, label in dep:
54 |             tree.add_edge(gov, dep_node, label=label)
55 | return tree
56 |
--------------------------------------------------------------------------------
/tensorflow/SQuAD/squad/neg_squad.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import json
3 | import os
4 | # data: q, cq, (dq), (pq), y, *x, *cx
5 | # shared: x, cx, (dx), (px), word_counter, char_counter, word2vec
6 | # no metadata
7 | import random
8 | from collections import Counter
9 |
10 | from tqdm import tqdm
11 |
12 | from squad.utils import get_word_span, get_word_idx, process_tokens
13 |
14 |
15 | def main():
16 | args = get_args()
17 | neg_squad(args)
18 |
19 |
20 | def get_args():
21 | parser = argparse.ArgumentParser()
22 | home = os.path.expanduser("~")
23 | parser.add_argument("source_path")
24 | parser.add_argument("target_path")
25 | parser.add_argument('-d', "--debug", action='store_true')
26 | parser.add_argument('-r', "--aug_ratio", default=1, type=int)
27 | # TODO : put more args here
28 | return parser.parse_args()
29 |
30 |
31 | def neg_squad(args):
32 | with open(args.source_path, 'r') as fp:
33 | squad = json.load(fp)
34 | with open(args.source_path, 'r') as fp:
35 | ref_squad = json.load(fp)
36 |
37 | for ai, article in enumerate(ref_squad['data']):
38 | for pi, para in enumerate(article['paragraphs']):
39 | cands = list(range(pi)) + list(range(pi+1, len(article['paragraphs'])))
40 | samples = random.sample(cands, args.aug_ratio)
41 | for sample in samples:
42 | for qi, ques in enumerate(article['paragraphs'][sample]['qas']):
43 | new_ques = {'question': ques['question'], 'answers': [], 'answer_start': 0, 'id': "neg_" + ques['id']}
44 | squad['data'][ai]['paragraphs'][pi]['qas'].append(new_ques)
45 |
46 | with open(args.target_path, 'w') as fp:
47 | json.dump(squad, fp)
48 |
49 | if __name__ == "__main__":
50 | main()
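# Example invocation (run from the SQuAD root so the `squad` package is importable; the
# output file name is illustrative):
#   python3 -m squad.neg_squad data/squad/train-v1.1.json data/squad/train-v1.1-neg.json -r 1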
--------------------------------------------------------------------------------
/tensorflow/SQuAD/basic_cnn/superhighway.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | from tensorflow.python.ops.rnn_cell import RNNCell
3 |
4 | from my.tensorflow.nn import linear
5 |
6 |
7 | class SHCell(RNNCell):
8 | """
9 | Super-Highway Cell
10 | """
11 | def __init__(self, input_size, logit_func='tri_linear', scalar=False):
12 | self._state_size = input_size
13 | self._output_size = input_size
14 | self._logit_func = logit_func
15 | self._scalar = scalar
16 |
17 | @property
18 | def state_size(self):
19 | return self._state_size
20 |
21 | @property
22 | def output_size(self):
23 | return self._output_size
24 |
25 | def __call__(self, inputs, state, scope=None):
26 | with tf.variable_scope(scope or "SHCell"):
27 | a_size = 1 if self._scalar else self._state_size
28 | h, u = tf.split(axis=1, num_or_size_splits=2, value=inputs)
29 | if self._logit_func == 'mul_linear':
30 | args = [h * u, state * u]
31 | a = tf.nn.sigmoid(linear(args, a_size, True))
32 | elif self._logit_func == 'linear':
33 | args = [h, u, state]
34 | a = tf.nn.sigmoid(linear(args, a_size, True))
35 | elif self._logit_func == 'tri_linear':
36 | args = [h, u, state, h * u, state * u]
37 | a = tf.nn.sigmoid(linear(args, a_size, True))
38 | elif self._logit_func == 'double':
39 | args = [h, u, state]
40 | a = tf.nn.sigmoid(linear(tf.tanh(linear(args, a_size, True)), self._state_size, True))
41 |
42 | else:
43 |                 raise ValueError("unknown logit_func: {}".format(self._logit_func))
44 | new_state = a * state + (1 - a) * h
45 | outputs = state
46 | return outputs, new_state
47 |
48 |
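# Gating summary: each step splits the input into (h, u), computes a gate
# a = sigmoid(linear(...)) from h, u and the previous state (per the chosen logit_func),
# updates new_state = a * state + (1 - a) * h, and emits the previous state as the output.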
--------------------------------------------------------------------------------
/tensorflow/SQuAD/my/zip_save.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 |
4 | import shutil
5 | from zipfile import ZipFile
6 |
7 | from tqdm import tqdm
8 |
9 |
10 | def get_args():
11 | parser = argparse.ArgumentParser()
12 | parser.add_argument('paths', nargs='+')
13 | parser.add_argument('-o', '--out', default='save.zip')
14 | args = parser.parse_args()
15 | return args
16 |
17 |
18 | def zip_save(args):
19 | temp_dir = "."
20 | save_dir = os.path.join(temp_dir, "save")
21 | if not os.path.exists(save_dir):
22 | os.makedirs(save_dir)
23 | for save_source_path in tqdm(args.paths):
24 | # path = "out/basic/30/save/basic-18000"
25 | # target_path = "save_dir/30/save"
26 |         # also output full path name to "save_dir/30/readme.txt"
27 | # need to also extract "out/basic/30/shared.json"
28 | temp, _ = os.path.split(save_source_path) # "out/basic/30/save", _
29 | model_dir, _ = os.path.split(temp) # "out/basic/30, _
30 | _, model_name = os.path.split(model_dir)
31 | cur_dir = os.path.join(save_dir, model_name)
32 | if not os.path.exists(cur_dir):
33 | os.makedirs(cur_dir)
34 | save_target_path = os.path.join(cur_dir, "save")
35 | shared_target_path = os.path.join(cur_dir, "shared.json")
36 | readme_path = os.path.join(cur_dir, "readme.txt")
37 | shared_source_path = os.path.join(model_dir, "shared.json")
38 | shutil.copy(save_source_path, save_target_path)
39 | shutil.copy(shared_source_path, shared_target_path)
40 | with open(readme_path, 'w') as fh:
41 | fh.write(save_source_path)
42 |
43 | os.system("zip {} -r {}".format(args.out, save_dir))
44 |
45 | def main():
46 | args = get_args()
47 | zip_save(args)
48 |
49 | if __name__ == "__main__":
50 | main()
51 |
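# Example invocation (paths are illustrative, following the comments above):
#   python3 -m my.zip_save out/basic/30/save/basic-18000 -o save.zip
# Each checkpoint and its shared.json are copied into ./save/<run id>/ and zipped into --out.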
--------------------------------------------------------------------------------
/pytorch/CIFAR10/benchmark/utils.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 | import re
4 | from functools import reduce
5 |
6 |
7 | class AverageMeter(object):
8 | """Computes and stores the average and current value"""
9 | def __init__(self):
10 | self.reset()
11 |
12 | def reset(self):
13 | self.val = 0
14 | self.avg = 0
15 | self.sum = 0
16 | self.count = 0
17 |
18 | def update(self, val, n=1):
19 | self.val = val
20 | self.sum += val * n
21 | self.count += n
22 | self.avg = self.sum / self.count
23 |
24 |
25 | def count_parameters(model):
26 | c = map(lambda p: reduce(lambda x, y: x * y, p.size()), model.parameters())
27 | return sum(c)
28 |
29 |
30 | def latest_file(model):
31 | restore = f'./run/{model}'
32 | timestamps = sorted(os.listdir(restore))
33 | assert len(timestamps) > 0
34 | run_dir = os.path.join(restore, timestamps[-1])
35 | files = os.listdir(run_dir)
36 | max_checkpoint = -1
37 | for filename in files:
38 |         if re.search(r'checkpoint_\d+\.t7', filename):
39 |             num = int(re.search(r'\d+', filename).group())
40 |
41 | if num > max_checkpoint:
42 | max_checkpoint = num
43 | max_checkpoint_file = filename
44 |
45 | assert max_checkpoint != -1
46 | return os.path.join(run_dir, max_checkpoint_file)
47 |
48 |
49 | def save_result(result, path):
50 | write_heading = not os.path.exists(path)
51 | with open(path, mode='a') as out:
52 | if write_heading:
53 | out.write(",".join([str(k) for k, v in result.items()]) + '\n')
54 | out.write(",".join([str(v) for k, v in result.items()]) + '\n')
55 |
56 |
57 | def save_config(config, run_dir):
58 | path = os.path.join(run_dir, "config_{}.json".format(config['timestamp']))
59 | with open(path, 'w') as config_file:
60 | json.dump(config, config_file)
61 | config_file.write('\n')
62 |
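# Illustrative usage (hypothetical values, assuming the ./run layout described in the README):
#   latest_file('preact164')
#     -> './run/preact164/<newest timestamp>/checkpoint_<highest step>.t7'
#   save_result({'epoch': 1, 'loss': 0.42}, 'train_results.csv')
#     appends one CSV row, writing the "epoch,loss" header only if the file did not exist yet.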
--------------------------------------------------------------------------------
/pytorch/CIFAR10/README.md:
--------------------------------------------------------------------------------
1 | # Install
2 |
3 | 1. Install PyTorch v0.1.12. If you don't already have it set up, [please follow the official install instructions](http://pytorch.org/).
4 | 2. Clone this repo and go to this directory
5 |
6 | ```bash
7 | git clone git@github.com:stanford-futuredata/dawn-bench-models.git
8 | cd dawn-bench-models/pytorch/CIFAR10
9 | ```
10 |
11 | 3. Install this package
12 |
13 | ```bash
14 | pip install -e .
15 | ```
16 |
17 | # Quick start
18 |
19 | This package adds `cifar10` and `imagenet` command-line interfaces.
20 | Both include a `train` subcommand for learning a model from scratch.
21 | As an example, here is how to train ResNet164 with preactivation on CIFAR10:
22 |
23 | ```bash
24 | cifar10 train -c last --augmentation --tracking -b 128 --optimizer sgd --arch preact164 -e 5 -l 0.01
25 | cifar10 train -c last --augmentation --tracking -b 128 --optimizer sgd --arch preact164 -e 90 -l 0.1 --restore latest
26 | cifar10 train -c last --augmentation --tracking -b 128 --optimizer sgd --arch preact164 -e 45 -l 0.01 --restore latest
27 | cifar10 train -c last --augmentation --tracking -b 128 --optimizer sgd --arch preact164 -e 45 -l 0.001 --restore latest
28 | ```
29 |
30 | The first command creates a new run of ResNet164 with preactivation (`--arch preact164`) in the `./run/preact164/[TIMESTAMP]` directory and starts a warm-up of 5 epochs (`-e 5`) with SGD (`--optimizer sgd`) and a learning rate of 0.01 (`-l 0.01`).
31 | `-c last` indicates that we only want to save a checkpoint after the last epoch of the warm-up.
32 | `-b 128` sets the batch size to 128.
33 | `--augmentation` turns on standard data augmentation, i.e. random crop and flip.
34 | `--tracking` saves training and validation results to CSV files at `./run/preact164/[TIMESTAMP]/[train|valid]_results.csv`.
35 |
36 | The second command resumes the run from the first command (`--restore latest`) for another 90 epochs (`-e 90`) but with a new learning rate (`-l 0.1`). The third and fourth commands function similarly to the second command, changing the learning rate and running for more epochs.
37 |
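For reference, `--restore latest` is resolved by `latest_file` in `benchmark/utils.py`: it picks the newest timestamped run directory under `./run/preact164` and the highest-numbered `checkpoint_<step>.t7` inside it. A rough sketch of that lookup (illustrative only):

```python
import os
import re

def latest_checkpoint(arch, run_root='./run'):
    # Mirrors latest_file() in benchmark/utils.py: newest timestamp, highest checkpoint number.
    arch_dir = os.path.join(run_root, arch)
    run_dir = os.path.join(arch_dir, sorted(os.listdir(arch_dir))[-1])
    ckpts = [f for f in os.listdir(run_dir) if re.search(r'checkpoint_\d+\.t7', f)]
    return os.path.join(run_dir, max(ckpts, key=lambda f: int(re.search(r'\d+', f).group())))
```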
--------------------------------------------------------------------------------
/tensorflow/SQuAD/tree/graph_handler.py:
--------------------------------------------------------------------------------
1 | import json
2 | from json import encoder
3 | import os
4 |
5 | import tensorflow as tf
6 |
7 | from tree.evaluator import Evaluation
8 | from my.utils import short_floats
9 |
10 |
11 | class GraphHandler(object):
12 | def __init__(self, config):
13 | self.config = config
14 | self.saver = tf.train.Saver()
15 | self.writer = None
16 | self.save_path = os.path.join(config.save_dir, config.model_name)
17 |
18 | def initialize(self, sess):
19 | if self.config.load:
20 | self._load(sess)
21 | else:
22 | sess.run(tf.global_variables_initializer())
23 |
24 | if self.config.mode == 'train':
25 | self.writer = tf.summary.FileWriter(self.config.log_dir, graph=tf.get_default_graph())
26 |
27 | def save(self, sess, global_step=None):
28 | self.saver.save(sess, self.save_path, global_step=global_step)
29 |
30 | def _load(self, sess):
31 | config = self.config
32 | if config.load_step > 0:
33 | save_path = os.path.join(config.save_dir, "{}-{}".format(config.model_name, config.load_step))
34 | else:
35 | save_dir = config.save_dir
36 | checkpoint = tf.train.get_checkpoint_state(save_dir)
37 | assert checkpoint is not None, "cannot load checkpoint at {}".format(save_dir)
38 | save_path = checkpoint.model_checkpoint_path
39 | print("Loading saved model from {}".format(save_path))
40 | self.saver.restore(sess, save_path)
41 |
42 | def add_summary(self, summary, global_step):
43 | self.writer.add_summary(summary, global_step)
44 |
45 | def add_summaries(self, summaries, global_step):
46 | for summary in summaries:
47 | self.add_summary(summary, global_step)
48 |
49 | def dump_eval(self, e, precision=2):
50 | assert isinstance(e, Evaluation)
51 | path = os.path.join(self.config.eval_dir, "{}-{}.json".format(e.data_type, str(e.global_step).zfill(6)))
52 | with open(path, 'w') as fh:
53 | json.dump(short_floats(e.dict, precision), fh)
54 |
55 |
--------------------------------------------------------------------------------
/tensorflow/CIFAR10/time_inference.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 | import subprocess
4 | import sys
5 |
6 | def main(checkpoint_path, model, use_bottleneck):
7 | print("Number of images\tInference time")
8 | num_trials = 10
9 | for batch_size in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192]:
10 | command = ("python3 resnet/resnet_main.py --mode=eval --eval_data_path=cifar10/test_batch.bin "
11 | "--eval_dir=data/%(model)s/log_root/eval --dataset='cifar10' --model=%(model)s "
12 | "--use_bottleneck=%(use_bottleneck)s --eval_batch_count=%(num_trials)d --eval_once=True --num_gpus=1 "
13 | "--data_format=NHWC --time_inference=True --eval_batch_count=1 --batch_size=%(batch_size)d" %
14 | {"model": model, "use_bottleneck": "True" if use_bottleneck else "False", "batch_size": batch_size,
15 | "num_trials": num_trials})
16 | full_command = command + " --log_root=%s 2>/dev/null" % checkpoint_path
17 | try:
18 | output = subprocess.check_output(full_command, shell=True)
19 | output = output.decode('utf8').strip()
20 | for line in output.split('\n'):
21 | if "Time for inference" in line:
22 | line = line.strip()
23 | inference_time = float(line.split(": ")[1]) / num_trials
24 | stats = [batch_size, inference_time]
25 | print("\t".join([str(stat) for stat in stats]))
26 | sys.stdout.flush()
27 |         except Exception:
28 | stats = [batch_size, ""]
29 | print("\t".join([str(stat) for stat in stats]))
30 | sys.stdout.flush()
31 |
32 |
33 | if __name__ == '__main__':
34 | parser = argparse.ArgumentParser(
35 |         description=("Time inference for a trained model over a range of batch sizes")
36 | )
37 | parser.add_argument('-i', "--checkpoint_path", type=str, required=True,
38 | help="Path to dumped model checkpoints")
39 | parser.add_argument('-m', "--model", type=str, required=True,
40 | help="Model name")
41 |     parser.add_argument('-b', "--use_bottleneck", action='store_true',
42 |                         help="Use bottleneck")
43 |
44 | cmdline_args = parser.parse_args()
45 | opt_dict = vars(cmdline_args)
46 |
47 | checkpoint_path = opt_dict["checkpoint_path"]
48 | model = opt_dict["model"]
49 | use_bottleneck = opt_dict["use_bottleneck"]
50 |
51 | main(checkpoint_path, model, use_bottleneck)
52 |
--------------------------------------------------------------------------------
/tensorflow/CIFAR10/eval_checkpoints.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 | import subprocess
4 | import sys
5 |
6 | def main(checkpoints_path, command, start_cnt):
7 | cnt = start_cnt
8 |
9 | times = {}
10 | cum_time = 0.0
11 | with open(os.path.join(checkpoints_path, "times.log"), 'r') as f:
12 | output = f.read().strip()
13 | output_lines = output.split('\n')
14 | for output_line in output_lines:
15 | [step, time] = output_line.split('\t')
16 | step = int(step.split(': ')[1])
17 | time = float(time.split(': ')[1])
18 | cum_time += time
19 | times[step] = cum_time
20 |
21 | print("Time (in secs)\tNumber of minibatches\tTop 1 accuracy\tTop 5 accuracy")
22 | while True:
23 | ckpt_path = ("%5d" % cnt).replace(' ', '0')
24 | full_ckpt_path = os.path.join(checkpoints_path, ckpt_path)
25 | if not os.path.exists(full_ckpt_path):
26 | break
27 | if len(os.listdir(full_ckpt_path)) <= 2:
28 | cnt += 1
29 | continue
30 | full_command = command + " --log_root=%s 2>/dev/null" % full_ckpt_path
31 | output = subprocess.check_output(full_command, shell=True)
32 | output = output.decode('utf8').strip()
33 | for line in output.split('\n'):
34 | if "Precision" in line and "Recall" in line:
35 | tokens = line.split(", ") # TODO: Nasty hack, make more robust.
36 | precision_at_1 = float(tokens[0].split()[-1])
37 | recall_at_5 = float(tokens[1].split()[-1])
38 | step = int(tokens[2].split()[3])
39 | stats = [times[step], step, precision_at_1, recall_at_5]
40 | print("\t".join([str(stat) for stat in stats]))
41 | sys.stdout.flush()
42 | cnt += 1
43 |
44 |
45 | if __name__ == '__main__':
46 | parser = argparse.ArgumentParser(
47 |         description=("Evaluate a directory of model checkpoints and report accuracy over time")
48 | )
49 | parser.add_argument('-i', "--checkpoints_path", type=str, required=True,
50 | help="Path to dumped model checkpoints")
51 | parser.add_argument('-c', "--command", type=str, required=True,
52 | help="Command to evaluate each individual checkpoint")
53 | parser.add_argument('-s', "--start_cnt", type=int, default=1,
54 | help="Count to start evaluating checkpoints from")
55 |
56 | cmdline_args = parser.parse_args()
57 | opt_dict = vars(cmdline_args)
58 |
59 | main(opt_dict["checkpoints_path"], opt_dict["command"], opt_dict["start_cnt"])
60 |
--------------------------------------------------------------------------------
/tensorflow/SQuAD/tree/templates/visualizer.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | {{ title }}
6 |
7 |
8 |
19 |
20 |
23 |
24 | {{ title }}
25 |
26 |
27 | | ID |
28 | Question |
29 | Answer |
30 | Paragraph |
31 |
32 | {% for row in rows %}
33 |
34 | | {{ row.id }} |
35 |
36 | {% for qj in row.ques %}
37 | {{ qj }}
38 | {% endfor %}
39 | |
40 | {{ row.a }} |
41 |
42 |
43 | {% for xj, yj, y2j, ypj, yp2j in zip(row.para, row.y, row.y2, row.yp, row.yp2) %}
44 |
45 | {% for xjk, yjk, y2jk, ypjk in zip(xj, yj, y2j, ypj) %}
46 | |
47 | {% if yjk or y2jk %}
48 | {{ xjk }}
49 | {% else %}
50 | {{ xjk }}
51 | {% endif %}
52 | |
53 | {% endfor %}
54 |
55 |
56 | {% for xjk, yp2jk in zip(xj, yp2j) %}
57 | | - |
58 | {% endfor %}
59 |
60 | {% endfor %}
61 |
62 | |
63 |
64 | {% endfor %}
65 |
66 |
67 |
--------------------------------------------------------------------------------
/tensorflow/SQuAD/tree/cli.py:
--------------------------------------------------------------------------------
1 | import os
2 | from pprint import pprint
3 |
4 | import tensorflow as tf
5 |
6 | from tree.main import main as m
7 |
8 | flags = tf.app.flags
9 |
10 | flags.DEFINE_string("model_name", "tree", "Model name [tree]")
11 | flags.DEFINE_string("data_dir", "data/squad", "Data dir [data/squad]")
12 | flags.DEFINE_integer("run_id", 0, "Run ID [0]")
13 |
14 | flags.DEFINE_integer("batch_size", 128, "Batch size [128]")
15 | flags.DEFINE_float("init_lr", 0.5, "Initial learning rate [0.5]")
16 | flags.DEFINE_integer("num_epochs", 50, "Total number of epochs for training [50]")
17 | flags.DEFINE_integer("num_steps", 0, "Number of steps [0]")
18 | flags.DEFINE_integer("eval_num_batches", 100, "eval num batches [100]")
19 | flags.DEFINE_integer("load_step", 0, "load step [0]")
20 | flags.DEFINE_integer("early_stop", 4, "early stop [4]")
21 |
22 | flags.DEFINE_string("mode", "test", "train | test | forward [test]")
23 | flags.DEFINE_boolean("load", True, "load saved data? [True]")
24 | flags.DEFINE_boolean("progress", True, "Show progress? [True]")
25 | flags.DEFINE_integer("log_period", 100, "Log period [100]")
26 | flags.DEFINE_integer("eval_period", 1000, "Eval period [1000]")
27 | flags.DEFINE_integer("save_period", 1000, "Save Period [1000]")
28 | flags.DEFINE_float("decay", 0.9, "Exponential moving average decay [0.9]")
29 |
30 | flags.DEFINE_boolean("draft", False, "Draft for quick testing? [False]")
31 |
32 | flags.DEFINE_integer("hidden_size", 32, "Hidden size [32]")
33 | flags.DEFINE_float("input_keep_prob", 0.5, "Input keep prob [0.5]")
34 | flags.DEFINE_integer("char_emb_size", 8, "Char emb size [8]")
35 | flags.DEFINE_integer("char_filter_height", 5, "Char filter height [5]")
36 | flags.DEFINE_float("wd", 0.0001, "Weight decay [0.0001]")
37 | flags.DEFINE_bool("lower_word", True, "lower word [True]")
38 | flags.DEFINE_bool("dump_eval", True, "dump eval? [True]")
39 |
40 | flags.DEFINE_integer("word_count_th", 100, "word count th [100]")
41 | flags.DEFINE_integer("char_count_th", 500, "char count th [500]")
42 | flags.DEFINE_integer("sent_size_th", 64, "sent size th [64]")
43 | flags.DEFINE_integer("num_sents_th", 8, "num sents th [8]")
44 | flags.DEFINE_integer("ques_size_th", 64, "ques size th [64]")
45 | flags.DEFINE_integer("word_size_th", 16, "word size th [16]")
46 | flags.DEFINE_integer("tree_height_th", 16, "tree height th [16]")
47 |
48 |
49 | def main(_):
50 | config = flags.FLAGS
51 |
52 | config.out_dir = os.path.join("out", config.model_name, str(config.run_id).zfill(2))
53 |
54 | m(config)
55 |
56 | if __name__ == "__main__":
57 | tf.app.run()
58 |
--------------------------------------------------------------------------------
/tensorflow/SQuAD/basic/templates/visualizer.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | {{ title }}
6 |
7 |
8 |
19 |
20 |
23 |
24 | {{ title }}
25 |
26 |
27 | | ID |
28 | Question |
29 | Answers |
30 | Predicted |
31 | Score |
32 | Paragraph |
33 |
34 | {% for row in rows %}
35 |
36 | | {{ row.id }} |
37 |
38 | {% for qj in row.ques %}
39 | {{ qj }}
40 | {% endfor %}
41 | |
42 |
43 | {% for aa in row.a %}
44 | {{ aa }}
45 | {% endfor %}
46 | |
47 | {{ row.ap }} |
48 | {{ row.score }} |
49 |
50 |
51 | {% for xj, ypj, yp2j in zip(row.para, row.yp, row.yp2) %}
52 |
53 | {% set rowloop = loop %}
54 | {% for xjk, ypjk in zip(xj, ypj) %}
55 | |
56 | {% if row.y[0][0] == rowloop.index0 and row.y[0][1] <= loop.index0 <= row.y[1][1] %}
57 | {{ xjk }}
58 | {% else %}
59 | {{ xjk }}
60 | {% endif %}
61 | |
62 | {% endfor %}
63 |
64 |
65 | {% for xjk, yp2jk in zip(xj, yp2j) %}
66 | | - |
67 | {% endfor %}
68 |
69 | {% endfor %}
70 |
71 | |
72 |
73 | {% endfor %}
74 |
75 |
76 |
--------------------------------------------------------------------------------
/tensorflow/SQuAD/basic_cnn/templates/visualizer.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | {{ title }}
6 |
7 |
8 |
19 |
20 |
23 |
24 | {{ title }}
25 |
26 |
27 | | ID |
28 | Question |
29 | Answers |
30 | Predicted |
31 | Score |
32 | Paragraph |
33 |
34 | {% for row in rows %}
35 |
36 | | {{ row.id }} |
37 |
38 | {% for qj in row.ques %}
39 | {{ qj }}
40 | {% endfor %}
41 | |
42 |
43 | {% for aa in row.a %}
44 | {{ aa }}
45 | {% endfor %}
46 | |
47 | {{ row.ap }} |
48 | {{ row.score }} |
49 |
50 |
51 | {% for xj, ypj, yp2j in zip(row.para, row.yp, row.yp2) %}
52 |
53 | {% set rowloop = loop %}
54 | {% for xjk, ypjk in zip(xj, ypj) %}
55 | |
56 | {% if row.y[0][0] == rowloop.index0 and row.y[0][1] <= loop.index0 <= row.y[1][1] %}
57 | {{ xjk }}
58 | {% else %}
59 | {{ xjk }}
60 | {% endif %}
61 | |
62 | {% endfor %}
63 |
64 |
65 | {% for xjk, yp2jk in zip(xj, yp2j) %}
66 | | - |
67 | {% endfor %}
68 |
69 | {% endfor %}
70 |
71 | |
72 |
73 | {% endfor %}
74 |
75 |
76 |
--------------------------------------------------------------------------------
/tensorflow/SQuAD/basic_cnn/graph_handler.py:
--------------------------------------------------------------------------------
1 | import gzip
2 | import json
3 | from json import encoder
4 | import os
5 |
6 | import tensorflow as tf
7 |
8 | from basic_cnn.evaluator import Evaluation, F1Evaluation
9 | from my.utils import short_floats
10 |
11 | import pickle
12 |
13 |
14 | class GraphHandler(object):
15 | def __init__(self, config):
16 | self.config = config
17 | self.saver = tf.train.Saver(max_to_keep=config.max_to_keep)
18 | self.writer = None
19 | self.save_path = os.path.join(config.save_dir, config.model_name)
20 |
21 | def initialize(self, sess):
22 | if self.config.load:
23 | self._load(sess)
24 | else:
25 | sess.run(tf.global_variables_initializer())
26 |
27 | if self.config.mode == 'train':
28 | self.writer = tf.summary.FileWriter(self.config.log_dir, graph=tf.get_default_graph())
29 |
30 | def save(self, sess, global_step=None):
31 | self.saver.save(sess, self.save_path, global_step=global_step)
32 |
33 | def _load(self, sess):
34 | config = self.config
35 | if config.load_path:
36 | save_path = config.load_path
37 | elif config.load_step > 0:
38 | save_path = os.path.join(config.save_dir, "{}-{}".format(config.model_name, config.load_step))
39 | else:
40 | save_dir = config.save_dir
41 | checkpoint = tf.train.get_checkpoint_state(save_dir)
42 | assert checkpoint is not None, "cannot load checkpoint at {}".format(save_dir)
43 | save_path = checkpoint.model_checkpoint_path
44 | print("Loading saved model from {}".format(save_path))
45 | self.saver.restore(sess, save_path)
46 |
47 | def add_summary(self, summary, global_step):
48 | self.writer.add_summary(summary, global_step)
49 |
50 | def add_summaries(self, summaries, global_step):
51 | for summary in summaries:
52 | self.add_summary(summary, global_step)
53 |
54 | def dump_eval(self, e, precision=2, path=None):
55 | assert isinstance(e, Evaluation)
56 | if self.config.dump_pickle:
57 | path = path or os.path.join(self.config.eval_dir, "{}-{}.pklz".format(e.data_type, str(e.global_step).zfill(6)))
58 | with gzip.open(path, 'wb', compresslevel=3) as fh:
59 | pickle.dump(e.dict, fh)
60 | else:
61 | path = path or os.path.join(self.config.eval_dir, "{}-{}.json".format(e.data_type, str(e.global_step).zfill(6)))
62 | with open(path, 'w') as fh:
63 | json.dump(short_floats(e.dict, precision), fh)
64 |
65 | def dump_answer(self, e, path=None):
66 | assert isinstance(e, Evaluation)
67 | path = path or os.path.join(self.config.answer_dir, "{}-{}.json".format(e.data_type, str(e.global_step).zfill(6)))
68 | with open(path, 'w') as fh:
69 | json.dump(e.id2answer_dict, fh)
70 |
71 |
--------------------------------------------------------------------------------
/tensorflow/SQuAD/basic_cnn/trainer.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 |
3 | from basic_cnn.model import Model
4 | from my.tensorflow import average_gradients
5 |
6 |
7 | class Trainer(object):
8 | def __init__(self, config, model):
9 | assert isinstance(model, Model)
10 | self.config = config
11 | self.model = model
12 | self.opt = tf.train.AdadeltaOptimizer(config.init_lr)
13 | self.loss = model.get_loss()
14 | self.var_list = model.get_var_list()
15 | self.global_step = model.get_global_step()
16 | self.summary = model.summary
17 | self.grads = self.opt.compute_gradients(self.loss, var_list=self.var_list)
18 | self.train_op = self.opt.apply_gradients(self.grads, global_step=self.global_step)
19 |
20 | def get_train_op(self):
21 | return self.train_op
22 |
23 | def step(self, sess, batch, get_summary=False):
24 | assert isinstance(sess, tf.Session)
25 | _, ds = batch
26 | feed_dict = self.model.get_feed_dict(ds, True)
27 | if get_summary:
28 | loss, summary, train_op = \
29 | sess.run([self.loss, self.summary, self.train_op], feed_dict=feed_dict)
30 | else:
31 | loss, train_op = sess.run([self.loss, self.train_op], feed_dict=feed_dict)
32 | summary = None
33 | return loss, summary, train_op
34 |
35 |
36 | class MultiGPUTrainer(object):
37 | def __init__(self, config, models):
38 | model = models[0]
39 | assert isinstance(model, Model)
40 | self.config = config
41 | self.model = model
42 | self.opt = tf.train.AdadeltaOptimizer(config.init_lr)
43 | self.var_list = model.get_var_list()
44 | self.global_step = model.get_global_step()
45 | self.summary = model.summary
46 | self.models = models
47 | losses = []
48 | grads_list = []
49 | for gpu_idx, model in enumerate(models):
50 | with tf.name_scope("grads_{}".format(gpu_idx)), tf.device("/gpu:{}".format(gpu_idx)):
51 | loss = model.get_loss()
52 | grads = self.opt.compute_gradients(loss, var_list=self.var_list)
53 | losses.append(loss)
54 | grads_list.append(grads)
55 |
56 | self.loss = tf.add_n(losses)/len(losses)
57 | self.grads = average_gradients(grads_list)
58 | self.train_op = self.opt.apply_gradients(self.grads, global_step=self.global_step)
59 |
60 | def step(self, sess, batches, get_summary=False):
61 | assert isinstance(sess, tf.Session)
62 | feed_dict = {}
63 | for batch, model in zip(batches, self.models):
64 | _, ds = batch
65 | feed_dict.update(model.get_feed_dict(ds, True))
66 |
67 | if get_summary:
68 | loss, summary, train_op = \
69 | sess.run([self.loss, self.summary, self.train_op], feed_dict=feed_dict)
70 | else:
71 | loss, train_op = sess.run([self.loss, self.train_op], feed_dict=feed_dict)
72 | summary = None
73 | return loss, summary, train_op
74 |
--------------------------------------------------------------------------------
/tensorflow/SQuAD/basic/trainer.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 |
3 | from basic.model import Model
4 | from my.tensorflow import average_gradients
5 |
6 |
7 | class Trainer(object):
8 | def __init__(self, config, model):
9 | assert isinstance(model, Model)
10 | self.config = config
11 | self.model = model
12 | self.opt = tf.train.AdamOptimizer(config.init_lr)
13 | self.loss = model.get_loss()
14 | self.var_list = model.get_var_list()
15 | self.global_step = model.get_global_step()
16 | self.summary = model.summary
17 | self.grads = self.opt.compute_gradients(self.loss, var_list=self.var_list)
18 | self.train_op = self.opt.apply_gradients(self.grads, global_step=self.global_step)
19 |
20 | def get_train_op(self):
21 | return self.train_op
22 |
23 | def step(self, sess, batch, get_summary=False):
24 | assert isinstance(sess, tf.Session)
25 | _, ds = batch
26 | feed_dict = self.model.get_feed_dict(ds, True)
27 | if get_summary:
28 | loss, summary, train_op = \
29 | sess.run([self.loss, self.summary, self.train_op], feed_dict=feed_dict)
30 | else:
31 | loss, train_op = sess.run([self.loss, self.train_op], feed_dict=feed_dict)
32 | summary = None
33 | return loss, summary, train_op
34 |
35 |
36 | class MultiGPUTrainer(object):
37 | def __init__(self, config, models):
38 | model = models[0]
39 | assert isinstance(model, Model)
40 | self.config = config
41 | self.model = model
42 | self.opt = tf.train.AdamOptimizer(config.init_lr)
43 | self.var_list = model.get_var_list()
44 | self.global_step = model.get_global_step()
45 | self.summary = model.summary
46 | self.models = models
47 | losses = []
48 | grads_list = []
49 | for gpu_idx, model in enumerate(models):
50 | with tf.name_scope("grads_{}".format(gpu_idx)), tf.device("/{}:{}".format(config.device_type, gpu_idx)):
51 | loss = model.get_loss()
52 | grads = self.opt.compute_gradients(loss, var_list=self.var_list)
53 | losses.append(loss)
54 | grads_list.append(grads)
55 |
56 | self.loss = tf.add_n(losses)/len(losses)
57 | self.grads = average_gradients(grads_list)
58 | self.train_op = self.opt.apply_gradients(self.grads, global_step=self.global_step)
59 |
60 | def step(self, sess, batches, get_summary=False):
61 | assert isinstance(sess, tf.Session)
62 | feed_dict = {}
63 | for batch, model in zip(batches, self.models):
64 | _, ds = batch
65 | feed_dict.update(model.get_feed_dict(ds, True))
66 |
67 | if get_summary:
68 | loss, summary, train_op = \
69 | sess.run([self.loss, self.summary, self.train_op], feed_dict=feed_dict)
70 | else:
71 | loss, train_op = sess.run([self.loss, self.train_op], feed_dict=feed_dict)
72 | summary = None
73 | return loss, summary, train_op
74 |
--------------------------------------------------------------------------------
/tensorflow/CIFAR10/resnet/README.md:
--------------------------------------------------------------------------------
1 | # ResNet on CIFAR10 and CIFAR100
2 |
3 | (Borrowed from the tensorflow/models repository)
4 |
5 | ## Dataset
6 |
7 | https://www.cs.toronto.edu/~kriz/cifar.html
8 |
9 | ## Related papers
10 |
11 | - [Identity Mappings in Deep Residual Networks](https://arxiv.org/pdf/1603.05027v2.pdf)
12 | - [Deep Residual Learning for Image Recognition](https://arxiv.org/pdf/1512.03385v1.pdf)
13 | - [Wide Residual Networks](https://arxiv.org/pdf/1605.07146v1.pdf)
14 |
15 | ## Setting
16 |
17 | * Pad to 36x36 and random crop. Horizontal flip. Per-image whitening.
18 | * Momentum optimizer (momentum = 0.9).
19 | * Learning rate schedule: 0.01 (1 epoch), 0.1 (90 epochs), 0.01 (45 epochs), 0.001 (45 epochs).
20 | * L2 weight decay: 0.005.
21 | * Batch size: 128. (28-10 wide and 1001 layer bottleneck use 64)
22 |
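As a reading aid, the schedule above amounts to the following piecewise function of the epoch (a sketch only; the boundaries are just the cumulative epoch counts listed above):

```python
def learning_rate(epoch):
    # 0.01 for 1 warm-up epoch, then 0.1 / 0.01 / 0.001 for 90 / 45 / 45 epochs.
    if epoch < 1:
        return 0.01
    elif epoch < 91:
        return 0.1
    elif epoch < 136:
        return 0.01
    else:
        return 0.001
```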
23 | ## Results
24 |
25 | CIFAR-10 Model|Best Precision|Steps
26 | --------------|--------------|------
27 | 32 layer|92.5%|~80k
28 | 110 layer|93.6%|~80k
29 | 164 layer bottleneck|94.5%|~80k
30 | 1001 layer bottleneck|94.9%|~80k
31 | 28-10 wide|95%|~90k
32 |
33 | CIFAR-100 Model|Best Precision|Steps
34 | ---------------|--------------|-----
35 | 32 layer|68.1%|~45k
36 | 110 layer|71.3%|~60k
37 | 164 layer bottleneck|75.7%|~50k
38 | 1001 layer bottleneck|78.2%|~70k
39 | 28-10 wide|78.3%|~70k
40 |
41 | ## Prerequisites
42 |
43 | 1. Install TensorFlow 1.2 (preferably from source for higher performance) and Python 3.6.2.
44 |
45 | 2. Download CIFAR-10/CIFAR-100 dataset.
46 |
47 | ```shell
48 | curl -o cifar-10-binary.tar.gz https://www.cs.toronto.edu/~kriz/cifar-10-binary.tar.gz
49 | curl -o cifar-100-binary.tar.gz https://www.cs.toronto.edu/~kriz/cifar-100-binary.tar.gz
50 | ```
51 |
52 | ## How to run
53 |
54 | ```shell
55 | # cd to the models repository and run with bash. Expected command output shown.
56 | # The directory should contain an empty WORKSPACE file, the resnet code, and the cifar10 dataset.
57 | # Note: you can split 5k images off the training set to use as an eval set.
58 | $ ls -R
59 | .:
60 | cifar10 resnet WORKSPACE
61 |
62 | ./cifar10:
63 | data_batch_1.bin data_batch_2.bin data_batch_3.bin data_batch_4.bin
64 | data_batch_5.bin test_batch.bin
65 |
66 | ./resnet:
67 | cifar_input.py README.md resnet_main.py resnet_model.py
68 |
69 | # Train the model.
70 | $ python3 resnet/resnet_main.py --train_data_path=cifar10/data_batch* \
71 | --log_root=/tmp/resnet_model \
72 | --train_dir=/tmp/resnet_model/train \
73 | --dataset='cifar10' \
74 | --num_gpus=1
75 |
76 | # While the model is training, you can also check on its progress using tensorboard:
77 | $ tensorboard --logdir=/tmp/resnet_model
78 |
79 | # Evaluate the model.
80 | # Avoid running on the same GPU as the training job at the same time,
81 | # otherwise, you might run out of memory.
82 | $ python3 resnet/resnet_main.py --eval_data_path=cifar10/test_batch.bin \
83 | --log_root=/tmp/resnet_model \
84 | --eval_dir=/tmp/resnet_model/test \
85 | --mode=eval \
86 | --dataset='cifar10' \
87 | --num_gpus=0
88 | ```
89 |
--------------------------------------------------------------------------------
/tensorflow/SQuAD/basic/graph_handler.py:
--------------------------------------------------------------------------------
1 | import gzip
2 | import json
3 | from json import encoder
4 | import os
5 |
6 | import tensorflow as tf
7 |
8 | from basic.evaluator import Evaluation, F1Evaluation
9 | from my.utils import short_floats
10 |
11 | import pickle
12 |
13 |
14 | class GraphHandler(object):
15 | def __init__(self, config, model):
16 | self.config = config
17 | self.model = model
18 | self.saver = tf.train.Saver(max_to_keep=config.max_to_keep)
19 | self.writer = None
20 | self.save_path = os.path.join(config.save_dir, config.model_name)
21 |
22 | def initialize(self, sess):
23 | sess.run(tf.global_variables_initializer())
24 | if self.config.load:
25 | self._load(sess)
26 |
27 | if self.config.mode == 'train':
28 | self.writer = tf.summary.FileWriter(self.config.log_dir, graph=tf.get_default_graph())
29 |
30 | def save(self, sess, global_step=None):
31 | saver = tf.train.Saver(max_to_keep=self.config.max_to_keep)
32 | saver.save(sess, self.save_path, global_step=global_step)
33 |
34 | def _load(self, sess):
35 | config = self.config
36 | vars_ = {var.name.split(":")[0]: var for var in tf.global_variables()}
37 | if config.load_ema:
38 | ema = self.model.var_ema
39 | for var in tf.trainable_variables():
40 | del vars_[var.name.split(":")[0]]
41 | vars_[ema.average_name(var)] = var
42 | saver = tf.train.Saver(vars_, max_to_keep=config.max_to_keep)
43 |
44 | if config.load_path:
45 | save_path = config.load_path
46 | elif config.load_step > 0:
47 | save_path = os.path.join(config.save_dir, "{}-{}".format(config.model_name, config.load_step))
48 | else:
49 | save_dir = config.save_dir
50 | checkpoint = tf.train.get_checkpoint_state(save_dir)
51 | assert checkpoint is not None, "cannot load checkpoint at {}".format(save_dir)
52 | save_path = checkpoint.model_checkpoint_path
53 | print("Loading saved model from {}".format(save_path))
54 | saver.restore(sess, save_path)
55 |
56 | def add_summary(self, summary, global_step):
57 | self.writer.add_summary(summary, global_step)
58 |
59 | def add_summaries(self, summaries, global_step):
60 | for summary in summaries:
61 | self.add_summary(summary, global_step)
62 |
63 | def dump_eval(self, e, precision=2, path=None):
64 | assert isinstance(e, Evaluation)
65 | if self.config.dump_pickle:
66 | path = path or os.path.join(self.config.eval_dir, "{}-{}.pklz".format(e.data_type, str(e.global_step).zfill(6)))
67 | with gzip.open(path, 'wb', compresslevel=3) as fh:
68 | pickle.dump(e.dict, fh)
69 | else:
70 | path = path or os.path.join(self.config.eval_dir, "{}-{}.json".format(e.data_type, str(e.global_step).zfill(6)))
71 | with open(path, 'w') as fh:
72 | json.dump(short_floats(e.dict, precision), fh)
73 |
74 | def dump_answer(self, e, path=None):
75 | assert isinstance(e, Evaluation)
76 | path = path or os.path.join(self.config.answer_dir, "{}-{}.json".format(e.data_type, str(e.global_step).zfill(6)))
77 | with open(path, 'w') as fh:
78 | json.dump(e.id2answer_dict, fh)
79 |
80 |
--------------------------------------------------------------------------------
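
A minimal sketch of how `GraphHandler` is typically driven under TF 1.x. The `config` namespace below is hypothetical; it only supplies the attributes the class actually reads (`max_to_keep`, `save_dir`, `model_name`, `load`, `mode`, `log_dir`), and the single dummy variable stands in for a real model graph:

```python
import os
from types import SimpleNamespace

import tensorflow as tf

from basic.graph_handler import GraphHandler

# Hypothetical config carrying just the attributes GraphHandler reads.
config = SimpleNamespace(
    max_to_keep=3, save_dir="out/basic/00/save", model_name="basic",
    load=False, mode="train", log_dir="out/basic/00/log",
    dump_pickle=False, eval_dir="out/basic/00/eval", answer_dir="out/basic/00/answer",
)
os.makedirs(config.save_dir, exist_ok=True)

_ = tf.get_variable("w", shape=[1])       # stand-in for the real model's variables
model = SimpleNamespace(var_ema=None)     # only consulted when config.load_ema is set

handler = GraphHandler(config, model)
with tf.Session() as sess:
    handler.initialize(sess)              # init variables, optionally restore, open summary writer
    # ... run training steps, calling handler.add_summary(summary, step) along the way ...
    handler.save(sess, global_step=1000)  # writes out/basic/00/save/basic-1000.*
```
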
/tensorflow/SQuAD/squad/evaluate-v1.1.py:
--------------------------------------------------------------------------------
1 | """ Official evaluation script for v1.1 of the SQuAD dataset. """
2 | from __future__ import print_function
3 | from collections import Counter
4 | import string
5 | import re
6 | import argparse
7 | import json
8 | import sys
9 |
10 |
11 | def normalize_answer(s):
12 | """Lower text and remove punctuation, articles and extra whitespace."""
13 | def remove_articles(text):
14 | return re.sub(r'\b(a|an|the)\b', ' ', text)
15 |
16 | def white_space_fix(text):
17 | return ' '.join(text.split())
18 |
19 | def remove_punc(text):
20 | exclude = set(string.punctuation)
21 | return ''.join(ch for ch in text if ch not in exclude)
22 |
23 | def lower(text):
24 | return text.lower()
25 |
26 | return white_space_fix(remove_articles(remove_punc(lower(s))))
27 |
28 |
29 | def f1_score(prediction, ground_truth):
30 | prediction_tokens = normalize_answer(prediction).split()
31 | ground_truth_tokens = normalize_answer(ground_truth).split()
32 | common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
33 | num_same = sum(common.values())
34 | if num_same == 0:
35 | return 0
36 | precision = 1.0 * num_same / len(prediction_tokens)
37 | recall = 1.0 * num_same / len(ground_truth_tokens)
38 | f1 = (2 * precision * recall) / (precision + recall)
39 | return f1
40 |
41 |
42 | def exact_match_score(prediction, ground_truth):
43 | return (normalize_answer(prediction) == normalize_answer(ground_truth))
44 |
45 |
46 | def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
47 | scores_for_ground_truths = []
48 | for ground_truth in ground_truths:
49 | score = metric_fn(prediction, ground_truth)
50 | scores_for_ground_truths.append(score)
51 | return max(scores_for_ground_truths)
52 |
53 |
54 | def evaluate(dataset, predictions):
55 | f1 = exact_match = total = 0
56 | for article in dataset:
57 | for paragraph in article['paragraphs']:
58 | for qa in paragraph['qas']:
59 | total += 1
60 | if qa['id'] not in predictions:
61 | message = 'Unanswered question ' + qa['id'] + \
62 | ' will receive score 0.'
63 | print(message, file=sys.stderr)
64 | continue
65 | ground_truths = list(map(lambda x: x['text'], qa['answers']))
66 | prediction = predictions[qa['id']]
67 | exact_match += metric_max_over_ground_truths(
68 | exact_match_score, prediction, ground_truths)
69 | f1 += metric_max_over_ground_truths(
70 | f1_score, prediction, ground_truths)
71 |
72 | exact_match = 100.0 * exact_match / total
73 | f1 = 100.0 * f1 / total
74 |
75 | return {'exact_match': exact_match, 'f1': f1}
76 |
77 |
78 | if __name__ == '__main__':
79 | expected_version = '1.1'
80 | parser = argparse.ArgumentParser(
81 | description='Evaluation for SQuAD ' + expected_version)
82 | parser.add_argument('dataset_file', help='Dataset file')
83 | parser.add_argument('prediction_file', help='Prediction File')
84 | args = parser.parse_args()
85 | with open(args.dataset_file) as dataset_file:
86 | dataset_json = json.load(dataset_file)
87 | if (dataset_json['version'] != expected_version):
88 | print('Evaluation expects v-' + expected_version +
89 | ', but got dataset with v-' + dataset_json['version'],
90 | file=sys.stderr)
91 | dataset = dataset_json['data']
92 | with open(args.prediction_file) as prediction_file:
93 | predictions = json.load(prediction_file)
94 | print(json.dumps(evaluate(dataset, predictions)))
95 |
--------------------------------------------------------------------------------
/tensorflow/SQuAD/squad/evaluate.py:
--------------------------------------------------------------------------------
1 | """ Official evaluation script for v1.1 of the SQuAD dataset. [Changed name for external importing]"""
2 | from __future__ import print_function
3 | from collections import Counter
4 | import string
5 | import re
6 | import argparse
7 | import json
8 | import sys
9 |
10 |
11 | def normalize_answer(s):
12 | """Lower text and remove punctuation, articles and extra whitespace."""
13 | def remove_articles(text):
14 | return re.sub(r'\b(a|an|the)\b', ' ', text)
15 |
16 | def white_space_fix(text):
17 | return ' '.join(text.split())
18 |
19 | def remove_punc(text):
20 | exclude = set(string.punctuation)
21 | return ''.join(ch for ch in text if ch not in exclude)
22 |
23 | def lower(text):
24 | return text.lower()
25 |
26 | return white_space_fix(remove_articles(remove_punc(lower(s))))
27 |
28 |
29 | def f1_score(prediction, ground_truth):
30 | prediction_tokens = normalize_answer(prediction).split()
31 | ground_truth_tokens = normalize_answer(ground_truth).split()
32 | common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
33 | num_same = sum(common.values())
34 | if num_same == 0:
35 | return 0
36 | precision = 1.0 * num_same / len(prediction_tokens)
37 | recall = 1.0 * num_same / len(ground_truth_tokens)
38 | f1 = (2 * precision * recall) / (precision + recall)
39 | return f1
40 |
41 |
42 | def exact_match_score(prediction, ground_truth):
43 | return (normalize_answer(prediction) == normalize_answer(ground_truth))
44 |
45 |
46 | def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
47 | scores_for_ground_truths = []
48 | for ground_truth in ground_truths:
49 | score = metric_fn(prediction, ground_truth)
50 | scores_for_ground_truths.append(score)
51 | return max(scores_for_ground_truths)
52 |
53 |
54 | def evaluate(dataset, predictions):
55 | f1 = exact_match = total = 0
56 | for article in dataset:
57 | for paragraph in article['paragraphs']:
58 | for qa in paragraph['qas']:
59 | total += 1
60 | if qa['id'] not in predictions:
61 | message = 'Unanswered question ' + qa['id'] + \
62 | ' will receive score 0.'
63 | print(message, file=sys.stderr)
64 | continue
65 | ground_truths = list(map(lambda x: x['text'], qa['answers']))
66 | prediction = predictions[qa['id']]
67 | exact_match += metric_max_over_ground_truths(
68 | exact_match_score, prediction, ground_truths)
69 | f1 += metric_max_over_ground_truths(
70 | f1_score, prediction, ground_truths)
71 |
72 | exact_match = 100.0 * exact_match / total
73 | f1 = 100.0 * f1 / total
74 |
75 | return {'exact_match': exact_match, 'f1': f1}
76 |
77 |
78 | if __name__ == '__main__':
79 | expected_version = '1.1'
80 | parser = argparse.ArgumentParser(
81 | description='Evaluation for SQuAD ' + expected_version)
82 | parser.add_argument('dataset_file', help='Dataset file')
83 | parser.add_argument('prediction_file', help='Prediction File')
84 | args = parser.parse_args()
85 | with open(args.dataset_file) as dataset_file:
86 | dataset_json = json.load(dataset_file)
87 | if (dataset_json['version'] != expected_version):
88 | print('Evaluation expects v-' + expected_version +
89 | ', but got dataset with v-' + dataset_json['version'],
90 | file=sys.stderr)
91 | dataset = dataset_json['data']
92 | with open(args.prediction_file) as prediction_file:
93 | predictions = json.load(prediction_file)
94 | print(json.dumps(evaluate(dataset, predictions)))
95 |
--------------------------------------------------------------------------------
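
The metric functions above are self-contained, so a quick sanity check on toy strings (not part of the repository) shows how normalization, exact match, and F1 interact:

```python
from squad.evaluate import exact_match_score, f1_score, metric_max_over_ground_truths

prediction = "the Eiffel Tower"
ground_truths = ["Eiffel Tower", "The Eiffel Tower, Paris"]

# Case, punctuation, and articles are stripped before comparison, so the first
# ground truth is an exact match after normalization.
print(metric_max_over_ground_truths(exact_match_score, prediction, ground_truths))  # True
print(metric_max_over_ground_truths(f1_score, prediction, ground_truths))           # 1.0
```
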
/tensorflow/SQuAD/my/nltk_utils.py:
--------------------------------------------------------------------------------
1 | import nltk
2 | import numpy as np
3 |
4 |
5 | def _set_span(t, i):
6 | if isinstance(t[0], str):
7 | t.span = (i, i+len(t))
8 | else:
9 | first = True
10 | for c in t:
11 | cur_span = _set_span(c, i)
12 | i = cur_span[1]
13 | if first:
14 | min_ = cur_span[0]
15 | first = False
16 | max_ = cur_span[1]
17 | t.span = (min_, max_)
18 | return t.span
19 |
20 |
21 | def set_span(t):
22 | assert isinstance(t, nltk.tree.Tree)
23 | try:
24 | return _set_span(t, 0)
25 | except:
26 | print(t)
27 | exit()
28 |
29 |
30 | def tree_contains_span(tree, span):
31 | """
32 | Assumes that tree span has been set with set_span
33 | Returns true if any subtree of t has exact span as the given span
34 | :param t:
35 | :param span:
36 | :return bool:
37 | """
38 | return span in set(t.span for t in tree.subtrees())
39 |
40 |
41 | def span_len(span):
42 | return span[1] - span[0]
43 |
44 |
45 | def span_overlap(s1, s2):
46 | start = max(s1[0], s2[0])
47 | stop = min(s1[1], s2[1])
48 | if stop > start:
49 | return start, stop
50 | return None
51 |
52 |
53 | def span_prec(true_span, pred_span):
54 | overlap = span_overlap(true_span, pred_span)
55 | if overlap is None:
56 | return 0
57 | return span_len(overlap) / span_len(pred_span)
58 |
59 |
60 | def span_recall(true_span, pred_span):
61 | overlap = span_overlap(true_span, pred_span)
62 | if overlap is None:
63 | return 0
64 | return span_len(overlap) / span_len(true_span)
65 |
66 |
67 | def span_f1(true_span, pred_span):
68 | p = span_prec(true_span, pred_span)
69 | r = span_recall(true_span, pred_span)
70 | if p == 0 or r == 0:
71 | return 0.0
72 | return 2 * p * r / (p + r)
73 |
74 |
75 | def find_max_f1_span(tree, span):
76 | return find_max_f1_subtree(tree, span).span
77 |
78 |
79 | def find_max_f1_subtree(tree, span):
80 | return max(((t, span_f1(span, t.span)) for t in tree.subtrees()), key=lambda p: p[1])[0]
81 |
82 |
83 | def tree2matrix(tree, node2num, row_size=None, col_size=None, dtype='int32'):
84 | set_span(tree)
85 | D = tree.height() - 1
86 | B = len(tree.leaves())
87 | row_size = row_size or D
88 | col_size = col_size or B
89 | matrix = np.zeros([row_size, col_size], dtype=dtype)
90 | mask = np.zeros([row_size, col_size, col_size], dtype='bool')
91 |
92 | for subtree in tree.subtrees():
93 | row = subtree.height() - 2
94 | col = subtree.span[0]
95 | matrix[row, col] = node2num(subtree)
96 | for subsub in subtree.subtrees():
97 | if isinstance(subsub, nltk.tree.Tree):
98 | mask[row, col, subsub.span[0]] = True
99 | if not isinstance(subsub[0], nltk.tree.Tree):
100 | c = subsub.span[0]
101 | for r in range(row):
102 | mask[r, c, c] = True
103 | else:
104 | mask[row, col, col] = True
105 |
106 | return matrix, mask
107 |
108 |
109 | def load_compressed_tree(s):
110 |
111 | def compress_tree(tree):
112 | assert not isinstance(tree, str)
113 | if len(tree) == 1:
114 | if isinstance(tree[0], nltk.tree.Tree):
115 | return compress_tree(tree[0])
116 | else:
117 | return tree
118 | else:
119 | for i, t in enumerate(tree):
120 | if isinstance(t, nltk.tree.Tree):
121 | tree[i] = compress_tree(t)
122 | else:
123 | tree[i] = t
124 | return tree
125 |
126 | return compress_tree(nltk.tree.Tree.fromstring(s))
127 |
128 |
129 |
130 |
--------------------------------------------------------------------------------
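
The span metrics at the bottom of the file operate on plain `(start, stop)` token-index tuples, so they can be exercised without building a parse tree. A small hand-checked example:

```python
from my.nltk_utils import span_overlap, span_prec, span_recall, span_f1

true_span = (0, 5)   # tokens 0..4
pred_span = (2, 7)   # tokens 2..6

print(span_overlap(true_span, pred_span))   # (2, 5) -> 3 overlapping tokens
print(span_prec(true_span, pred_span))      # 3 / 5 = 0.6
print(span_recall(true_span, pred_span))    # 3 / 5 = 0.6
print(round(span_f1(true_span, pred_span), 6))  # 0.6
```
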
/tensorflow/SQuAD/basic/ensemble.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import functools
3 | import gzip
4 | import json
5 | import pickle
6 | from collections import defaultdict
7 | from operator import mul
8 |
9 | from tqdm import tqdm
10 | from squad.utils import get_phrase, get_best_span, get_span_score_pairs
11 |
12 |
13 | def get_args():
14 | parser = argparse.ArgumentParser()
15 | parser.add_argument('paths', nargs='+')
16 | parser.add_argument('-o', '--out', default='ensemble.json')
17 | parser.add_argument("--data_path", default="data/squad/data_test.json")
18 | parser.add_argument("--shared_path", default="data/squad/shared_test.json")
19 | args = parser.parse_args()
20 | return args
21 |
22 |
23 | def ensemble(args):
24 | e_list = []
25 | for path in tqdm(args.paths):
26 | with gzip.open(path, 'r') as fh:
27 | e = pickle.load(fh)
28 | e_list.append(e)
29 |
30 | with open(args.data_path, 'r') as fh:
31 | data = json.load(fh)
32 |
33 | with open(args.shared_path, 'r') as fh:
34 | shared = json.load(fh)
35 |
36 | out = {}
37 | for idx, (id_, rx) in tqdm(enumerate(zip(data['ids'], data['*x'])), total=len(e['yp'])):
38 | if idx >= len(e['yp']):
39 | # for debugging purpose
40 | break
41 | context = shared['p'][rx[0]][rx[1]]
42 | wordss = shared['x'][rx[0]][rx[1]]
43 | yp_list = [e['yp'][idx] for e in e_list]
44 | yp2_list = [e['yp2'][idx] for e in e_list]
45 | answer = ensemble4(context, wordss, yp_list, yp2_list)
46 | out[id_] = answer
47 |
48 | with open(args.out, 'w') as fh:
49 | json.dump(out, fh)
50 |
51 |
52 | def ensemble1(context, wordss, y1_list, y2_list):
53 | """
54 |
55 | :param context: Original context
56 | :param wordss: tokenized words (nested 2D list)
57 |     :param y1_list: list of start index probs (each element corresponds to probs from a single model)
58 | :param y2_list: list of stop index probs
59 | :return:
60 | """
61 | sum_y1 = combine_y_list(y1_list)
62 | sum_y2 = combine_y_list(y2_list)
63 | span, score = get_best_span(sum_y1, sum_y2)
64 | return get_phrase(context, wordss, span)
65 |
66 |
67 | def ensemble2(context, wordss, y1_list, y2_list):
68 | start_dict = defaultdict(float)
69 | stop_dict = defaultdict(float)
70 | for y1, y2 in zip(y1_list, y2_list):
71 | span, score = get_best_span(y1, y2)
72 | start_dict[span[0]] += y1[span[0][0]][span[0][1]]
73 | stop_dict[span[1]] += y2[span[1][0]][span[1][1]]
74 | start = max(start_dict.items(), key=lambda pair: pair[1])[0]
75 | stop = max(stop_dict.items(), key=lambda pair: pair[1])[0]
76 | best_span = (start, stop)
77 | return get_phrase(context, wordss, best_span)
78 |
79 |
80 | def ensemble3(context, wordss, y1_list, y2_list):
81 | d = defaultdict(float)
82 | for y1, y2 in zip(y1_list, y2_list):
83 | span, score = get_best_span(y1, y2)
84 | phrase = get_phrase(context, wordss, span)
85 | d[phrase] += score
86 | return max(d.items(), key=lambda pair: pair[1])[0]
87 |
88 |
89 | def ensemble4(context, wordss, y1_list, y2_list):
90 | d = defaultdict(lambda: 0.0)
91 | for y1, y2 in zip(y1_list, y2_list):
92 | for span, score in get_span_score_pairs(y1, y2):
93 | d[span] += score
94 | span = max(d.items(), key=lambda pair: pair[1])[0]
95 | phrase = get_phrase(context, wordss, span)
96 | return phrase
97 |
98 |
99 | def combine_y_list(y_list, op='*'):
100 | if op == '+':
101 | func = sum
102 | elif op == '*':
103 | def func(l): return functools.reduce(mul, l)
104 | else:
105 | func = op
106 | return [[func(yij_list) for yij_list in zip(*yi_list)] for yi_list in zip(*y_list)]
107 |
108 |
109 | def main():
110 | args = get_args()
111 | ensemble(args)
112 |
113 | if __name__ == "__main__":
114 | main()
115 |
116 |
117 |
--------------------------------------------------------------------------------
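
`combine_y_list` merges per-model probability grids element-wise (product by default, sum with `op='+'`). A toy example with two "models" and one two-word sentence, just to show the shapes involved:

```python
from basic.ensemble import combine_y_list

# Each y is [num_sentences][num_words] of start (or stop) probabilities.
y_model_a = [[0.25, 0.75]]
y_model_b = [[0.50, 0.50]]

print(combine_y_list([y_model_a, y_model_b]))           # [[0.125, 0.375]] (element-wise product)
print(combine_y_list([y_model_a, y_model_b], op='+'))   # [[0.75, 1.25]]   (element-wise sum)
```
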
/pytorch/CIFAR10/benchmark/cifar10/models/densenet.py:
--------------------------------------------------------------------------------
1 | '''DenseNet in PyTorch.'''
2 | import math
3 |
4 | import torch
5 | import torch.nn as nn
6 | import torch.nn.functional as F
7 |
8 |
9 | class Bottleneck(nn.Module):
10 | def __init__(self, in_planes, growth_rate):
11 | super(Bottleneck, self).__init__()
12 | self.bn1 = nn.BatchNorm2d(in_planes)
13 | self.conv1 = nn.Conv2d(in_planes, 4 * growth_rate, kernel_size=1, bias=False)
14 | self.bn2 = nn.BatchNorm2d(4 * growth_rate)
15 | self.conv2 = nn.Conv2d(4 * growth_rate, growth_rate, kernel_size=3, padding=1, bias=False)
16 |
17 | def forward(self, x):
18 | out = self.conv1(F.relu(self.bn1(x)))
19 | out = self.conv2(F.relu(self.bn2(out)))
20 | out = torch.cat([out, x], 1)
21 | return out
22 |
23 |
24 | class Transition(nn.Module):
25 | def __init__(self, in_planes, out_planes, last=False, pool_size=2):
26 | super(Transition, self).__init__()
27 | self.last = last
28 | self.pool_size = pool_size
29 | self.bn = nn.BatchNorm2d(in_planes)
30 | if not self.last:
31 | self.conv = nn.Conv2d(in_planes, out_planes, kernel_size=1, bias=False)
32 |
33 | def forward(self, x):
34 | out = F.relu(self.bn(x))
35 | if not self.last:
36 | out = self.conv(out)
37 | out = F.avg_pool2d(out, self.pool_size)
38 | return out
39 |
40 |
41 | class DenseNet(nn.Module):
42 | def __init__(self, block, nblocks, growth_rate=12, reduction=0.5, num_classes=10):
43 | super(DenseNet, self).__init__()
44 | # TODO: Add drop for CIFAR10 without data augmentation
45 | self.growth_rate = growth_rate
46 |
47 | num_planes = 2 * growth_rate
48 | self.conv1 = nn.Conv2d(3, num_planes, kernel_size=3, padding=1, bias=False)
49 |
50 | self.dense1 = self._make_dense_layers(block, num_planes, nblocks[0])
51 | num_planes += nblocks[0] * growth_rate
52 | out_planes = int(math.floor(num_planes*reduction))
53 | self.trans1 = Transition(num_planes, out_planes)
54 | num_planes = out_planes
55 |
56 | self.dense2 = self._make_dense_layers(block, num_planes, nblocks[1])
57 | num_planes += nblocks[1] * growth_rate
58 | out_planes = int(math.floor(num_planes*reduction))
59 | self.trans2 = Transition(num_planes, out_planes)
60 | num_planes = out_planes
61 |
62 | self.dense3 = self._make_dense_layers(block, num_planes, nblocks[2])
63 | num_planes += nblocks[2] * growth_rate
64 | self.trans3 = Transition(num_planes, num_planes, last=True, pool_size=8)
65 |
66 | self.linear = nn.Linear(num_planes, num_classes)
67 |
68 | for m in self.modules():
69 | if isinstance(m, nn.Conv2d):
70 | n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
71 | m.weight.data.normal_(0, math.sqrt(2. / n))
72 | elif isinstance(m, nn.BatchNorm2d):
73 | m.weight.data.fill_(1)
74 | m.bias.data.zero_()
75 |
76 | def _make_dense_layers(self, block, in_planes, nblock):
77 | layers = []
78 | for i in range(nblock):
79 | layers.append(block(in_planes, self.growth_rate))
80 | in_planes += self.growth_rate
81 | return nn.Sequential(*layers)
82 |
83 | def forward(self, x):
84 | out = self.conv1(x)
85 | out = self.trans1(self.dense1(out))
86 | out = self.trans2(self.dense2(out))
87 | out = self.trans3(self.dense3(out))
88 | out = out.view(out.size(0), -1)
89 | out = self.linear(out)
90 | return out
91 |
92 |
93 | def DenseNetBC(L, k):
94 | assert (L - 4) % 6 == 0
95 | num_blocks = int((L - 4) / 6)
96 | return DenseNet(Bottleneck, [num_blocks] * 3, growth_rate=k, reduction=0.5)
97 |
98 |
99 | def DenseNetBC100():
100 | return DenseNetBC(100, 12)
101 |
102 |
103 | def DenseNetBC250():
104 | return DenseNetBC(250, 24)
105 |
106 |
107 | def DenseNetBC190():
108 | return DenseNetBC(190, 40)
109 |
--------------------------------------------------------------------------------
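
A quick shape check of the DenseNet-BC constructors on a CIFAR-sized batch (illustrative only; assumes the `benchmark` package is importable and a reasonably recent PyTorch):

```python
import torch

from benchmark.cifar10.models.densenet import DenseNetBC100

model = DenseNetBC100()            # depth 100, growth rate 12
x = torch.randn(2, 3, 32, 32)      # two CIFAR-10 sized images
with torch.no_grad():
    out = model(x)
print(out.shape)                   # torch.Size([2, 10]) - one logit per class
```
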
/tensorflow/SQuAD/my/tensorflow/rnn.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | from tensorflow.python.ops.rnn import dynamic_rnn as _dynamic_rnn, \
3 | bidirectional_dynamic_rnn as _bidirectional_dynamic_rnn
4 |
5 | from my.tensorflow import flatten, reconstruct
6 |
7 |
8 | def dynamic_rnn(cell, inputs, sequence_length=None, initial_state=None,
9 | dtype=None, parallel_iterations=None, swap_memory=False,
10 | time_major=False, scope=None):
11 | assert not time_major # TODO : to be implemented later!
12 | flat_inputs = flatten(inputs, 2) # [-1, J, d]
13 | flat_len = None if sequence_length is None else tf.cast(flatten(sequence_length, 0), 'int64')
14 |
15 | flat_outputs, final_state = _dynamic_rnn(cell, flat_inputs, sequence_length=flat_len,
16 | initial_state=initial_state, dtype=dtype,
17 | parallel_iterations=parallel_iterations, swap_memory=swap_memory,
18 | time_major=time_major, scope=scope)
19 |
20 | outputs = reconstruct(flat_outputs, inputs, 2)
21 | return outputs, final_state
22 |
23 |
24 | def bw_dynamic_rnn(cell, inputs, sequence_length=None, initial_state=None,
25 | dtype=None, parallel_iterations=None, swap_memory=False,
26 | time_major=False, scope=None):
27 | assert not time_major # TODO : to be implemented later!
28 |
29 | flat_inputs = flatten(inputs, 2) # [-1, J, d]
30 | flat_len = None if sequence_length is None else tf.cast(flatten(sequence_length, 0), 'int64')
31 |
32 | flat_inputs = tf.reverse(flat_inputs, 1) if sequence_length is None \
33 | else tf.reverse_sequence(flat_inputs, sequence_length, 1)
34 | flat_outputs, final_state = _dynamic_rnn(cell, flat_inputs, sequence_length=flat_len,
35 | initial_state=initial_state, dtype=dtype,
36 | parallel_iterations=parallel_iterations, swap_memory=swap_memory,
37 | time_major=time_major, scope=scope)
38 | flat_outputs = tf.reverse(flat_outputs, 1) if sequence_length is None \
39 | else tf.reverse_sequence(flat_outputs, sequence_length, 1)
40 |
41 | outputs = reconstruct(flat_outputs, inputs, 2)
42 | return outputs, final_state
43 |
44 |
45 | def bidirectional_dynamic_rnn(cell_fw, cell_bw, inputs, sequence_length=None,
46 | initial_state_fw=None, initial_state_bw=None,
47 | dtype=None, parallel_iterations=None,
48 | swap_memory=False, time_major=False, scope=None):
49 | assert not time_major
50 |
51 | flat_inputs = flatten(inputs, 2) # [-1, J, d]
52 | flat_len = None if sequence_length is None else tf.cast(flatten(sequence_length, 0), 'int64')
53 |
54 | (flat_fw_outputs, flat_bw_outputs), final_state = \
55 | _bidirectional_dynamic_rnn(cell_fw, cell_bw, flat_inputs, sequence_length=flat_len,
56 | initial_state_fw=initial_state_fw, initial_state_bw=initial_state_bw,
57 | dtype=dtype, parallel_iterations=parallel_iterations, swap_memory=swap_memory,
58 | time_major=time_major, scope=scope)
59 |
60 | fw_outputs = reconstruct(flat_fw_outputs, inputs, 2)
61 | bw_outputs = reconstruct(flat_bw_outputs, inputs, 2)
62 | # FIXME : final state is not reshaped!
63 | return (fw_outputs, bw_outputs), final_state
64 |
65 |
66 | def bidirectional_rnn(cell_fw, cell_bw, inputs,
67 | initial_state_fw=None, initial_state_bw=None,
68 | dtype=None, sequence_length=None, scope=None):
69 |
70 | flat_inputs = flatten(inputs, 2) # [-1, J, d]
71 | flat_len = None if sequence_length is None else tf.cast(flatten(sequence_length, 0), 'int64')
72 |
73 | (flat_fw_outputs, flat_bw_outputs), final_state = \
74 | tf.nn.bidirectional_dynamic_rnn(cell_fw, cell_bw, flat_inputs, sequence_length=flat_len,
75 | initial_state_fw=initial_state_fw, initial_state_bw=initial_state_bw,
76 | dtype=dtype, scope=scope)
77 |
78 | fw_outputs = reconstruct(flat_fw_outputs, inputs, 2)
79 | bw_outputs = reconstruct(flat_bw_outputs, inputs, 2)
80 | # FIXME : final state is not reshaped!
81 | return (fw_outputs, bw_outputs), final_state
82 |
--------------------------------------------------------------------------------
/tensorflow/SQuAD/tree/visualizer.py:
--------------------------------------------------------------------------------
1 | import shutil
2 | from collections import OrderedDict
3 | import http.server
4 | import socketserver
5 | import argparse
6 | import json
7 | import os
8 | import numpy as np
9 | from tqdm import tqdm
10 |
11 | from jinja2 import Environment, FileSystemLoader
12 |
13 |
14 | def bool_(string):
15 | if string == 'True':
16 | return True
17 | elif string == 'False':
18 | return False
19 | else:
20 | raise Exception()
21 |
22 | def get_args():
23 | parser = argparse.ArgumentParser()
24 | parser.add_argument("--model_name", type=str, default='basic')
25 | parser.add_argument("--data_type", type=str, default='dev')
26 | parser.add_argument("--step", type=int, default=5000)
27 | parser.add_argument("--template_name", type=str, default="visualizer.html")
28 | parser.add_argument("--num_per_page", type=int, default=100)
29 | parser.add_argument("--data_dir", type=str, default="data/squad")
30 | parser.add_argument("--port", type=int, default=8000)
31 | parser.add_argument("--host", type=str, default="0.0.0.0")
32 | parser.add_argument("--open", type=str, default='False')
33 | parser.add_argument("--run_id", type=str, default="0")
34 |
35 | args = parser.parse_args()
36 | return args
37 |
38 |
39 | def _decode(decoder, sent):
40 | return " ".join(decoder[idx] for idx in sent)
41 |
42 |
43 | def accuracy2_visualizer(args):
44 | model_name = args.model_name
45 | data_type = args.data_type
46 | num_per_page = args.num_per_page
47 | data_dir = args.data_dir
48 | run_id = args.run_id.zfill(2)
49 | step = args.step
50 |
51 |     eval_path = os.path.join("out", model_name, run_id, "eval", "{}-{}.json".format(data_type, str(step).zfill(6)))
52 | eval_ = json.load(open(eval_path, 'r'))
53 |
54 | _id = 0
55 | html_dir = "/tmp/list_results%d" % _id
56 | while os.path.exists(html_dir):
57 | _id += 1
58 | html_dir = "/tmp/list_results%d" % _id
59 |
60 | if os.path.exists(html_dir):
61 | shutil.rmtree(html_dir)
62 | os.mkdir(html_dir)
63 |
64 | cur_dir = os.path.dirname(os.path.realpath(__file__))
65 | templates_dir = os.path.join(cur_dir, 'templates')
66 | env = Environment(loader=FileSystemLoader(templates_dir))
67 | env.globals.update(zip=zip, reversed=reversed)
68 | template = env.get_template(args.template_name)
69 |
70 | data_path = os.path.join(data_dir, "data_{}.json".format(data_type))
71 | shared_path = os.path.join(data_dir, "shared_{}.json".format(data_type))
72 | data = json.load(open(data_path, 'r'))
73 | shared = json.load(open(shared_path, 'r'))
74 |
75 | rows = []
76 | for i, (idx, yi, ypi) in enumerate(zip(*[eval_[key] for key in ('idxs', 'y', 'yp')])):
77 | id_, q, rx = (data[key][idx] for key in ('ids', 'q', '*x'))
78 | x = shared['x'][rx[0]][rx[1]]
79 | ques = [" ".join(q)]
80 | para = [[word for word in sent] for sent in x]
81 | row = {
82 | 'id': id_,
83 | 'title': "Hello world!",
84 | 'ques': ques,
85 | 'para': para,
86 | 'y': yi,
87 | 'y2': yi,
88 | 'yp': ypi,
89 | 'yp2': ypi,
90 | 'a': ""
91 | }
92 | rows.append(row)
93 |
94 | if i % num_per_page == 0:
95 | html_path = os.path.join(html_dir, "%s.html" % str(i).zfill(8))
96 |
97 | if (i + 1) % num_per_page == 0 or (i + 1) == len(eval_['y']):
98 | var_dict = {'title': "Accuracy Visualization",
99 | 'rows': rows
100 | }
101 | with open(html_path, "wb") as f:
102 | f.write(template.render(**var_dict).encode('UTF-8'))
103 | rows = []
104 |
105 | os.chdir(html_dir)
106 | port = args.port
107 | host = args.host
108 | # Overriding to suppress log message
109 | class MyHandler(http.server.SimpleHTTPRequestHandler):
110 | def log_message(self, format, *args):
111 | pass
112 | handler = MyHandler
113 | httpd = socketserver.TCPServer((host, port), handler)
114 | if args.open == 'True':
115 | os.system("open http://%s:%d" % (args.host, args.port))
116 | print("serving at %s:%d" % (host, port))
117 | httpd.serve_forever()
118 |
119 |
120 | if __name__ == "__main__":
121 | ARGS = get_args()
122 | accuracy2_visualizer(ARGS)
--------------------------------------------------------------------------------
/tensorflow/SQuAD/basic_cnn/visualizer.py:
--------------------------------------------------------------------------------
1 | import shutil
2 | from collections import OrderedDict
3 | import http.server
4 | import socketserver
5 | import argparse
6 | import json
7 | import os
8 | import numpy as np
9 | from tqdm import tqdm
10 |
11 | from jinja2 import Environment, FileSystemLoader
12 |
13 | from basic_cnn.evaluator import get_span_score_pairs, get_best_span
14 |
15 |
16 | def bool_(string):
17 | if string == 'True':
18 | return True
19 | elif string == 'False':
20 | return False
21 | else:
22 | raise Exception()
23 |
24 | def get_args():
25 | parser = argparse.ArgumentParser()
26 | parser.add_argument("--model_name", type=str, default='basic')
27 | parser.add_argument("--data_type", type=str, default='dev')
28 | parser.add_argument("--step", type=int, default=5000)
29 | parser.add_argument("--template_name", type=str, default="visualizer.html")
30 | parser.add_argument("--num_per_page", type=int, default=100)
31 | parser.add_argument("--data_dir", type=str, default="data/squad")
32 | parser.add_argument("--port", type=int, default=8000)
33 | parser.add_argument("--host", type=str, default="0.0.0.0")
34 | parser.add_argument("--open", type=str, default='False')
35 | parser.add_argument("--run_id", type=str, default="0")
36 |
37 | args = parser.parse_args()
38 | return args
39 |
40 |
41 | def _decode(decoder, sent):
42 | return " ".join(decoder[idx] for idx in sent)
43 |
44 |
45 | def accuracy2_visualizer(args):
46 | model_name = args.model_name
47 | data_type = args.data_type
48 | num_per_page = args.num_per_page
49 | data_dir = args.data_dir
50 | run_id = args.run_id.zfill(2)
51 | step = args.step
52 |
53 |     eval_path = os.path.join("out", model_name, run_id, "eval", "{}-{}.json".format(data_type, str(step).zfill(6)))
54 | print("loading {}".format(eval_path))
55 | eval_ = json.load(open(eval_path, 'r'))
56 |
57 | _id = 0
58 | html_dir = "/tmp/list_results%d" % _id
59 | while os.path.exists(html_dir):
60 | _id += 1
61 | html_dir = "/tmp/list_results%d" % _id
62 |
63 | if os.path.exists(html_dir):
64 | shutil.rmtree(html_dir)
65 | os.mkdir(html_dir)
66 |
67 | cur_dir = os.path.dirname(os.path.realpath(__file__))
68 | templates_dir = os.path.join(cur_dir, 'templates')
69 | env = Environment(loader=FileSystemLoader(templates_dir))
70 | env.globals.update(zip=zip, reversed=reversed)
71 | template = env.get_template(args.template_name)
72 |
73 | data_path = os.path.join(data_dir, "data_{}.json".format(data_type))
74 | shared_path = os.path.join(data_dir, "shared_{}.json".format(data_type))
75 | print("loading {}".format(data_path))
76 | data = json.load(open(data_path, 'r'))
77 | print("loading {}".format(shared_path))
78 | shared = json.load(open(shared_path, 'r'))
79 |
80 | rows = []
81 | for i, (idx, yi, ypi, yp2i) in tqdm(enumerate(zip(*[eval_[key] for key in ('idxs', 'y', 'yp', 'yp2')])), total=len(eval_['idxs'])):
82 | id_, q, rx, answers = (data[key][idx] for key in ('ids', 'q', '*x', 'answerss'))
83 | x = shared['x'][rx[0]][rx[1]]
84 | ques = [" ".join(q)]
85 | para = [[word for word in sent] for sent in x]
86 | span = get_best_span(ypi, yp2i)
87 | ap = get_segment(para, span)
88 | score = "{:.3f}".format(ypi[span[0][0]][span[0][1]] * yp2i[span[1][0]][span[1][1]-1])
89 |
90 | row = {
91 | 'id': id_,
92 | 'title': "Hello world!",
93 | 'ques': ques,
94 | 'para': para,
95 | 'y': yi[0][0],
96 | 'y2': yi[0][1],
97 | 'yp': ypi,
98 | 'yp2': yp2i,
99 | 'a': answers,
100 | 'ap': ap,
101 | 'score': score
102 | }
103 | rows.append(row)
104 |
105 | if i % num_per_page == 0:
106 | html_path = os.path.join(html_dir, "%s.html" % str(i).zfill(8))
107 |
108 | if (i + 1) % num_per_page == 0 or (i + 1) == len(eval_['y']):
109 | var_dict = {'title': "Accuracy Visualization",
110 | 'rows': rows
111 | }
112 | with open(html_path, "wb") as f:
113 | f.write(template.render(**var_dict).encode('UTF-8'))
114 | rows = []
115 |
116 | os.chdir(html_dir)
117 | port = args.port
118 | host = args.host
119 | # Overriding to suppress log message
120 | class MyHandler(http.server.SimpleHTTPRequestHandler):
121 | def log_message(self, format, *args):
122 | pass
123 | handler = MyHandler
124 | httpd = socketserver.TCPServer((host, port), handler)
125 | if args.open == 'True':
126 | os.system("open http://%s:%d" % (args.host, args.port))
127 | print("serving at %s:%d" % (host, port))
128 | httpd.serve_forever()
129 |
130 |
131 | def get_segment(para, span):
132 | return " ".join(para[span[0][0]][span[0][1]:span[1][1]])
133 |
134 |
135 | if __name__ == "__main__":
136 | ARGS = get_args()
137 | accuracy2_visualizer(ARGS)
--------------------------------------------------------------------------------
/tensorflow/SQuAD/squad/utils.py:
--------------------------------------------------------------------------------
1 | import re
2 | import numpy as np
3 |
4 |
5 | def get_2d_spans(text, tokenss):
6 | spanss = []
7 | cur_idx = 0
8 | for tokens in tokenss:
9 | spans = []
10 | for token in tokens:
11 | if text.find(token, cur_idx) < 0:
12 | print(tokens)
13 | print("{} {} {}".format(token, cur_idx, text))
14 | raise Exception()
15 | cur_idx = text.find(token, cur_idx)
16 | spans.append((cur_idx, cur_idx + len(token)))
17 | cur_idx += len(token)
18 | spanss.append(spans)
19 | return spanss
20 |
21 |
22 | def get_word_span(context, wordss, start, stop):
23 | spanss = get_2d_spans(context, wordss)
24 | idxs = []
25 | for sent_idx, spans in enumerate(spanss):
26 | for word_idx, span in enumerate(spans):
27 | if not (stop <= span[0] or start >= span[1]):
28 | idxs.append((sent_idx, word_idx))
29 |
30 | assert len(idxs) > 0, "{} {} {} {}".format(context, spanss, start, stop)
31 | return idxs[0], (idxs[-1][0], idxs[-1][1] + 1)
32 |
33 |
34 | def get_phrase(context, wordss, span):
35 | """
36 |     Obtain the phrase as a substring of context, given a word-level (start, stop) span
37 |     :param context:
38 |     :param wordss:
39 |     :param span: ((start_sent_idx, start_word_idx),
40 |                   (stop_sent_idx, stop_word_idx)) - word-level start/stop
41 | :return:
42 | """
43 | start, stop = span
44 | flat_start = get_flat_idx(wordss, start)
45 | flat_stop = get_flat_idx(wordss, stop)
46 | words = sum(wordss, [])
47 | char_idx = 0
48 | char_start, char_stop = None, None
49 | for word_idx, word in enumerate(words):
50 | char_idx = context.find(word, char_idx)
51 | assert char_idx >= 0
52 | if word_idx == flat_start:
53 | char_start = char_idx
54 | char_idx += len(word)
55 | if word_idx == flat_stop - 1:
56 | char_stop = char_idx
57 | assert char_start is not None
58 | assert char_stop is not None
59 | return context[char_start:char_stop]
60 |
61 |
62 | def get_flat_idx(wordss, idx):
63 | return sum(len(words) for words in wordss[:idx[0]]) + idx[1]
64 |
65 |
66 | def get_word_idx(context, wordss, idx):
67 | spanss = get_2d_spans(context, wordss)
68 | return spanss[idx[0]][idx[1]][0]
69 |
70 |
71 | def process_tokens(temp_tokens):
72 | tokens = []
73 | for token in temp_tokens:
74 | flag = False
75 | l = ("-", "\u2212", "\u2014", "\u2013", "/", "~", '"', "'", "\u201C", "\u2019", "\u201D", "\u2018", "\u00B0")
76 |         # \u2013 is en-dash. Used for number to number
77 | # l = ("-", "\u2212", "\u2014", "\u2013")
78 | # l = ("\u2013",)
79 | tokens.extend(re.split("([{}])".format("".join(l)), token))
80 | return tokens
81 |
82 |
83 | def get_best_span(ypi, yp2i):
84 | max_val = 0
85 | best_word_span = (0, 1)
86 | best_sent_idx = 0
87 | for f, (ypif, yp2if) in enumerate(zip(ypi, yp2i)):
88 | argmax_j1 = 0
89 | for j in range(len(ypif)):
90 | val1 = ypif[argmax_j1]
91 | if val1 < ypif[j]:
92 | val1 = ypif[j]
93 | argmax_j1 = j
94 |
95 | val2 = yp2if[j]
96 | if val1 * val2 > max_val:
97 | best_word_span = (argmax_j1, j)
98 | best_sent_idx = f
99 | max_val = val1 * val2
100 | return ((best_sent_idx, best_word_span[0]), (best_sent_idx, best_word_span[1] + 1)), float(max_val)
101 |
102 |
103 | def get_best_span_wy(wypi, th):
104 | chunk_spans = []
105 | scores = []
106 | chunk_start = None
107 | score = 0
108 | l = 0
109 | th = min(th, np.max(wypi))
110 | for f, wypif in enumerate(wypi):
111 | for j, wypifj in enumerate(wypif):
112 | if wypifj >= th:
113 | if chunk_start is None:
114 | chunk_start = f, j
115 | score += wypifj
116 | l += 1
117 | else:
118 | if chunk_start is not None:
119 | chunk_stop = f, j
120 | chunk_spans.append((chunk_start, chunk_stop))
121 | scores.append(score/l)
122 | score = 0
123 | l = 0
124 | chunk_start = None
125 | if chunk_start is not None:
126 | chunk_stop = f, j+1
127 | chunk_spans.append((chunk_start, chunk_stop))
128 | scores.append(score/l)
129 | score = 0
130 | l = 0
131 | chunk_start = None
132 |
133 | return max(zip(chunk_spans, scores), key=lambda pair: pair[1])
134 |
135 |
136 | def get_span_score_pairs(ypi, yp2i):
137 | span_score_pairs = []
138 | for f, (ypif, yp2if) in enumerate(zip(ypi, yp2i)):
139 | for j in range(len(ypif)):
140 | for k in range(j, len(yp2if)):
141 | span = ((f, j), (f, k+1))
142 | score = ypif[j] * yp2if[k]
143 | span_score_pairs.append((span, score))
144 | return span_score_pairs
145 |
146 |
147 |
--------------------------------------------------------------------------------
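
The helpers above move between word-level spans and character-level substrings of the original context. A toy, hand-checked walk-through of `get_best_span` and `get_phrase`:

```python
from squad.utils import get_best_span, get_phrase

context = "The cat sat on the mat"
wordss = [["The", "cat", "sat", "on", "the", "mat"]]  # one tokenized sentence

# Per-sentence, per-word start and stop probabilities (made-up numbers).
ypi = [[0.05, 0.60, 0.20, 0.05, 0.05, 0.05]]   # start: peaks at "cat"
yp2i = [[0.05, 0.05, 0.60, 0.10, 0.10, 0.10]]  # stop:  peaks at "sat"

span, score = get_best_span(ypi, yp2i)
print(span)                               # ((0, 1), (0, 3)) - words 1..2 of sentence 0
print(round(score, 2))                    # 0.36 = 0.6 * 0.6
print(get_phrase(context, wordss, span))  # cat sat
```
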
/tensorflow/CIFAR10/resnet/cifar_input.py:
--------------------------------------------------------------------------------
1 | # Copyright 2016 The TensorFlow Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 |
16 | """CIFAR dataset input module.
17 | """
18 |
19 | import tensorflow as tf
20 |
21 | def build_input(dataset, data_path, batch_size, mode, data_format):
22 | """Build CIFAR image and labels.
23 |
24 | Args:
25 | dataset: Either 'cifar10' or 'cifar100'.
26 | data_path: Filename for data.
27 | batch_size: Input batch size.
28 | mode: Either 'train' or 'eval'.
29 | data_format: Either 'NCHW' or 'NHWC'.
30 | Returns:
31 | images: Batches of images. [batch_size, image_size, image_size, 3]
32 | labels: Batches of labels. [batch_size, num_classes]
33 | Raises:
34 | ValueError: when the specified dataset is not supported.
35 | """
36 | with tf.device('/cpu:0'):
37 | image_size = 32
38 | if dataset == 'cifar10':
39 | label_bytes = 1
40 | label_offset = 0
41 | num_classes = 10
42 | elif dataset == 'cifar100':
43 | label_bytes = 1
44 | label_offset = 1
45 | num_classes = 100
46 | else:
47 |       raise ValueError('Not supported dataset %s' % dataset)
48 |
49 | depth = 3
50 | image_bytes = image_size * image_size * depth
51 | record_bytes = label_bytes + label_offset + image_bytes
52 |
53 | data_files = tf.gfile.Glob(data_path)
54 | file_queue = tf.train.string_input_producer(data_files, shuffle=True)
55 | # Read examples from files in the filename queue.
56 | reader = tf.FixedLengthRecordReader(record_bytes=record_bytes)
57 | _, value = reader.read(file_queue)
58 |
59 | # Convert these examples to dense labels and processed images.
60 | record = tf.reshape(tf.decode_raw(value, tf.uint8), [record_bytes])
61 | label = tf.cast(tf.slice(record, [label_offset], [label_bytes]), tf.int32)
62 |     # Convert from a string to [depth * height * width], then to [depth, height, width].
63 | depth_major = tf.reshape(tf.slice(record, [label_bytes], [image_bytes]),
64 | [depth, image_size, image_size])
65 | # Convert from [depth, height, width] to [height, width, depth].
66 | image = tf.cast(tf.transpose(depth_major, [1, 2, 0]), tf.float32)
67 |
68 | if mode == 'train':
69 | image = tf.image.resize_image_with_crop_or_pad(
70 | image, image_size+4, image_size+4)
71 | image = tf.random_crop(image, [image_size, image_size, 3])
72 | image = tf.image.random_flip_left_right(image)
73 |       # Brightness/saturation/contrast provides small gains .2%~.5% on cifar.
74 | # image = tf.image.random_brightness(image, max_delta=63. / 255.)
75 | # image = tf.image.random_saturation(image, lower=0.5, upper=1.5)
76 | # image = tf.image.random_contrast(image, lower=0.2, upper=1.8)
77 | image = tf.image.per_image_standardization(image)
78 |
79 | example_queue = tf.RandomShuffleQueue(
80 | capacity=16 * batch_size,
81 | min_after_dequeue=8 * batch_size,
82 | dtypes=[tf.float32, tf.int32],
83 | shapes=[[image_size, image_size, depth], [1]])
84 | num_threads = 16
85 | else:
86 | image = tf.image.resize_image_with_crop_or_pad(
87 | image, image_size, image_size)
88 | image = tf.image.per_image_standardization(image)
89 |
90 | example_queue = tf.FIFOQueue(
91 | 3 * batch_size,
92 | dtypes=[tf.float32, tf.int32],
93 | shapes=[[image_size, image_size, depth], [1]])
94 | num_threads = 1
95 |
96 | example_enqueue_op = example_queue.enqueue([image, label])
97 | tf.train.add_queue_runner(tf.train.queue_runner.QueueRunner(
98 | example_queue, [example_enqueue_op] * num_threads))
99 |
100 | # Read 'batch' labels + images from the example queue.
101 | images, labels = example_queue.dequeue_many(batch_size)
102 | labels = tf.reshape(labels, [batch_size, 1])
103 | indices = tf.reshape(tf.range(0, batch_size, 1), [batch_size, 1])
104 | labels = tf.sparse_to_dense(
105 | tf.concat(values=[indices, labels], axis=1),
106 | [batch_size, num_classes], 1.0, 0.0)
107 |
108 | if data_format == 'NCHW':
109 | images = tf.transpose(images, [0, 3, 1, 2])
110 |
111 | assert len(images.get_shape()) == 4
112 | assert images.get_shape()[0] == batch_size
113 | if data_format == 'NCHW':
114 | assert images.get_shape()[1] == 3
115 | else:
116 | assert images.get_shape()[-1] == 3
117 | assert len(labels.get_shape()) == 2
118 | assert labels.get_shape()[0] == batch_size
119 | assert labels.get_shape()[1] == num_classes
120 |
121 | return images, labels
122 |
--------------------------------------------------------------------------------
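
`build_input` returns queue-backed tensors, so under TF 1.x they are consumed by starting the queue runners before calling `session.run`. A minimal consumption sketch, assuming the CIFAR-10 binaries sit under `cifar10/` as in the README above and `resnet/` is importable:

```python
import tensorflow as tf

from resnet import cifar_input

images, labels = cifar_input.build_input(
    'cifar10', 'cifar10/data_batch*', batch_size=128, mode='train', data_format='NHWC')

with tf.Session() as sess:
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)  # feed the input queues
    image_batch, label_batch = sess.run([images, labels])
    print(image_batch.shape, label_batch.shape)  # (128, 32, 32, 3) (128, 10)
    coord.request_stop()
    coord.join(threads)
```
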
/tensorflow/SQuAD/basic/visualizer.py:
--------------------------------------------------------------------------------
1 | import shutil
2 | from collections import OrderedDict
3 | import http.server
4 | import socketserver
5 | import argparse
6 | import json
7 | import os
8 | import numpy as np
9 | from tqdm import tqdm
10 | import pickle
11 | import gzip
12 |
13 | from jinja2 import Environment, FileSystemLoader
14 |
15 | from squad.utils import get_best_span, get_best_span_wy
16 |
17 |
18 | def bool_(string):
19 | if string == 'True':
20 | return True
21 | elif string == 'False':
22 | return False
23 | else:
24 | raise Exception()
25 |
26 | def get_args():
27 | parser = argparse.ArgumentParser()
28 | parser.add_argument("--model_name", type=str, default='basic')
29 | parser.add_argument("--data_type", type=str, default='dev')
30 | parser.add_argument("--step", type=int, default=5000)
31 | parser.add_argument("--template_name", type=str, default="visualizer.html")
32 | parser.add_argument("--num_per_page", type=int, default=100)
33 | parser.add_argument("--data_dir", type=str, default="data/squad")
34 | parser.add_argument("--port", type=int, default=8000)
35 | parser.add_argument("--host", type=str, default="0.0.0.0")
36 | parser.add_argument("--open", type=str, default='False')
37 | parser.add_argument("--run_id", type=str, default="0")
38 | parser.add_argument("-w", "--wy", action='store_true')
39 |
40 | args = parser.parse_args()
41 | return args
42 |
43 |
44 | def _decode(decoder, sent):
45 | return " ".join(decoder[idx] for idx in sent)
46 |
47 |
48 | def accuracy2_visualizer(args):
49 | model_name = args.model_name
50 | data_type = args.data_type
51 | num_per_page = args.num_per_page
52 | data_dir = args.data_dir
53 | run_id = args.run_id.zfill(2)
54 | step = args.step
55 |
56 |     eval_path = os.path.join("out", model_name, run_id, "eval", "{}-{}.pklz".format(data_type, str(step).zfill(6)))
57 | print("loading {}".format(eval_path))
58 | eval_ = pickle.load(gzip.open(eval_path, 'r'))
59 |
60 | _id = 0
61 | html_dir = "/tmp/list_results%d" % _id
62 | while os.path.exists(html_dir):
63 | _id += 1
64 | html_dir = "/tmp/list_results%d" % _id
65 |
66 | if os.path.exists(html_dir):
67 | shutil.rmtree(html_dir)
68 | os.mkdir(html_dir)
69 |
70 | cur_dir = os.path.dirname(os.path.realpath(__file__))
71 | templates_dir = os.path.join(cur_dir, 'templates')
72 | env = Environment(loader=FileSystemLoader(templates_dir))
73 | env.globals.update(zip=zip, reversed=reversed)
74 | template = env.get_template(args.template_name)
75 |
76 | data_path = os.path.join(data_dir, "data_{}.json".format(data_type))
77 | shared_path = os.path.join(data_dir, "shared_{}.json".format(data_type))
78 | print("loading {}".format(data_path))
79 | data = json.load(open(data_path, 'r'))
80 | print("loading {}".format(shared_path))
81 | shared = json.load(open(shared_path, 'r'))
82 |
83 | rows = []
84 | for i, (idx, yi, ypi, yp2i, wypi) in tqdm(enumerate(zip(*[eval_[key] for key in ('idxs', 'y', 'yp', 'yp2', 'wyp')])), total=len(eval_['idxs'])):
85 | id_, q, rx, answers = (data[key][idx] for key in ('ids', 'q', '*x', 'answerss'))
86 | x = shared['x'][rx[0]][rx[1]]
87 | ques = [" ".join(q)]
88 | para = [[word for word in sent] for sent in x]
89 | span, score = get_best_span_wy(wypi, 0.5) if args.wy else get_best_span(ypi, yp2i)
90 | ap = get_segment(para, span)
91 | # score = "{:.3f}".format(ypi[span[0][0]][span[0][1]] * yp2i[span[1][0]][span[1][1]-1])
92 |
93 | row = {
94 | 'id': id_,
95 | 'title': "Hello world!",
96 | 'ques': ques,
97 | 'para': para,
98 | 'y': yi[0][0],
99 | 'y2': yi[0][1],
100 | 'yp': wypi if args.wy else ypi,
101 | 'yp2': wypi if args.wy else yp2i,
102 | 'a': answers,
103 | 'ap': ap,
104 | 'score': score
105 | }
106 | rows.append(row)
107 |
108 | if i % num_per_page == 0:
109 | html_path = os.path.join(html_dir, "%s.html" % str(i).zfill(8))
110 |
111 | if (i + 1) % num_per_page == 0 or (i + 1) == len(eval_['y']):
112 | var_dict = {'title': "Accuracy Visualization",
113 | 'rows': rows
114 | }
115 | with open(html_path, "wb") as f:
116 | f.write(template.render(**var_dict).encode('UTF-8'))
117 | rows = []
118 |
119 | os.chdir(html_dir)
120 | port = args.port
121 | host = args.host
122 | # Overriding to suppress log message
123 | class MyHandler(http.server.SimpleHTTPRequestHandler):
124 | def log_message(self, format, *args):
125 | pass
126 | handler = MyHandler
127 | httpd = socketserver.TCPServer((host, port), handler)
128 | if args.open == 'True':
129 | os.system("open http://%s:%d" % (args.host, args.port))
130 | print("serving at %s:%d" % (host, port))
131 | httpd.serve_forever()
132 |
133 |
134 | def get_segment(para, span):
135 | return " ".join(para[span[0][0]][span[0][1]:span[1][1]])
136 |
137 |
138 | if __name__ == "__main__":
139 | ARGS = get_args()
140 | accuracy2_visualizer(ARGS)
--------------------------------------------------------------------------------
/pytorch/CIFAR10/benchmark/cifar10/infer.py:
--------------------------------------------------------------------------------
1 | import os
2 | import timeit
3 | from glob import glob
4 | from collections import OrderedDict
5 |
6 | import click
7 | import torch
8 | import numpy as np
9 | from torch.autograd import Variable
10 | from torchvision import transforms
11 | from torchvision import datasets
12 |
13 | from benchmark.utils import save_result
14 | from benchmark.cifar10.train import MEAN, STD, MODELS
15 |
16 |
17 | class PyTorchEngine:
18 | def __init__(self, path, arch, use_cuda=False):
19 | self.path = path
20 | self.use_cuda = use_cuda
21 | self.arch = arch
22 | model = MODELS[self.arch]()
23 | restored_state = torch.load(path)
24 |         model.load_state_dict(restored_state['model'])  # load_state_dict mutates the model in place
25 | accuracy = restored_state['accuracy']
26 | epoch = restored_state['epoch'] + 1
27 |
28 | if self.use_cuda:
29 | self.model = model.cuda()
30 | else:
31 | self.model = model.cpu()
32 | self.epoch = epoch
33 | self.accuracy = accuracy
34 |
35 | def pred(self, inputs):
36 | inputs = Variable(inputs, requires_grad=False, volatile=True)
37 |
38 | if self.use_cuda:
39 | inputs = inputs.cuda()
40 | return self.model(inputs).data.cpu().numpy()
41 | else:
42 | return self.model(inputs).data.numpy()
43 |
44 |
45 | def time_batch_size(dataset, batch_size, pred, use_cuda, repeat=100, bestof=3):
46 | loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size,
47 | shuffle=False, pin_memory=use_cuda)
48 |     inputs, targets = next(iter(loader))
49 | assert inputs.size(0) == batch_size
50 |
51 | times = timeit.repeat('pred(inputs)', globals=locals(),
52 | repeat=repeat, number=1)
53 |
54 | return times
55 |
56 |
57 | def infer_cifar10(dataset, engine, start=1, end=128, repeat=100, log2=True,
58 | output=None):
59 | if log2:
60 | start = int(np.floor(np.log2(start)))
61 | end = int(np.ceil(np.log2(end)))
62 | assert start >= 0
63 | assert end >= start
64 | batch_sizes = map(lambda x: 2**x, range(start, end + 1))
65 | else:
66 | batch_sizes = range(start, end + 1)
67 | results = []
68 | for batch_size in batch_sizes:
69 | times = time_batch_size(dataset, batch_size, engine.pred,
70 | engine.use_cuda, repeat=repeat)
71 |
72 | result = OrderedDict()
73 | result['nodename'] = os.uname().nodename
74 | result['model'] = engine.arch
75 | result['use_cuda'] = engine.use_cuda
76 | result['batch_size'] = batch_size
77 | result['mean'] = np.mean(times)
78 | result['std'] = np.std(times)
79 | result['throughput'] = batch_size / np.mean(times)
80 | result['path'] = engine.path
81 | if output is not None:
82 | save_result(result, output)
83 |
84 | print('batch_size: {batch_size:4d}'
85 | ' - mean: {mean:.4f}'
86 | ' - std: {std:.4f}'
87 | ' - throughput: {throughput:.4f}'.format(**result))
88 | results.append(result)
89 |
90 | return results
91 |
92 |
93 | @click.command()
94 | @click.option('--dataset-dir', default='./data/cifar10')
95 | @click.option('--run-dir', default='./run/')
96 | @click.option('--output-file', default='inference.csv')
97 | @click.option('--start', '-s', default=1)
98 | @click.option('--end', '-e', default=128)
99 | @click.option('--repeat', '-r', default=100)
100 | @click.option('--log2/--no-log2', default=True)
101 | @click.option('--cpu/--no-cpu', default=True)
102 | @click.option('--gpu/--no-gpu', default=True)
103 | @click.option('--append', is_flag=True)
104 | @click.option('--models', '-m', type=click.Choice(MODELS.keys()),
105 | multiple=True)
106 | def infer(dataset_dir, run_dir, output_file, start, end, repeat, log2,
107 | cpu, gpu, append, models):
108 |
109 | transform_test = transforms.Compose([
110 | transforms.ToTensor(),
111 | transforms.Normalize(MEAN, STD)
112 | ])
113 |
114 | testset = datasets.CIFAR10(root=dataset_dir, train=False, download=True,
115 | transform=transform_test)
116 | models = models or os.listdir(run_dir)
117 | output_path = os.path.join(run_dir, output_file)
118 | assert not os.path.exists(output_path) or append
119 | for model in models:
120 | model_dir = os.path.join(run_dir, model)
121 | paths = glob(f"{model_dir}/*/checkpoint_best_model.t7")
122 | assert len(paths) > 0
123 | path = os.path.abspath(paths[0])
124 |
125 | print(f'Model: {model}')
126 | print(f'Path: {path}')
127 |
128 | if cpu:
129 | print('With CPU:')
130 | engine = PyTorchEngine(path, use_cuda=False, arch=model)
131 | infer_cifar10(testset, engine, start=start, end=end, log2=log2,
132 | repeat=repeat, output=output_path)
133 |
134 | if gpu and torch.cuda.is_available():
135 | print('With GPU:')
136 | engine = PyTorchEngine(path, use_cuda=True, arch=model)
137 | # Warmup
138 | time_batch_size(testset, 1, engine.pred, engine.use_cuda, repeat=1)
139 |
140 | infer_cifar10(testset, engine, start=start, end=end, log2=log2,
141 | repeat=repeat, output=output_path)
142 |
143 |
144 | if __name__ == '__main__':
145 | infer()
146 |
--------------------------------------------------------------------------------
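
The timing loop in `time_batch_size` is just `timeit.repeat` over a single fixed batch. The same pattern can be reproduced standalone with a toy model and dataset (no checkpoints needed) to see exactly what the inference benchmark measures:

```python
import timeit

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

# Toy CIFAR-shaped data and a tiny stand-in model (illustrative only).
dataset = TensorDataset(torch.randn(512, 3, 32, 32), torch.zeros(512, dtype=torch.long))
model = nn.Sequential(nn.Conv2d(3, 8, 3, padding=1), nn.AdaptiveAvgPool2d(1),
                      nn.Flatten(), nn.Linear(8, 10))
model.eval()

batch_size = 64
inputs, _ = next(iter(DataLoader(dataset, batch_size=batch_size, shuffle=False)))

with torch.no_grad():
    times = timeit.repeat(lambda: model(inputs), repeat=10, number=1)

mean = sum(times) / len(times)
print('mean {:.4f}s  throughput {:.1f} img/s'.format(mean, batch_size / mean))
```
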
/pytorch/CIFAR10/benchmark/cifar10/results.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 |
4 | import pandas as pd
5 |
6 | from benchmark.cifar10.train import MODELS
7 | from benchmark.utils import count_parameters
8 |
9 |
10 | MODEL_SIZES = {key: count_parameters(MODELS[key]()) for key in MODELS.keys()}
11 |
12 |
13 | def single_run_acc(df):
14 | df = df.copy()
15 | df['duration'] = (df['timestamp'] - df['prev_timestamp']).apply(lambda x: x.total_seconds())
16 | df['batch_duration'] = df['batch_duration'].apply(lambda x: x.total_seconds())
17 |
18 | tmp = df.loc[:, ['epoch', 'batch_size', 'ncorrect', 'duration', 'batch_duration']].groupby('epoch').sum()
19 | tmp['accuracy'] = tmp['ncorrect'] / tmp['batch_size']
20 | tmp['throughput'] = tmp['batch_size'] / tmp['duration']
21 | tmp['_throughput'] = tmp['batch_size'] / tmp['batch_duration']
22 | tmp['elapsed'] = df.groupby('epoch')['elapsed'].agg('max')
23 | tmp.reset_index(inplace=True)
24 |
25 | return tmp
26 |
27 |
28 | def load_file(file, start_timestamp=None):
29 | df = pd.read_csv(file)
30 | df['timestamp'] = pd.to_datetime(df['timestamp'])
31 | df['batch_duration'] = pd.to_timedelta(df['batch_duration'])
32 | df['ncorrect'] = df['top1_correct']
33 | start_timestamp = start_timestamp or df['timestamp'].iloc[0]
34 | df['elapsed'] = df['timestamp'] - start_timestamp
35 | df['batch_accuracy'] = df['ncorrect'] / df['batch_size']
36 | return df
37 |
38 |
39 | def load_data(directory, verbose=True):
40 | train_file = os.path.join(directory, 'train_results.csv')
41 | train = load_file(train_file)
42 | start_timestamp = train['timestamp'].iloc[0]
43 |
44 | if verbose:
45 | print(train_file)
46 | print("Training results shape: {}".format(train.shape))
47 |
48 | try:
49 | test_file = os.path.join(directory, 'test_results.csv')
50 | test = load_file(test_file, start_timestamp=start_timestamp)
51 | except FileNotFoundError:
52 | test_file = os.path.join(directory, 'valid_results.csv')
53 | test = load_file(test_file, start_timestamp=start_timestamp)
54 |
55 | if verbose:
56 | print(test_file)
57 | print('Test results shape: {}'.format(test.shape))
58 |
59 | train['mode'] = 'train'
60 | test['mode'] = 'test'
61 |
62 | combined = pd.concat([train, test], ignore_index=True).sort_values(by=['timestamp'])
63 | combined['prev_timestamp'] = combined['timestamp'].shift(1)
64 | combined.loc[0, 'prev_timestamp'] = combined.loc[0, 'timestamp'] - combined.loc[0, 'batch_duration']
65 | train = combined[combined['mode'] == 'train'].copy()
66 | test = combined[combined['mode'] == 'test'].copy()
67 |
68 | return single_run_acc(train), single_run_acc(test)
69 |
70 |
71 | def load_multiple(directory, timestamps=None, verbose=False):
72 | timestamps = timestamps or os.listdir(directory)
73 | train_sets = []
74 | test_sets = []
75 | for timestamp in sorted(timestamps):
76 | _dir = os.path.join(directory, timestamp)
77 | train, test = load_data(_dir, verbose=verbose)
78 | if verbose:
79 | print()
80 | train['run'] = _dir
81 | test['run'] = _dir
82 | train['job_start'] = timestamp
83 | test['job_start'] = timestamp
84 | train_sets.append(train)
85 | test_sets.append(test)
86 |
87 | return pd.concat(train_sets), pd.concat(test_sets)
88 |
89 |
90 | def load_multiple_models(directory, verbose=False):
91 | paths = os.listdir(directory)
92 | models = [path for path in paths if path in MODELS]
93 |
94 | train_sets = []
95 | test_sets = []
96 | for model in sorted(models):
97 | if verbose:
98 | print(f"Loading {model}")
99 | _dir = os.path.join(directory, model)
100 | train, test = load_multiple(_dir, verbose=verbose)
101 | train['model'] = model
102 | train['nparameters'] = MODEL_SIZES[model]
103 | test['model'] = model
104 | test['nparameters'] = MODEL_SIZES[model]
105 |
106 | train_sets.append(train)
107 | test_sets.append(test)
108 |
109 | return pd.concat(train_sets), pd.concat(test_sets)
110 |
111 |
112 | def concat_update(existing, other, repeat=False):
113 | for key in other.keys():
114 | if key in existing:
115 | if existing[key] != other[key] or repeat:
116 | current = existing[key]
117 | if isinstance(current, list):
118 | current.append(other[key])
119 | else:
120 | existing[key] = [current, other[key]]
121 | else:
122 | existing[key] = other[key]
123 |
124 |
125 | def run_config(run, repeat=False):
126 | full = {}
127 | configs = (os.path.join(run, entry.name) for entry in os.scandir(run) if 'config' in entry.name)
128 |
129 | for config in sorted(configs):
130 | with open(config) as file:
131 | tmp = json.load(file)
132 |
133 | tmp['path'] = config
134 | concat_update(full, tmp, repeat=repeat)
135 | return full
136 |
137 |
138 | def search_configs(criteria, configs):
139 | matches = []
140 | for run, config in configs.items():
141 | is_match = True
142 | for key, value in criteria.items():
143 | try:
144 | config_value = config[key]
145 | if config_value != value:
146 | is_match = False
147 | except KeyError:
148 | is_match = False
149 |
150 | if is_match:
151 | matches.append(run)
152 |
153 | return matches
154 |
--------------------------------------------------------------------------------
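
`search_configs` does a simple key/value match over the per-run config dictionaries that `run_config` assembles. A toy example with hand-built (hypothetical) run paths and configs, assuming the `benchmark` package is importable:

```python
from benchmark.cifar10.results import search_configs

configs = {
    'run/densenetbc100/2017-01-01': {'model': 'densenetbc100', 'batch_size': 64},
    'run/densenetbc100/2017-01-02': {'model': 'densenetbc100', 'batch_size': 128},
    'run/resnet20/2017-01-01': {'model': 'resnet20', 'batch_size': 64},
}

# Only runs whose config matches every criterion are returned; a missing key counts as a mismatch.
print(search_configs({'model': 'densenetbc100', 'batch_size': 64}, configs))
# ['run/densenetbc100/2017-01-01']
```
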
/tensorflow/SQuAD/basic_cnn/cli.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | import tensorflow as tf
4 |
5 | from basic_cnn.main import main as m
6 |
7 | flags = tf.app.flags
8 |
9 | flags.DEFINE_string("model_name", "basic_cnn", "Model name [basic]")
10 | flags.DEFINE_string("data_dir", "data/cnn", "Data dir [data/cnn]")
11 | flags.DEFINE_string("root_dir", "/Users/minjoons/data/cnn/questions", "root dir [~/data/cnn/questions]")
12 | flags.DEFINE_string("run_id", "0", "Run ID [0]")
13 | flags.DEFINE_string("out_base_dir", "out", "out base dir [out]")
14 |
15 | flags.DEFINE_integer("batch_size", 60, "Batch size [60]")
16 | flags.DEFINE_float("init_lr", 0.5, "Initial learning rate [0.5]")
17 | flags.DEFINE_integer("num_epochs", 50, "Total number of epochs for training [50]")
18 | flags.DEFINE_integer("num_steps", 20000, "Number of steps [20000]")
19 | flags.DEFINE_integer("eval_num_batches", 100, "eval num batches [100]")
20 | flags.DEFINE_integer("load_step", 0, "load step [0]")
21 | flags.DEFINE_integer("early_stop", 4, "early stop [4]")
22 |
23 | flags.DEFINE_string("mode", "test", "train | dev | test | forward [test]")
24 | flags.DEFINE_boolean("load", True, "load saved data? [True]")
25 | flags.DEFINE_boolean("progress", True, "Show progress? [True]")
26 | flags.DEFINE_integer("log_period", 100, "Log period [100]")
27 | flags.DEFINE_integer("eval_period", 1000, "Eval period [1000]")
28 | flags.DEFINE_integer("save_period", 1000, "Save Period [1000]")
29 | flags.DEFINE_float("decay", 0.9, "Exponential moving average decay [0.9]")
30 |
31 | flags.DEFINE_boolean("draft", False, "Draft for quick testing? [False]")
32 |
33 | flags.DEFINE_integer("hidden_size", 100, "Hidden size [100]")
34 | flags.DEFINE_integer("char_out_size", 100, "Char out size [100]")
35 | flags.DEFINE_float("input_keep_prob", 0.8, "Input keep prob [0.8]")
36 | flags.DEFINE_integer("char_emb_size", 8, "Char emb size [8]")
37 | flags.DEFINE_integer("char_filter_height", 5, "Char filter height [5]")
38 | flags.DEFINE_float("wd", 0.0, "Weight decay [0.0]")
39 | flags.DEFINE_bool("lower_word", True, "lower word [True]")
40 | flags.DEFINE_bool("dump_eval", False, "dump eval? [True]")
41 | flags.DEFINE_bool("dump_answer", True, "dump answer? [True]")
42 | flags.DEFINE_string("model", "2", "config 1 |2 [2]")
43 | flags.DEFINE_bool("squash", False, "squash the sentences into one? [False]")
44 | flags.DEFINE_bool("single", False, "supervise only the answer sentence? [False]")
45 |
46 | flags.DEFINE_integer("word_count_th", 10, "word count th [100]")
47 | flags.DEFINE_integer("char_count_th", 50, "char count th [500]")
48 | flags.DEFINE_integer("sent_size_th", 60, "sent size th [64]")
49 | flags.DEFINE_integer("num_sents_th", 200, "num sents th [8]")
50 | flags.DEFINE_integer("ques_size_th", 30, "ques size th [32]")
51 | flags.DEFINE_integer("word_size_th", 16, "word size th [16]")
52 | flags.DEFINE_integer("para_size_th", 256, "para size th [256]")
53 |
54 | flags.DEFINE_bool("swap_memory", True, "swap memory? [True]")
55 | flags.DEFINE_string("data_filter", "max", "max | valid | semi [max]")
56 | flags.DEFINE_bool("finetune", False, "finetune? [False]")
57 | flags.DEFINE_bool("feed_gt", False, "feed gt prev token during training [False]")
58 | flags.DEFINE_bool("feed_hard", False, "feed hard argmax prev token during testing [False]")
59 | flags.DEFINE_bool("use_glove_for_unk", True, "use glove for unk [False]")
60 | flags.DEFINE_bool("known_if_glove", True, "consider as known if present in glove [False]")
61 | flags.DEFINE_bool("eval", True, "eval? [True]")
62 | flags.DEFINE_integer("highway_num_layers", 2, "highway num layers [2]")
63 | flags.DEFINE_bool("use_word_emb", True, "use word embedding? [True]")
64 |
65 | flags.DEFINE_string("forward_name", "single", "Forward name [single]")
66 | flags.DEFINE_string("answer_path", "", "Answer path []")
67 | flags.DEFINE_string("load_path", "", "Load path []")
68 | flags.DEFINE_string("shared_path", "", "Shared path []")
69 | flags.DEFINE_string("device", "/cpu:0", "default device [/cpu:0]")
70 | flags.DEFINE_integer("num_gpus", 1, "num of gpus [1]")
71 |
72 | flags.DEFINE_string("out_channel_dims", "100", "Out channel dims, separated by commas [100]")
73 | flags.DEFINE_string("filter_heights", "5", "Filter heights, separated by commas [5]")
74 |
75 | flags.DEFINE_bool("share_cnn_weights", True, "Share CNN weights [False]")
76 | flags.DEFINE_bool("share_lstm_weights", True, "Share LSTM weights [True]")
77 | flags.DEFINE_bool("two_prepro_layers", False, "Use two layers for preprocessing? [False]")
78 | flags.DEFINE_bool("aug_att", False, "Augment attention layers with more features? [False]")
79 | flags.DEFINE_integer("max_to_keep", 20, "Max recent saves to keep [20]")
80 | flags.DEFINE_bool("vis", False, "output visualization numbers? [False]")
81 | flags.DEFINE_bool("dump_pickle", True, "Dump pickle instead of json? [True]")
82 | flags.DEFINE_float("keep_prob", 1.0, "keep prob [1.0]")
83 | flags.DEFINE_string("prev_mode", "a", "prev mode gy | y | a [a]")
84 | flags.DEFINE_string("logit_func", "tri_linear", "logit func [tri_linear]")
85 | flags.DEFINE_bool("sh", False, "use superhighway [False]")
86 | flags.DEFINE_string("answer_func", "linear", "answer logit func [linear]")
87 | flags.DEFINE_bool("cluster", False, "Cluster data for faster training [False]")
88 | flags.DEFINE_bool("len_opt", False, "Length optimization? [False]")
89 | flags.DEFINE_string("sh_logit_func", "tri_linear", "sh logit func [tri_linear]")
90 | flags.DEFINE_float("filter_ratio", 1.0, "filter ratio [1.0]")
91 | flags.DEFINE_bool("bi", False, "bi-directional attention? [False]")
92 | flags.DEFINE_integer("width", 5, "width around entity [5]")
93 |
94 |
95 | def main(_):
96 | config = flags.FLAGS
97 |
98 | config.out_dir = os.path.join(config.out_base_dir, config.model_name, str(config.run_id).zfill(2))
99 |
100 | m(config)
101 |
102 | if __name__ == "__main__":
103 | tf.app.run()
104 |
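105 | # Usage sketch (flag values are illustrative): once the CNN/DailyMail data has been
106 | # preprocessed into data/cnn (see cnn_dm/prepro.py), this module is run as a script, e.g.
107 | #   python -m basic_cnn.cli --mode train --noload --root_dir $HOME/data/cnn/questions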
--------------------------------------------------------------------------------
/tensorflow/SQuAD/squad/aug_squad.py:
--------------------------------------------------------------------------------
1 | import json
2 | import sys
3 |
4 | from tqdm import tqdm
5 |
6 | from my.corenlp_interface import CoreNLPInterface
7 |
8 | in_path = sys.argv[1]
9 | out_path = sys.argv[2]
10 | url = sys.argv[3]
11 | port = int(sys.argv[4])
12 | data = json.load(open(in_path, 'r'))
13 |
14 | h = CoreNLPInterface(url, port)
15 |
16 |
17 | def find_all(a_str, sub):
18 | start = 0
19 | while True:
20 | start = a_str.find(sub, start)
21 | if start == -1: return
22 | yield start
23 | start += len(sub) # use start += 1 to find overlapping matches
24 |
25 |
26 | def to_hex(s):
27 | return " ".join(map(hex, map(ord, s)))
28 |
29 |
30 | def handle_nobreak(cand, text):
31 | if cand == text:
32 | return cand
33 | if cand.replace(u'\u00A0', ' ') == text:
34 | return cand
35 | elif cand == text.replace(u'\u00A0', ' '):
36 | return text
37 | raise Exception("{} '{}' {} '{}'".format(cand, to_hex(cand), text, to_hex(text)))
38 |
39 |
40 | # resolving unicode complication
41 |
42 | wrong_loc_count = 0
43 | loc_diffs = []
44 |
45 | for article in data['data']:
46 | for para in article['paragraphs']:
47 | para['context'] = para['context'].replace(u'\u000A', '')
48 | para['context'] = para['context'].replace(u'\u00A0', ' ')
49 | context = para['context']
50 | for qa in para['qas']:
51 | for answer in qa['answers']:
52 | answer['text'] = answer['text'].replace(u'\u00A0', ' ')
53 | text = answer['text']
54 | answer_start = answer['answer_start']
55 | if context[answer_start:answer_start + len(text)] == text:
56 | if text.lstrip() == text:
57 | pass
58 | else:
59 | answer_start += len(text) - len(text.lstrip())
60 | answer['answer_start'] = answer_start
61 | text = text.lstrip()
62 | answer['text'] = text
63 | else:
64 | wrong_loc_count += 1
65 | text = text.lstrip()
66 | answer['text'] = text
67 | starts = list(find_all(context, text))
68 | if len(starts) == 1:
69 | answer_start = starts[0]
70 | elif len(starts) > 1:
71 | new_answer_start = min(starts, key=lambda s: abs(s - answer_start))
72 | loc_diffs.append(abs(new_answer_start - answer_start))
73 | answer_start = new_answer_start
74 | else:
75 | raise Exception()
76 | answer['answer_start'] = answer_start
77 |
78 | answer_stop = answer_start + len(text)
79 | answer['answer_stop'] = answer_stop
80 | assert para['context'][answer_start:answer_stop] == answer['text'], "{} {}".format(
81 | para['context'][answer_start:answer_stop], answer['text'])
82 |
83 | print(wrong_loc_count, loc_diffs)
84 |
85 | mismatch_count = 0
86 | dep_fail_count = 0
87 | no_answer_count = 0
88 |
89 | size = sum(len(article['paragraphs']) for article in data['data'])
90 | pbar = tqdm(range(size))
91 |
92 | for ai, article in enumerate(data['data']):
93 | for pi, para in enumerate(article['paragraphs']):
94 | context = para['context']
95 | sents = h.split_doc(context)
96 | words = h.split_sent(context)
97 | sent_starts = []
98 | ref_idx = 0
99 | for sent in sents:
100 | new_idx = context.find(sent, ref_idx)
101 | sent_starts.append(new_idx)
102 | ref_idx = new_idx + len(sent)
103 | para['sents'] = sents
104 | para['words'] = words
105 | para['sent_starts'] = sent_starts
106 |
107 | consts = list(map(h.get_const, sents))
108 | para['consts'] = consts
109 | deps = list(map(h.get_dep, sents))
110 | para['deps'] = deps
111 |
112 | for qa in para['qas']:
113 | question = qa['question']
114 | question_const = h.get_const(question)
115 | qa['const'] = question_const
116 | question_dep = h.get_dep(question)
117 | qa['dep'] = question_dep
118 | qa['words'] = h.split_sent(question)
119 |
120 | for answer in qa['answers']:
121 | answer_start = answer['answer_start']
122 | text = answer['text']
123 | answer_stop = answer_start + len(text)
124 | # answer_words = h.split_sent(text)
125 | word_idxs = []
126 | answer_words = []
127 | for sent_idx, (sent, sent_start, dep) in enumerate(zip(sents, sent_starts, deps)):
128 | if dep is None:
129 | print("dep parse failed at {} {} {}".format(ai, pi, sent_idx))
130 | dep_fail_count += 1
131 | continue
132 | nodes, edges = dep
133 | words = [node[0] for node in nodes]
134 |
135 | for word_idx, (word, _, _, start, _) in enumerate(nodes):
136 | global_start = sent_start + start
137 | global_stop = global_start + len(word)
138 | if answer_start <= global_start < answer_stop or answer_start < global_stop <= answer_stop:
139 | word_idxs.append((sent_idx, word_idx))
140 | answer_words.append(word)
141 | if len(word_idxs) > 0:
142 | answer['answer_word_start'] = word_idxs[0]
143 | answer['answer_word_stop'] = word_idxs[-1][0], word_idxs[-1][1] + 1
144 | if not text.startswith(answer_words[0]):
145 | print("'{}' '{}'".format(text, ' '.join(answer_words)))
146 | mismatch_count += 1
147 | else:
148 | answer['answer_word_start'] = None
149 | answer['answer_word_stop'] = None
150 | no_answer_count += 1
151 | pbar.update(1)
152 | pbar.close()
153 |
154 | print(mismatch_count, dep_fail_count, no_answer_count)
155 |
156 | print("saving...")
157 | json.dump(data, open(out_path, 'w'))
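158 | 
159 | # Usage sketch (host/port are assumptions -- point them at a running CoreNLP server):
160 | #   python -m squad.aug_squad $HOME/data/squad/train-v1.1.json \
161 | #       $HOME/data/squad/train-v1.0-aug.json localhost 9000
162 | # The resulting *-v1.0-aug.json files are what squad/prepro_aug.py expects as input.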
--------------------------------------------------------------------------------
/tensorflow/SQuAD/basic/cli.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | import tensorflow as tf
4 |
5 | from basic.main import main as m
6 |
7 | flags = tf.app.flags
8 |
9 | # Names and directories
10 | flags.DEFINE_string("model_name", "basic", "Model name [basic]")
11 | flags.DEFINE_string("data_dir", "data/squad", "Data dir [data/squad]")
12 | flags.DEFINE_string("run_id", "0", "Run ID [0]")
13 | flags.DEFINE_string("out_base_dir", "out", "out base dir [out]")
14 | flags.DEFINE_string("forward_name", "single", "Forward name [single]")
15 | flags.DEFINE_string("answer_path", "", "Answer path []")
16 | flags.DEFINE_string("eval_path", "", "Eval path []")
17 | flags.DEFINE_string("load_path", "", "Load path []")
18 | flags.DEFINE_string("shared_path", "", "Shared path []")
19 |
20 | # Device placement
21 | flags.DEFINE_string("device", "/cpu:0", "default device for summing gradients. [/cpu:0]")
22 | flags.DEFINE_string("device_type", "gpu", "device for computing gradients (parallelization). cpu | gpu [gpu]")
23 | flags.DEFINE_integer("num_gpus", 1, "num of gpus or cpus for computing gradients [1]")
24 |
25 | # Essential training and test options
26 | flags.DEFINE_string("mode", "test", "trains | test | forward [test]")
27 | flags.DEFINE_boolean("load", True, "load saved data? [True]")
28 | flags.DEFINE_bool("single", False, "supervise only the answer sentence? [False]")
29 | flags.DEFINE_boolean("debug", False, "Debugging mode? [False]")
30 | flags.DEFINE_bool('load_ema', True, "load exponential average of variables when testing? [True]")
31 | flags.DEFINE_bool("eval", True, "eval? [True]")
32 | flags.DEFINE_bool("wy", False, "Use wy for loss / eval? [False]")
33 | flags.DEFINE_bool("na", False, "Enable no answer strategy and learn bias? [False]")
34 | flags.DEFINE_float("th", 0.5, "Threshold [0.5]")
35 |
36 | # Training / test parameters
37 | flags.DEFINE_integer("batch_size", 60, "Batch size [60]")
38 | flags.DEFINE_integer("val_num_batches", 100, "validation num batches [100]")
39 | flags.DEFINE_integer("test_num_batches", 0, "test num batches [0]")
40 | flags.DEFINE_integer("num_epochs", 12, "Total number of epochs for training [12]")
41 | flags.DEFINE_integer("num_steps", 20000, "Number of steps [20000]")
42 | flags.DEFINE_integer("load_step", 0, "load step [0]")
43 | flags.DEFINE_float("init_lr", 0.001, "Initial learning rate [0.001]")
44 | flags.DEFINE_float("input_keep_prob", 0.8, "Input keep prob for the dropout of LSTM weights [0.8]")
45 | flags.DEFINE_float("keep_prob", 0.8, "Keep prob for the dropout of Char-CNN weights [0.8]")
46 | flags.DEFINE_float("wd", 0.0, "L2 weight decay for regularization [0.0]")
47 | flags.DEFINE_integer("hidden_size", 100, "Hidden size [100]")
48 | flags.DEFINE_integer("char_out_size", 100, "char-level word embedding size [100]")
49 | flags.DEFINE_integer("char_emb_size", 8, "Char emb size [8]")
50 | flags.DEFINE_string("out_channel_dims", "100", "Out channel dims of Char-CNN, separated by commas [100]")
51 | flags.DEFINE_string("filter_heights", "5", "Filter heights of Char-CNN, separated by commas [5]")
52 | flags.DEFINE_bool("finetune", False, "Finetune word embeddings? [False]")
53 | flags.DEFINE_bool("highway", True, "Use highway? [True]")
54 | flags.DEFINE_integer("highway_num_layers", 2, "highway num layers [2]")
55 | flags.DEFINE_bool("share_cnn_weights", True, "Share Char-CNN weights [True]")
56 | flags.DEFINE_bool("share_lstm_weights", True, "Share pre-processing (phrase-level) LSTM weights [True]")
57 | flags.DEFINE_float("var_decay", 0.999, "Exponential moving average decay for variables [0.999]")
58 |
59 | # Optimizations
60 | flags.DEFINE_bool("cluster", False, "Cluster data for faster training [False]")
61 | flags.DEFINE_bool("len_opt", False, "Length optimization? [False]")
62 | flags.DEFINE_bool("cpu_opt", False, "CPU optimization? GPU computation can be slower [False]")
63 |
64 | # Logging and saving options
65 | flags.DEFINE_boolean("progress", True, "Show progress? [True]")
66 | flags.DEFINE_integer("log_period", 100, "Log period [100]")
67 | flags.DEFINE_integer("eval_period", 1000, "Eval period [1000]")
68 | flags.DEFINE_integer("save_period", 1000, "Save Period [1000]")
69 | flags.DEFINE_integer("max_to_keep", 20, "Max recent saves to keep [20]")
70 | flags.DEFINE_bool("dump_eval", True, "dump eval? [True]")
71 | flags.DEFINE_bool("dump_answer", True, "dump answer? [True]")
72 | flags.DEFINE_bool("vis", False, "output visualization numbers? [False]")
73 | flags.DEFINE_bool("dump_pickle", True, "Dump pickle instead of json? [True]")
74 | flags.DEFINE_float("decay", 0.9, "Exponential moving average decay for logging values [0.9]")
75 |
76 | # Thresholds for speed and less memory usage
77 | flags.DEFINE_integer("word_count_th", 10, "word count th [100]")
78 | flags.DEFINE_integer("char_count_th", 50, "char count th [500]")
79 | flags.DEFINE_integer("sent_size_th", 400, "sent size th [64]")
80 | flags.DEFINE_integer("num_sents_th", 8, "num sents th [8]")
81 | flags.DEFINE_integer("ques_size_th", 30, "ques size th [32]")
82 | flags.DEFINE_integer("word_size_th", 16, "word size th [16]")
83 | flags.DEFINE_integer("para_size_th", 256, "para size th [256]")
84 |
85 | # Advanced training options
86 | flags.DEFINE_bool("lower_word", True, "lower word [True]")
87 | flags.DEFINE_bool("squash", False, "squash the sentences into one? [False]")
88 | flags.DEFINE_bool("swap_memory", True, "swap memory? [True]")
89 | flags.DEFINE_string("data_filter", "max", "max | valid | semi [max]")
90 | flags.DEFINE_bool("use_glove_for_unk", True, "use glove for unk [False]")
91 | flags.DEFINE_bool("known_if_glove", True, "consider as known if present in glove [False]")
92 | flags.DEFINE_string("logit_func", "tri_linear", "logit func [tri_linear]")
93 | flags.DEFINE_string("answer_func", "linear", "answer logit func [linear]")
94 | flags.DEFINE_string("sh_logit_func", "tri_linear", "sh logit func [tri_linear]")
95 |
96 | # Ablation options
97 | flags.DEFINE_bool("use_char_emb", True, "use char emb? [True]")
98 | flags.DEFINE_bool("use_word_emb", True, "use word embedding? [True]")
99 | flags.DEFINE_bool("q2c_att", True, "question-to-context attention? [True]")
100 | flags.DEFINE_bool("c2q_att", True, "context-to-question attention? [True]")
101 | flags.DEFINE_bool("dynamic_att", False, "Dynamic attention [False]")
102 |
103 |
104 | def main(_):
105 | config = flags.FLAGS
106 |
107 | config.out_dir = os.path.join(config.out_base_dir, config.model_name, str(config.run_id).zfill(2))
108 |
109 | m(config)
110 |
111 | if __name__ == "__main__":
112 | tf.app.run()
113 |
--------------------------------------------------------------------------------
/tensorflow/SQuAD/README.md:
--------------------------------------------------------------------------------
1 | # Bi-directional Attention Flow for Machine Comprehension
2 |
3 | - This is the original implementation of [Bi-directional Attention Flow for Machine Comprehension][paper] (Seo et al., 2016).
4 | - This is the TensorFlow v1.1.0 compatible version. It is not compatible with previously trained models,
5 | so if you want to use them, go to [v0.2.1][v0.2.1].
6 | - The CodaLab worksheet for the [SQuAD Leaderboard][squad] submission is available [here][worksheet].
7 | - Please contact [Minjoon Seo][minjoon] ([@seominjoon][minjoon-github]) for questions and suggestions.
8 |
9 | ## 0. Requirements
10 | #### General
11 | - Python (developed on 3.5.2. Issues have been reported with Python 2!)
12 | - unzip
13 |
14 | #### Python Packages
15 | - tensorflow (deep learning library, verified on 1.1.0)
16 | - nltk (NLP tools, verified on 3.2.1)
17 | - tqdm (progress bar, verified on 4.7.4)
18 | - jinja2 (for visualization; not needed if you only train and test)
19 |
20 | ## 1. Pre-processing
21 | First, prepare the data. Download the SQuAD data, GloVe, and the nltk corpus
22 | (~850 MB; this will download files to `$HOME/data`):
23 | ```
24 | chmod +x download.sh; ./download.sh
25 | ```
26 |
27 | Second, preprocess the Stanford QA dataset (along with the GloVe vectors) and save it in `$PWD/data/squad` (~5 minutes):
28 | ```
29 | python -m squad.prepro
30 | ```
31 |
32 | ## 2. Training
33 | The model was trained on an NVIDIA Titan X (Pascal architecture, 2016).
34 | The model requires at least 12GB of GPU RAM.
35 | If your GPU has less than 12GB of RAM, you can either decrease the batch size (performance might degrade)
36 | or use multiple GPUs (see below).
37 | Training converges at ~18k steps and took ~4s per step (i.e. ~20 hours).
38 |
39 | Before training, it is recommended to first run the following command to verify that everything is okay and memory is sufficient:
40 | ```
41 | python -m basic.cli --mode train --noload --debug
42 | ```
43 |
44 | Then to fully train, run:
45 | ```
46 | python -m basic.cli --mode train --noload
47 | ```
48 |
49 | You can speed up the training process with optimization flags:
50 | ```
51 | python -m basic.cli --mode train --noload --len_opt --cluster
52 | ```
53 | You can still omit them, but training will be much slower.
54 |
55 |
56 | ## 3. Test
57 | To test, run:
58 | ```
59 | python -m basic.cli
60 | ```
61 |
62 | Similarly to training, you can pass the optimization flags to speed up testing (5 minutes on dev data):
63 | ```
64 | python -m basic.cli --len_opt --cluster
65 | ```
66 |
67 | This command loads the most recently saved model during training and begins testing on the test data.
68 | After the process ends, it prints F1 and EM scores, and also outputs a json file (`$PWD/out/basic/00/answer/test-####.json`,
69 | where `####` is the step at which the model was saved).
70 | Note that the printed scores are not official (our scoring scheme is a bit harsher).
71 | To obtain the official number, use the official evaluator (copied in `squad` folder) and the output json file:
72 |
73 | ```
74 | python squad/evaluate-v1.1.py $HOME/data/squad/dev-v1.1.json out/basic/00/answer/test-####.json
75 | ```
76 |
77 | ### 3.1 Loading from pre-trained weights
78 | NOTE: this version is not compatible with the following trained models.
79 | For compatibility, use [v0.2.1][v0.2.1].
80 |
81 | Instead of training the model yourself, you can choose to use pre-trained weights that were used for [SQuAD Leaderboard][squad] submission.
82 | Refer to [this worksheet][worksheet] in CodaLab to reproduce the results.
83 | If you are unfamiliar with CodaLab, follow these simple steps (provided you have met all the prerequisites above):
84 |
85 | 1. Download `save.zip` from the [worksheet][worksheet] and unzip it in the current directory.
86 | 2. Copy `glove.6B.100d.txt` from your glove data folder (`$HOME/data/glove/`) to the current directory.
87 | 3. To reproduce single model:
88 |
89 | ```
90 | basic/run_single.sh $HOME/data/squad/dev-v1.1.json single.json
91 | ```
92 |
93 | This writes the answers to `single.json` in the current directory. You can then use the official evaluator to obtain EM and F1 scores. If you want to run on GPU (~5 mins), change the value of the batch_size flag in the shell file to a higher number (60 for 12GB of GPU RAM).
94 | 4. Similarly, to reproduce ensemble method:
95 |
96 | ```
97 | basic/run_ensemble.sh $HOME/data/squad/dev-v1.1.json ensemble.json
98 | ```
99 | If you want to run on GPU, you should either run the script sequentially by removing '&' in the for loop, or specify a different GPU for each run of the for loop.
100 |
101 | ## Results
102 |
103 | ### Dev Data
104 |
105 | | | EM (%) | F1 (%) |
106 | | -------- |:------:|:------:|
107 | | single | 67.8 | 77.4 |
108 |
109 | ### Dev Data (old)
110 | NOTE: These numbers are from [v0.2.1][v0.2.1].
111 |
112 | | | EM (%) | F1 (%) |
113 | | -------- |:------:|:------:|
114 | | single | 67.7 | 77.3 |
115 | | ensemble | 72.6 | 80.7 |
116 |
117 |
118 | ### Test Data (old)
119 | NOTE: These numbers are from [v0.2.1][v0.2.1].
120 |
121 | | | EM (%) | F1 (%) |
122 | | -------- |:------:|:------:|
123 | | single | 68.0 | 77.3 |
124 | | ensemble | 73.3 | 81.1 |
125 |
126 | Refer to [our paper][paper] for more details.
127 | See [SQuAD Leaderboard][squad] to compare with other models.
128 |
129 |
130 |
141 |
142 |
143 | ## Multi-GPU Training & Testing
144 | Our model supports multi-GPU training.
145 | We follow the parallelization paradigm described in [TensorFlow Tutorial][multi-gpu].
146 | In short, if you want to use a batch size of 60 (the default) but you have 3 GPUs with 4GB of RAM each,
147 | then you initialize each GPU with a batch size of 20 and combine the gradients on the CPU.
148 | This can be easily done by running:
149 | ```
150 | python -m basic.cli --mode train --noload --num_gpus 3 --batch_size 20
151 | ```
152 |
153 | Similarly, you can speed up your testing by:
154 | ```
155 | python -m basic.cli --num_gpus 3 --batch_size 20
156 | ```
157 |
158 |
159 | [multi-gpu]: https://www.tensorflow.org/versions/r0.11/tutorials/deep_cnn/index.html#training-a-model-using-multiple-gpu-cards
160 | [squad]: http://stanford-qa.com
161 | [paper]: https://arxiv.org/abs/1611.01603
162 | [worksheet]: https://worksheets.codalab.org/worksheets/0x37a9b8c44f6845c28866267ef941c89d/
163 | [minjoon]: https://seominjoon.github.io
164 | [minjoon-github]: https://github.com/seominjoon
165 | [v0.2.1]: https://github.com/allenai/bi-att-flow/tree/v0.2.1
166 |
--------------------------------------------------------------------------------
/tensorflow/SQuAD/my/tensorflow/general.py:
--------------------------------------------------------------------------------
1 | from itertools import zip_longest
2 |
3 | import itertools
4 | import tensorflow as tf
5 | from functools import reduce
6 | from operator import mul
7 | import numpy as np
8 |
9 | VERY_BIG_NUMBER = 1e30
10 | VERY_SMALL_NUMBER = 1e-30
11 | VERY_POSITIVE_NUMBER = VERY_BIG_NUMBER
12 | VERY_NEGATIVE_NUMBER = -VERY_BIG_NUMBER
13 |
14 |
15 | def get_initializer(matrix):
16 | def _initializer(shape, dtype=None, partition_info=None, **kwargs): return matrix
17 | return _initializer
18 |
19 |
20 | def variable_on_cpu(name, shape, initializer):
21 | """Helper to create a Variable stored on CPU memory.
22 |
23 | Args:
24 | name: name of the variable
25 | shape: list of ints
26 | initializer: initializer for Variable
27 |
28 | Returns:
29 | Variable Tensor
30 | """
31 | with tf.device('/cpu:0'):
32 | var = tf.get_variable(name, shape, initializer=initializer)
33 | return var
34 |
35 |
36 | def variable_with_weight_decay(name, shape, stddev, wd):
37 | """Helper to create an initialized Variable with weight decay.
38 |
39 | Note that the Variable is initialized with a truncated normal distribution.
40 | A weight decay is added only if one is specified.
41 |
42 | Args:
43 | name: name of the variable
44 | shape: list of ints
45 | stddev: standard deviation of a truncated Gaussian
46 | wd: add L2Loss weight decay multiplied by this float. If None, weight
47 | decay is not added for this Variable.
48 |
49 | Returns:
50 | Variable Tensor
51 | """
52 | var = variable_on_cpu(name, shape,
53 | tf.truncated_normal_initializer(stddev=stddev))
54 | if wd:
55 | weight_decay = tf.multiply(tf.nn.l2_loss(var), wd, name='weight_loss')
56 | tf.add_to_collection('losses', weight_decay)
57 | return var
58 |
59 |
60 | def average_gradients(tower_grads):
61 | """Calculate the average gradient for each shared variable across all towers.
62 |
63 | Note that this function provides a synchronization point across all towers.
64 |
65 | Args:
66 | tower_grads: List of lists of (gradient, variable) tuples. The outer list
67 | is over individual gradients. The inner list is over the gradient
68 | calculation for each tower.
69 | Returns:
70 | List of pairs of (gradient, variable) where the gradient has been averaged
71 | across all towers.
72 | """
73 | average_grads = []
74 | for grad_and_vars in zip(*tower_grads):
75 | # Note that each grad_and_vars looks like the following:
76 | # ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN))
77 | grads = []
78 | for g, var in grad_and_vars:
79 | # Add 0 dimension to the gradients to represent the tower.
80 | assert g is not None, var.name
81 | expanded_g = tf.expand_dims(g, 0)
82 |
83 | # Append on a 'tower' dimension which we will average over below.
84 | grads.append(expanded_g)
85 |
86 | # Average over the 'tower' dimension.
87 | grad = tf.concat(axis=0, values=grads)
88 | grad = tf.reduce_mean(grad, 0)
89 |
90 | # Keep in mind that the Variables are redundant because they are shared
91 | # across towers. So .. we will just return the first tower's pointer to
92 | # the Variable.
93 | v = grad_and_vars[0][1]
94 | grad_and_var = (grad, v)
95 | average_grads.append(grad_and_var)
96 | return average_grads
97 |
98 |
99 | def mask(val, mask, name=None):
100 | if name is None:
101 | name = 'mask'
102 | return tf.multiply(val, tf.cast(mask, 'float'), name=name)
103 |
104 |
105 | def exp_mask(val, mask, name=None):
106 |     """Add a very negative number to the masked-out (False) elements of val.
107 | For example, [-3, -2, 10], [True, True, False] -> [-3, -2, -1e9].
108 | Typically, this effectively masks in exponential space (e.g. softmax)
109 | Args:
110 | val: values to be masked
111 |         mask: boolean mask tensor, same shape as val
112 | name: name for output tensor
113 |
114 | Returns:
115 | Same shape as val, where some elements are very small (exponentially zero)
116 | """
117 | if name is None:
118 | name = "exp_mask"
119 | return tf.add(val, (1 - tf.cast(mask, 'float')) * VERY_NEGATIVE_NUMBER, name=name)
120 |
121 |
122 | def flatten(tensor, keep):
123 | fixed_shape = tensor.get_shape().as_list()
124 | start = len(fixed_shape) - keep
125 | left = reduce(mul, [fixed_shape[i] or tf.shape(tensor)[i] for i in range(start)])
126 | out_shape = [left] + [fixed_shape[i] or tf.shape(tensor)[i] for i in range(start, len(fixed_shape))]
127 | flat = tf.reshape(tensor, out_shape)
128 | return flat
129 |
130 |
131 | def reconstruct(tensor, ref, keep):
132 | ref_shape = ref.get_shape().as_list()
133 | tensor_shape = tensor.get_shape().as_list()
134 | ref_stop = len(ref_shape) - keep
135 | tensor_start = len(tensor_shape) - keep
136 | pre_shape = [ref_shape[i] or tf.shape(ref)[i] for i in range(ref_stop)]
137 | keep_shape = [tensor_shape[i] or tf.shape(tensor)[i] for i in range(tensor_start, len(tensor_shape))]
138 | # pre_shape = [tf.shape(ref)[i] for i in range(len(ref.get_shape().as_list()[:-keep]))]
139 | # keep_shape = tensor.get_shape().as_list()[-keep:]
140 | target_shape = pre_shape + keep_shape
141 | out = tf.reshape(tensor, target_shape)
142 | return out
143 |
144 |
145 | def add_wd(wd, scope=None):
146 | scope = scope or tf.get_variable_scope().name
147 | variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope)
148 | with tf.name_scope("weight_decay"):
149 | for var in variables:
150 | weight_decay = tf.multiply(tf.nn.l2_loss(var), wd, name="{}/wd".format(var.op.name))
151 | tf.add_to_collection('losses', weight_decay)
152 |
153 |
154 | def grouper(iterable, n, fillvalue=None, shorten=False, num_groups=None):
155 | args = [iter(iterable)] * n
156 | out = zip_longest(*args, fillvalue=fillvalue)
157 | out = list(out)
158 | if num_groups is not None:
159 | default = (fillvalue, ) * n
160 | assert isinstance(num_groups, int)
161 | out = list(each for each, _ in zip_longest(out, range(num_groups), fillvalue=default))
162 | if shorten:
163 | assert fillvalue is None
164 | out = (tuple(e for e in each if e is not None) for each in out)
165 | return out
166 |
167 | def padded_reshape(tensor, shape, mode='CONSTANT', name=None):
168 | paddings = [[0, shape[i] - tf.shape(tensor)[i]] for i in range(len(shape))]
169 | return tf.pad(tensor, paddings, mode=mode, name=name)
170 |
171 |
172 | def get_num_params():
173 | num_params = 0
174 | for variable in tf.trainable_variables():
175 | shape = variable.get_shape()
176 | num_params += reduce(mul, [dim.value for dim in shape], 1)
177 | return num_params
178 |
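179 | 
180 | def _example_flatten_reconstruct():
181 |     """Minimal usage sketch (shapes are illustrative, not from the original code):
182 |     apply a rank-2 op to a rank-4 tensor by collapsing the leading dims with
183 |     `flatten` and restoring them with `reconstruct`."""
184 |     x = tf.placeholder('float', [None, 8, 16, 100])  # [batch, sents, words, dim]
185 |     w = tf.get_variable("w_example", shape=[100, 50])
186 |     flat = flatten(x, 1)            # -> [batch * 8 * 16, 100]
187 |     out = tf.matmul(flat, w)        # plain 2-D matmul
188 |     return reconstruct(out, x, 1)   # -> [batch, 8, 16, 50]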
--------------------------------------------------------------------------------
/tensorflow/SQuAD/tree/read_data.py:
--------------------------------------------------------------------------------
1 | import json
2 | import os
3 | import random
4 | import itertools
5 | import math
6 |
7 | import nltk
8 |
9 | from my.nltk_utils import load_compressed_tree
10 | from my.utils import index
11 |
12 |
13 | class DataSet(object):
14 | def __init__(self, data, data_type, shared=None, valid_idxs=None):
15 | total_num_examples = len(next(iter(data.values())))
16 | self.data = data # e.g. {'X': [0, 1, 2], 'Y': [2, 3, 4]}
17 | self.data_type = data_type
18 | self.shared = shared
19 | self.valid_idxs = range(total_num_examples) if valid_idxs is None else valid_idxs
20 | self.num_examples = len(self.valid_idxs)
21 |
22 | def get_batches(self, batch_size, num_batches=None, shuffle=False):
23 | num_batches_per_epoch = int(math.ceil(self.num_examples / batch_size))
24 | if num_batches is None:
25 | num_batches = num_batches_per_epoch
26 | num_epochs = int(math.ceil(num_batches / num_batches_per_epoch))
27 |
28 | idxs = itertools.chain.from_iterable(random.sample(self.valid_idxs, len(self.valid_idxs))
29 | if shuffle else self.valid_idxs
30 | for _ in range(num_epochs))
31 | for _ in range(num_batches):
32 | batch_idxs = tuple(itertools.islice(idxs, batch_size))
33 | batch_data = {}
34 | for key, val in self.data.items():
35 | if key.startswith('*'):
36 | assert self.shared is not None
37 | shared_key = key[1:]
38 | batch_data[shared_key] = [index(self.shared[shared_key], val[idx]) for idx in batch_idxs]
39 | else:
40 | batch_data[key] = list(map(val.__getitem__, batch_idxs))
41 |
42 | batch_ds = DataSet(batch_data, self.data_type, shared=self.shared)
43 | yield batch_idxs, batch_ds
44 |
45 |
46 | class SquadDataSet(DataSet):
47 | def __init__(self, data, data_type, shared=None, valid_idxs=None):
48 | super(SquadDataSet, self).__init__(data, data_type, shared=shared, valid_idxs=valid_idxs)
49 |
50 |
51 | def load_metadata(config, data_type):
52 | metadata_path = os.path.join(config.data_dir, "metadata_{}.json".format(data_type))
53 | with open(metadata_path, 'r') as fh:
54 | metadata = json.load(fh)
55 | for key, val in metadata.items():
56 | config.__setattr__(key, val)
57 | return metadata
58 |
59 |
60 | def read_data(config, data_type, ref, data_filter=None):
61 | data_path = os.path.join(config.data_dir, "data_{}.json".format(data_type))
62 | shared_path = os.path.join(config.data_dir, "shared_{}.json".format(data_type))
63 | with open(data_path, 'r') as fh:
64 | data = json.load(fh)
65 | with open(shared_path, 'r') as fh:
66 | shared = json.load(fh)
67 |
68 | num_examples = len(next(iter(data.values())))
69 | if data_filter is None:
70 | valid_idxs = range(num_examples)
71 | else:
72 | mask = []
73 | keys = data.keys()
74 | values = data.values()
75 | for vals in zip(*values):
76 | each = {key: val for key, val in zip(keys, vals)}
77 | mask.append(data_filter(each, shared))
78 | valid_idxs = [idx for idx in range(len(mask)) if mask[idx]]
79 |
80 | print("Loaded {}/{} examples from {}".format(len(valid_idxs), num_examples, data_type))
81 |
82 | shared_path = os.path.join(config.out_dir, "shared.json")
83 | if not ref:
84 | word_counter = shared['lower_word_counter'] if config.lower_word else shared['word_counter']
85 | char_counter = shared['char_counter']
86 | pos_counter = shared['pos_counter']
87 | shared['word2idx'] = {word: idx + 2 for idx, word in
88 | enumerate(word for word, count in word_counter.items()
89 | if count > config.word_count_th)}
90 | shared['char2idx'] = {char: idx + 2 for idx, char in
91 | enumerate(char for char, count in char_counter.items()
92 | if count > config.char_count_th)}
93 | shared['pos2idx'] = {pos: idx + 2 for idx, pos in enumerate(pos_counter.keys())}
94 | NULL = "-NULL-"
95 | UNK = "-UNK-"
96 | shared['word2idx'][NULL] = 0
97 | shared['word2idx'][UNK] = 1
98 | shared['char2idx'][NULL] = 0
99 | shared['char2idx'][UNK] = 1
100 | shared['pos2idx'][NULL] = 0
101 | shared['pos2idx'][UNK] = 1
102 | json.dump({'word2idx': shared['word2idx'], 'char2idx': shared['char2idx'],
103 | 'pos2idx': shared['pos2idx']}, open(shared_path, 'w'))
104 | else:
105 | new_shared = json.load(open(shared_path, 'r'))
106 | for key, val in new_shared.items():
107 | shared[key] = val
108 |
109 | data_set = DataSet(data, data_type, shared=shared, valid_idxs=valid_idxs)
110 | return data_set
111 |
112 |
113 | def get_squad_data_filter(config):
114 | def data_filter(data_point, shared):
115 | assert shared is not None
116 | rx, rcx, q, cq, y = (data_point[key] for key in ('*x', '*cx', 'q', 'cq', 'y'))
117 | x, cx, stx = shared['x'], shared['cx'], shared['stx']
118 | if len(q) > config.ques_size_th:
119 | return False
120 | xi = x[rx[0]][rx[1]]
121 | if len(xi) > config.num_sents_th:
122 | return False
123 | if any(len(xij) > config.sent_size_th for xij in xi):
124 | return False
125 | stxi = stx[rx[0]][rx[1]]
126 | if any(nltk.tree.Tree.fromstring(s).height() > config.tree_height_th for s in stxi):
127 | return False
128 | return True
129 | return data_filter
130 |
131 |
132 | def update_config(config, data_sets):
133 | config.max_num_sents = 0
134 | config.max_sent_size = 0
135 | config.max_ques_size = 0
136 | config.max_word_size = 0
137 | config.max_tree_height = 0
138 | for data_set in data_sets:
139 | data = data_set.data
140 | shared = data_set.shared
141 | for idx in data_set.valid_idxs:
142 | rx = data['*x'][idx]
143 | q = data['q'][idx]
144 | sents = shared['x'][rx[0]][rx[1]]
145 | trees = map(nltk.tree.Tree.fromstring, shared['stx'][rx[0]][rx[1]])
146 | config.max_tree_height = max(config.max_tree_height, max(tree.height() for tree in trees))
147 | config.max_num_sents = max(config.max_num_sents, len(sents))
148 | config.max_sent_size = max(config.max_sent_size, max(map(len, sents)))
149 | config.max_word_size = max(config.max_word_size, max(len(word) for sent in sents for word in sent))
150 | if len(q) > 0:
151 | config.max_ques_size = max(config.max_ques_size, len(q))
152 | config.max_word_size = max(config.max_word_size, max(len(word) for word in q))
153 |
154 | config.max_word_size = min(config.max_word_size, config.word_size_th)
155 |
156 | config.char_vocab_size = len(data_sets[0].shared['char2idx'])
157 | config.word_emb_size = len(next(iter(data_sets[0].shared['word2vec'].values())))
158 | config.word_vocab_size = len(data_sets[0].shared['word2idx'])
159 | config.pos_vocab_size = len(data_sets[0].shared['pos2idx'])
160 |
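161 | 
162 | def _example_epoch(config):
163 |     """Minimal usage sketch (assumes the preprocessed data_*.json / shared_*.json
164 |     files already exist in config.data_dir and config carries the usual flags):
165 |     iterate one pass of shuffled training batches. Keys prefixed with '*' (e.g.
166 |     '*x') hold [article, paragraph] references that get_batches resolves against
167 |     the shared dict, so large paragraphs are stored only once."""
168 |     train_data = read_data(config, 'train', False, data_filter=get_squad_data_filter(config))
169 |     for batch_idxs, batch in train_data.get_batches(config.batch_size, shuffle=True):
170 |         print(len(batch_idxs), batch.num_examples)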
--------------------------------------------------------------------------------
/tensorflow/SQuAD/tree/main.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import json
3 | import math
4 | import os
5 | import shutil
6 | from pprint import pprint
7 |
8 | import tensorflow as tf
9 | from tqdm import tqdm
10 | import numpy as np
11 |
12 | from tree.evaluator import AccuracyEvaluator2, Evaluator
13 | from tree.graph_handler import GraphHandler
14 | from tree.model import Model
15 | from tree.trainer import Trainer
16 |
17 | from tree.read_data import load_metadata, read_data, get_squad_data_filter, update_config
18 |
19 |
20 | def main(config):
21 | set_dirs(config)
22 | if config.mode == 'train':
23 | _train(config)
24 | elif config.mode == 'test':
25 | _test(config)
26 | elif config.mode == 'forward':
27 | _forward(config)
28 | else:
29 | raise ValueError("invalid value for 'mode': {}".format(config.mode))
30 |
31 |
32 | def _config_draft(config):
33 | if config.draft:
34 | config.num_steps = 10
35 | config.eval_period = 10
36 | config.log_period = 1
37 | config.save_period = 10
38 | config.eval_num_batches = 1
39 |
40 |
41 | def _train(config):
42 | # load_metadata(config, 'train') # this updates the config file according to metadata file
43 |
44 | data_filter = get_squad_data_filter(config)
45 | train_data = read_data(config, 'train', config.load, data_filter=data_filter)
46 | dev_data = read_data(config, 'dev', True, data_filter=data_filter)
47 | update_config(config, [train_data, dev_data])
48 |
49 | _config_draft(config)
50 |
51 | word2vec_dict = train_data.shared['lower_word2vec'] if config.lower_word else train_data.shared['word2vec']
52 | word2idx_dict = train_data.shared['word2idx']
53 | idx2vec_dict = {word2idx_dict[word]: vec for word, vec in word2vec_dict.items() if word in word2idx_dict}
54 | print("{}/{} unique words have corresponding glove vectors.".format(len(idx2vec_dict), len(word2idx_dict)))
55 | emb_mat = np.array([idx2vec_dict[idx] if idx in idx2vec_dict
56 | else np.random.multivariate_normal(np.zeros(config.word_emb_size), np.eye(config.word_emb_size))
57 | for idx in range(config.word_vocab_size)])
58 | config.emb_mat = emb_mat
59 |
60 | # construct model graph and variables (using default graph)
61 | pprint(config.__flags, indent=2)
62 | model = Model(config)
63 | trainer = Trainer(config, model)
64 | evaluator = AccuracyEvaluator2(config, model)
65 | graph_handler = GraphHandler(config) # controls all tensors and variables in the graph, including loading /saving
66 |
67 | # Variables
68 | sess = tf.Session()
69 | graph_handler.initialize(sess)
70 |
71 | # begin training
72 | num_steps = config.num_steps or int(config.num_epochs * train_data.num_examples / config.batch_size)
73 | max_acc = 0
74 | noupdate_count = 0
75 | global_step = 0
76 | for _, batch in tqdm(train_data.get_batches(config.batch_size, num_batches=num_steps, shuffle=True), total=num_steps):
77 | global_step = sess.run(model.global_step) + 1 # +1 because all calculations are done after step
78 | get_summary = global_step % config.log_period == 0
79 | loss, summary, train_op = trainer.step(sess, batch, get_summary=get_summary)
80 | if get_summary:
81 | graph_handler.add_summary(summary, global_step)
82 |
83 | # Occasional evaluation and saving
84 | if global_step % config.save_period == 0:
85 | graph_handler.save(sess, global_step=global_step)
86 | if global_step % config.eval_period == 0:
87 | num_batches = math.ceil(dev_data.num_examples / config.batch_size)
88 | if 0 < config.eval_num_batches < num_batches:
89 | num_batches = config.eval_num_batches
90 | e = evaluator.get_evaluation_from_batches(
91 | sess, tqdm(dev_data.get_batches(config.batch_size, num_batches=num_batches), total=num_batches))
92 | graph_handler.add_summaries(e.summaries, global_step)
93 | if e.acc > max_acc:
94 | max_acc = e.acc
95 | noupdate_count = 0
96 | else:
97 | noupdate_count += 1
98 | if noupdate_count == config.early_stop:
99 | break
100 | if config.dump_eval:
101 | graph_handler.dump_eval(e)
102 | if global_step % config.save_period != 0:
103 | graph_handler.save(sess, global_step=global_step)
104 |
105 |
106 | def _test(config):
107 | test_data = read_data(config, 'test', True)
108 | update_config(config, [test_data])
109 |
110 | _config_draft(config)
111 |
112 | pprint(config.__flags, indent=2)
113 | model = Model(config)
114 | evaluator = AccuracyEvaluator2(config, model)
115 | graph_handler = GraphHandler(config) # controls all tensors and variables in the graph, including loading /saving
116 |
117 | sess = tf.Session()
118 | graph_handler.initialize(sess)
119 |
120 | num_batches = math.ceil(test_data.num_examples / config.batch_size)
121 | if 0 < config.eval_num_batches < num_batches:
122 | num_batches = config.eval_num_batches
123 | e = evaluator.get_evaluation_from_batches(sess, tqdm(test_data.get_batches(config.batch_size, num_batches=num_batches), total=num_batches))
124 | print(e)
125 | if config.dump_eval:
126 | graph_handler.dump_eval(e)
127 |
128 |
129 | def _forward(config):
130 |
131 | forward_data = read_data(config, 'forward', True)
132 |
133 | _config_draft(config)
134 |
135 |     pprint(config.__flags, indent=2)
136 | model = Model(config)
137 | evaluator = Evaluator(config, model)
138 | graph_handler = GraphHandler(config) # controls all tensors and variables in the graph, including loading /saving
139 |
140 | sess = tf.Session()
141 | graph_handler.initialize(sess)
142 |
143 | num_batches = math.ceil(forward_data.num_examples / config.batch_size)
144 | if 0 < config.eval_num_batches < num_batches:
145 | num_batches = config.eval_num_batches
146 | e = evaluator.get_evaluation_from_batches(sess, tqdm(forward_data.get_batches(config.batch_size, num_batches=num_batches), total=num_batches))
147 | print(e)
148 | if config.dump_eval:
149 | graph_handler.dump_eval(e)
150 |
151 |
152 | def set_dirs(config):
153 | # create directories
154 | if not config.load and os.path.exists(config.out_dir):
155 | shutil.rmtree(config.out_dir)
156 |
157 | config.save_dir = os.path.join(config.out_dir, "save")
158 | config.log_dir = os.path.join(config.out_dir, "log")
159 | config.eval_dir = os.path.join(config.out_dir, "eval")
160 |     if not os.path.exists(config.out_dir):
161 |         os.makedirs(config.out_dir)
162 |     # create the save/log/eval sub-directories under out_dir if they do not exist yet
163 |     for d in (config.save_dir, config.log_dir, config.eval_dir):
164 |         if not os.path.exists(d):
165 |             os.mkdir(d)
166 |
167 |
168 | def _get_args():
169 | parser = argparse.ArgumentParser()
170 | parser.add_argument("config_path")
171 | return parser.parse_args()
172 |
173 |
174 | class Config(object):
175 | def __init__(self, **entries):
176 | self.__dict__.update(entries)
177 |
178 |
179 | def _run():
180 | args = _get_args()
181 | with open(args.config_path, 'r') as fh:
182 | config = Config(**json.load(fh))
183 | main(config)
184 |
185 |
186 | if __name__ == "__main__":
187 | _run()
188 |
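189 | # Usage sketch: this module can be run directly with a JSON config file
190 | # (`python -m tree.main path/to/config.json`, where the JSON supplies every attribute
191 | # the code reads, e.g. mode, out_dir, data_dir); the flag-based entry point is assumed
192 | # to be tree/cli.py, mirroring basic/cli.py.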
--------------------------------------------------------------------------------
/tensorflow/SQuAD/cnn_dm/prepro.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import json
3 | import os
4 | # data: q, cq, (dq), (pq), y, *x, *cx
5 | # shared: x, cx, (dx), (px), word_counter, char_counter, word2vec
6 | # no metadata
7 | from collections import Counter
8 |
9 | from tqdm import tqdm
10 |
11 | from my.utils import process_tokens
12 | from squad.utils import get_word_span, process_tokens
13 |
14 |
15 | def bool_(arg):
16 | if arg == 'True':
17 | return True
18 | elif arg == 'False':
19 | return False
20 | raise Exception(arg)
21 |
22 |
23 | def main():
24 | args = get_args()
25 | prepro(args)
26 |
27 |
28 | def get_args():
29 | parser = argparse.ArgumentParser()
30 | home = os.path.expanduser("~")
31 | source_dir = os.path.join(home, "data", "cnn", 'questions')
32 | target_dir = "data/cnn"
33 | glove_dir = os.path.join(home, "data", "glove")
34 | parser.add_argument("--source_dir", default=source_dir)
35 | parser.add_argument("--target_dir", default=target_dir)
36 | parser.add_argument("--glove_dir", default=glove_dir)
37 | parser.add_argument("--glove_corpus", default='6B')
38 | parser.add_argument("--glove_vec_size", default=100, type=int)
39 | parser.add_argument("--debug", default=False, type=bool_)
40 | parser.add_argument("--num_sents_th", default=200, type=int)
41 | parser.add_argument("--ques_size_th", default=30, type=int)
42 | parser.add_argument("--width", default=5, type=int)
43 | # TODO : put more args here
44 | return parser.parse_args()
45 |
46 |
47 | def prepro(args):
48 | prepro_each(args, 'train')
49 | prepro_each(args, 'dev')
50 | prepro_each(args, 'test')
51 |
52 |
53 | def para2sents(para, width):
54 |     """
55 |     Turn para into a double array of words (wordss), where each "sentence" is the
56 |     window of up to `width` word neighbors around each @entity token.
57 |     :param para: paragraph string with entities marked by a leading "@"
58 |     :return: list of word windows, one per entity occurrence
59 |     """
60 | words = para.split(" ")
61 | sents = []
62 | for i, word in enumerate(words):
63 | if word.startswith("@"):
64 | start = max(i - width, 0)
65 | stop = min(i + width + 1, len(words))
66 | sent = words[start:stop]
67 | sents.append(sent)
68 | return sents
69 |
70 |
71 | def get_word2vec(args, word_counter):
72 | glove_path = os.path.join(args.glove_dir, "glove.{}.{}d.txt".format(args.glove_corpus, args.glove_vec_size))
73 | sizes = {'6B': int(4e5), '42B': int(1.9e6), '840B': int(2.2e6), '2B': int(1.2e6)}
74 | total = sizes[args.glove_corpus]
75 | word2vec_dict = {}
76 | with open(glove_path, 'r', encoding='utf-8') as fh:
77 | for line in tqdm(fh, total=total):
78 | array = line.lstrip().rstrip().split(" ")
79 | word = array[0]
80 | vector = list(map(float, array[1:]))
81 | if word in word_counter:
82 | word2vec_dict[word] = vector
83 | elif word.capitalize() in word_counter:
84 | word2vec_dict[word.capitalize()] = vector
85 | elif word.lower() in word_counter:
86 | word2vec_dict[word.lower()] = vector
87 | elif word.upper() in word_counter:
88 | word2vec_dict[word.upper()] = vector
89 |
90 | print("{}/{} of word vocab have corresponding vectors in {}".format(len(word2vec_dict), len(word_counter), glove_path))
91 | return word2vec_dict
92 |
93 |
94 | def prepro_each(args, mode):
95 | source_dir = os.path.join(args.source_dir, mode)
96 | word_counter = Counter()
97 | lower_word_counter = Counter()
98 | ent_counter = Counter()
99 | char_counter = Counter()
100 | max_sent_size = 0
101 | max_word_size = 0
102 | max_ques_size = 0
103 | max_num_sents = 0
104 |
105 | file_names = list(os.listdir(source_dir))
106 | if args.debug:
107 | file_names = file_names[:1000]
108 | lens = []
109 |
110 | out_file_names = []
111 | for file_name in tqdm(file_names, total=len(file_names)):
112 | if file_name.endswith(".question"):
113 | with open(os.path.join(source_dir, file_name), 'r') as fh:
114 | url = fh.readline().strip()
115 | _ = fh.readline()
116 | para = fh.readline().strip()
117 | _ = fh.readline()
118 | ques = fh.readline().strip()
119 | _ = fh.readline()
120 | answer = fh.readline().strip()
121 | _ = fh.readline()
122 | cands = list(line.strip() for line in fh)
123 | cand_ents = list(cand.split(":")[0] for cand in cands)
124 | sents = para2sents(para, args.width)
125 | ques_words = ques.split(" ")
126 |
127 | # Filtering
128 | if len(sents) > args.num_sents_th or len(ques_words) > args.ques_size_th:
129 | continue
130 |
131 | max_sent_size = max(max(map(len, sents)), max_sent_size)
132 | max_ques_size = max(len(ques_words), max_ques_size)
133 | max_word_size = max(max(len(word) for sent in sents for word in sent), max_word_size)
134 | max_num_sents = max(len(sents), max_num_sents)
135 |
136 | for word in ques_words:
137 | if word.startswith("@"):
138 | ent_counter[word] += 1
139 | word_counter[word] += 1
140 | else:
141 | word_counter[word] += 1
142 | lower_word_counter[word.lower()] += 1
143 | for c in word:
144 | char_counter[c] += 1
145 | for sent in sents:
146 | for word in sent:
147 | if word.startswith("@"):
148 | ent_counter[word] += 1
149 | word_counter[word] += 1
150 | else:
151 | word_counter[word] += 1
152 | lower_word_counter[word.lower()] += 1
153 | for c in word:
154 | char_counter[c] += 1
155 |
156 | out_file_names.append(file_name)
157 | lens.append(len(sents))
158 | num_examples = len(out_file_names)
159 |
160 | assert len(out_file_names) == len(lens)
161 | sorted_file_names, lens = zip(*sorted(zip(out_file_names, lens), key=lambda each: each[1]))
162 | assert lens[-1] == max_num_sents
163 |
164 | word2vec_dict = get_word2vec(args, word_counter)
165 |     lower_word2vec_dict = get_word2vec(args, lower_word_counter)
166 |
167 | shared = {'word_counter': word_counter, 'ent_counter': ent_counter, 'char_counter': char_counter,
168 | 'lower_word_counter': lower_word_counter,
169 | 'max_num_sents': max_num_sents, 'max_sent_size': max_sent_size, 'max_word_size': max_word_size,
170 | 'max_ques_size': max_ques_size,
171 |               'word2vec': word2vec_dict, 'lower_word2vec': lower_word2vec_dict, 'sorted': sorted_file_names,
172 | 'num_examples': num_examples}
173 |
174 | print("max num sents: {}".format(max_num_sents))
175 | print("max ques size: {}".format(max_ques_size))
176 |
177 | if not os.path.exists(args.target_dir):
178 | os.makedirs(args.target_dir)
179 | shared_path = os.path.join(args.target_dir, "shared_{}.json".format(mode))
180 | with open(shared_path, 'w') as fh:
181 | json.dump(shared, fh)
182 |
183 |
184 | if __name__ == "__main__":
185 | main()
186 |
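187 | # Worked example of para2sents (width=2; the sentence is illustrative):
188 | #   para2sents("the suspect @entity1 was seen near @entity2 yesterday", 2)
189 | #   -> [['the', 'suspect', '@entity1', 'was', 'seen'],
190 | #       ['seen', 'near', '@entity2', 'yesterday']]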
--------------------------------------------------------------------------------
/tensorflow/SQuAD/squad/prepro_aug.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import json
3 | import os
4 | # data: q, cq, (dq), (pq), y, *x, *cx
5 | # shared: x, cx, (dx), (px), word_counter, char_counter, word2vec
6 | # no metadata
7 | from collections import Counter
8 |
9 | import nltk
10 | from tqdm import tqdm
11 |
12 | from my.nltk_utils import load_compressed_tree
13 |
14 |
15 | def bool_(arg):
16 | if arg == 'True':
17 | return True
18 | elif arg == 'False':
19 | return False
20 |     raise Exception(arg)
21 |
22 |
23 | def main():
24 | args = get_args()
25 | prepro(args)
26 |
27 |
28 | def get_args():
29 | parser = argparse.ArgumentParser()
30 | home = os.path.expanduser("~")
31 | source_dir = os.path.join(home, "data", "squad")
32 | target_dir = "data/squad"
33 | glove_dir = os.path.join(home, "data", "glove")
34 | parser.add_argument("--source_dir", default=source_dir)
35 | parser.add_argument("--target_dir", default=target_dir)
36 | parser.add_argument("--debug", default=False, type=bool_)
37 |     parser.add_argument("--train_ratio", default=0.9, type=float)
38 | parser.add_argument("--glove_corpus", default="6B")
39 | parser.add_argument("--glove_dir", default=glove_dir)
40 | parser.add_argument("--glove_vec_size", default=100, type=int)
41 | parser.add_argument("--full_train", default=False, type=bool_)
42 | # TODO : put more args here
43 | return parser.parse_args()
44 |
45 |
46 | def prepro(args):
47 | if not os.path.exists(args.target_dir):
48 | os.makedirs(args.target_dir)
49 |
50 | if args.full_train:
51 | data_train, shared_train = prepro_each(args, 'train')
52 | data_dev, shared_dev = prepro_each(args, 'dev')
53 | else:
54 | data_train, shared_train = prepro_each(args, 'train', 0.0, args.train_ratio)
55 | data_dev, shared_dev = prepro_each(args, 'train', args.train_ratio, 1.0)
56 | data_test, shared_test = prepro_each(args, 'dev')
57 |
58 | print("saving ...")
59 | save(args, data_train, shared_train, 'train')
60 | save(args, data_dev, shared_dev, 'dev')
61 | save(args, data_test, shared_test, 'test')
62 |
63 |
64 | def save(args, data, shared, data_type):
65 | data_path = os.path.join(args.target_dir, "data_{}.json".format(data_type))
66 | shared_path = os.path.join(args.target_dir, "shared_{}.json".format(data_type))
67 | json.dump(data, open(data_path, 'w'))
68 | json.dump(shared, open(shared_path, 'w'))
69 |
70 |
71 | def get_word2vec(args, word_counter):
72 | glove_path = os.path.join(args.glove_dir, "glove.{}.{}d.txt".format(args.glove_corpus, args.glove_vec_size))
73 | sizes = {'6B': int(4e5), '42B': int(1.9e6), '840B': int(2.2e6), '2B': int(1.2e6)}
74 | total = sizes[args.glove_corpus]
75 | word2vec_dict = {}
76 | with open(glove_path, 'r') as fh:
77 | for line in tqdm(fh, total=total):
78 | array = line.lstrip().rstrip().split(" ")
79 | word = array[0]
80 | vector = list(map(float, array[1:]))
81 | if word in word_counter:
82 | word2vec_dict[word] = vector
83 | elif word.capitalize() in word_counter:
84 | word2vec_dict[word.capitalize()] = vector
85 | elif word.lower() in word_counter:
86 | word2vec_dict[word.lower()] = vector
87 | elif word.upper() in word_counter:
88 | word2vec_dict[word.upper()] = vector
89 |
90 | print("{}/{} of word vocab have corresponding vectors in {}".format(len(word2vec_dict), len(word_counter), glove_path))
91 | return word2vec_dict
92 |
93 |
94 | def prepro_each(args, data_type, start_ratio=0.0, stop_ratio=1.0):
95 | source_path = os.path.join(args.source_dir, "{}-v1.0-aug.json".format(data_type))
96 | source_data = json.load(open(source_path, 'r'))
97 |
98 | q, cq, y, rx, rcx, ids, idxs = [], [], [], [], [], [], []
99 | x, cx, tx, stx = [], [], [], []
100 | answerss = []
101 | word_counter, char_counter, lower_word_counter = Counter(), Counter(), Counter()
102 | pos_counter = Counter()
103 | start_ai = int(round(len(source_data['data']) * start_ratio))
104 | stop_ai = int(round(len(source_data['data']) * stop_ratio))
105 | for ai, article in enumerate(tqdm(source_data['data'][start_ai:stop_ai])):
106 | xp, cxp, txp, stxp = [], [], [], []
107 | x.append(xp)
108 | cx.append(cxp)
109 | tx.append(txp)
110 | stx.append(stxp)
111 | for pi, para in enumerate(article['paragraphs']):
112 | xi = []
113 | for dep in para['deps']:
114 | if dep is None:
115 | xi.append([])
116 | else:
117 | xi.append([node[0] for node in dep[0]])
118 | cxi = [[list(xijk) for xijk in xij] for xij in xi]
119 | xp.append(xi)
120 | cxp.append(cxi)
121 | txp.append(para['consts'])
122 | stxp.append([str(load_compressed_tree(s)) for s in para['consts']])
123 | trees = map(nltk.tree.Tree.fromstring, para['consts'])
124 | for tree in trees:
125 | for subtree in tree.subtrees():
126 | pos_counter[subtree.label()] += 1
127 |
128 | for xij in xi:
129 | for xijk in xij:
130 | word_counter[xijk] += len(para['qas'])
131 | lower_word_counter[xijk.lower()] += len(para['qas'])
132 | for xijkl in xijk:
133 | char_counter[xijkl] += len(para['qas'])
134 |
135 | rxi = [ai, pi]
136 | assert len(x) - 1 == ai
137 | assert len(x[ai]) - 1 == pi
138 | for qa in para['qas']:
139 | dep = qa['dep']
140 | qi = [] if dep is None else [node[0] for node in dep[0]]
141 | cqi = [list(qij) for qij in qi]
142 | yi = []
143 | answers = []
144 | for answer in qa['answers']:
145 | answers.append(answer['text'])
146 | yi0 = answer['answer_word_start'] or [0, 0]
147 | yi1 = answer['answer_word_stop'] or [0, 1]
148 | assert len(xi[yi0[0]]) > yi0[1]
149 | assert len(xi[yi1[0]]) >= yi1[1]
150 | yi.append([yi0, yi1])
151 |
152 | for qij in qi:
153 | word_counter[qij] += 1
154 | lower_word_counter[qij.lower()] += 1
155 | for qijk in qij:
156 | char_counter[qijk] += 1
157 |
158 | q.append(qi)
159 | cq.append(cqi)
160 | y.append(yi)
161 | rx.append(rxi)
162 | rcx.append(rxi)
163 | ids.append(qa['id'])
164 | idxs.append(len(idxs))
165 | answerss.append(answers)
166 |
167 | if args.debug:
168 | break
169 |
170 | word2vec_dict = get_word2vec(args, word_counter)
171 | lower_word2vec_dict = get_word2vec(args, lower_word_counter)
172 |
173 | data = {'q': q, 'cq': cq, 'y': y, '*x': rx, '*cx': rcx, '*tx': rx, '*stx': rx,
174 | 'idxs': idxs, 'ids': ids, 'answerss': answerss}
175 | shared = {'x': x, 'cx': cx, 'tx': tx, 'stx': stx,
176 | 'word_counter': word_counter, 'char_counter': char_counter, 'lower_word_counter': lower_word_counter,
177 | 'word2vec': word2vec_dict, 'lower_word2vec': lower_word2vec_dict, 'pos_counter': pos_counter}
178 |
179 | return data, shared
180 |
181 |
182 | if __name__ == "__main__":
183 | main()
--------------------------------------------------------------------------------
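A note on the output format of the preprocessing above: save() writes a data_*.json / shared_*.json pair per split, and the starred keys in data (`*x`, `*cx`, `*tx`, `*stx`) hold [article_index, paragraph_index] pairs that dereference into the corresponding lists in shared. A minimal sketch of how a consumer could resolve one example, assuming the file names written by save(); the get_example helper below is hypothetical, not part of the repo:

```python
import json

def get_example(data_path="data_dev.json", shared_path="shared_dev.json", k=0):
    """Resolve the k-th question against the shared article/paragraph store."""
    data = json.load(open(data_path, 'r'))
    shared = json.load(open(shared_path, 'r'))
    ai, pi = data['*x'][k]               # article / paragraph indices
    context_sents = shared['x'][ai][pi]  # tokenized sentences of the paragraph
    question = data['q'][k]              # tokenized question
    answers = data['answerss'][k]        # raw answer strings
    spans = data['y'][k]                 # [[sent, start_word], [sent, stop_word]] per answer
    return context_sents, question, answers, spans
```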
/tensorflow/SQuAD/tree/test.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import nltk\n",
12 | "import matplotlib.pyplot as plt\n",
13 | "%matplotlib inline"
14 | ]
15 | },
16 | {
17 | "cell_type": "code",
18 | "execution_count": 10,
19 | "metadata": {
20 | "collapsed": false
21 | },
22 | "outputs": [
23 | {
24 | "name": "stdout",
25 | "output_type": "stream",
26 | "text": [
27 | "(S (PRP I) (VP (VBP am) (NNP Sam)) (. .))\n",
28 | "(PRP I)\n",
29 | "(VP (VBP am) (NNP Sam))\n",
30 | "(VBP am)\n",
31 | "(NNP Sam)\n",
32 | "(. .)\n",
33 | "(S (PRP I) (VP (VBP am) (NNP Sam)) (. .))\n"
34 | ]
35 | }
36 | ],
37 | "source": [
38 | "string = \"(ROOT(S(NP (PRP I))(VP (VBP am)(NP (NNP Sam)))(. .)))\"\n",
39 | "tree = nltk.tree.Tree.fromstring(string)\n",
40 | "\n",
41 | "def load_compressed_tree(s):\n",
42 | "\n",
43 | " def compress_tree(tree):\n",
44 | " if len(tree) == 1:\n",
45 | " if isinstance(tree[0], nltk.tree.Tree):\n",
46 | " return compress_tree(tree[0])\n",
47 | " else:\n",
48 | " return tree\n",
49 | " else:\n",
50 | " for i, t in enumerate(tree):\n",
51 | " tree[i] = compress_tree(t)\n",
52 | " return tree\n",
53 | "\n",
54 | " return compress_tree(nltk.tree.Tree.fromstring(s))\n",
55 | "tree = load_compressed_tree(string)\n",
56 | "for t in tree.subtrees():\n",
57 | " print(t)\n",
58 | " \n",
59 | "print(str(tree))"
60 | ]
61 | },
62 | {
63 | "cell_type": "code",
64 | "execution_count": 3,
65 | "metadata": {
66 | "collapsed": false
67 | },
68 | "outputs": [
69 | {
70 | "name": "stdout",
71 | "output_type": "stream",
72 | "text": [
73 | "(ROOT I am Sam .)\n"
74 | ]
75 | }
76 | ],
77 | "source": [
78 | "print(tree.flatten())"
79 | ]
80 | },
81 | {
82 | "cell_type": "code",
83 | "execution_count": 10,
84 | "metadata": {
85 | "collapsed": false
86 | },
87 | "outputs": [
88 | {
89 | "name": "stdout",
90 | "output_type": "stream",
91 | "text": [
92 | "['ROOT', 'S', 'NP', 'PRP', 'VP', 'VBP', 'NP', 'NNP', '.']\n"
93 | ]
94 | }
95 | ],
96 | "source": [
97 | "print(list(t.label() for t in tree.subtrees()))"
98 | ]
99 | },
100 | {
101 | "cell_type": "code",
102 | "execution_count": 11,
103 | "metadata": {
104 | "collapsed": true
105 | },
106 | "outputs": [],
107 | "source": [
108 | "import json\n",
109 | "d = json.load(open(\"data/squad/shared_dev.json\", 'r'))"
110 | ]
111 | },
112 | {
113 | "cell_type": "code",
114 | "execution_count": 12,
115 | "metadata": {
116 | "collapsed": false
117 | },
118 | "outputs": [
119 | {
120 | "data": {
121 | "text/plain": [
122 | "73"
123 | ]
124 | },
125 | "execution_count": 12,
126 | "metadata": {},
127 | "output_type": "execute_result"
128 | }
129 | ],
130 | "source": [
131 | "len(d['pos_counter'])"
132 | ]
133 | },
134 | {
135 | "cell_type": "code",
136 | "execution_count": 13,
137 | "metadata": {
138 | "collapsed": false
139 | },
140 | "outputs": [
141 | {
142 | "data": {
143 | "text/plain": [
144 | "{'#': 6,\n",
145 | " '$': 80,\n",
146 | " \"''\": 1291,\n",
147 | " ',': 14136,\n",
148 | " '-LRB-': 1926,\n",
149 | " '-RRB-': 1925,\n",
150 | " '.': 9505,\n",
151 | " ':': 1455,\n",
152 | " 'ADJP': 3426,\n",
153 | " 'ADVP': 4936,\n",
154 | " 'CC': 9300,\n",
155 | " 'CD': 6216,\n",
156 | " 'CONJP': 191,\n",
157 | " 'DT': 26286,\n",
158 | " 'EX': 288,\n",
159 | " 'FRAG': 107,\n",
160 | " 'FW': 96,\n",
161 | " 'IN': 32564,\n",
162 | " 'INTJ': 12,\n",
163 | " 'JJ': 21452,\n",
164 | " 'JJR': 563,\n",
165 | " 'JJS': 569,\n",
166 | " 'LS': 7,\n",
167 | " 'LST': 1,\n",
168 | " 'MD': 1051,\n",
169 | " 'NAC': 19,\n",
170 | " 'NN': 34750,\n",
171 | " 'NNP': 28392,\n",
172 | " 'NNPS': 1400,\n",
173 | " 'NNS': 16716,\n",
174 | " 'NP': 91636,\n",
175 | " 'NP-TMP': 236,\n",
176 | " 'NX': 108,\n",
177 | " 'PDT': 89,\n",
178 | " 'POS': 1451,\n",
179 | " 'PP': 33278,\n",
180 | " 'PRN': 2085,\n",
181 | " 'PRP': 2320,\n",
182 | " 'PRP$': 1959,\n",
183 | " 'PRT': 450,\n",
184 | " 'QP': 838,\n",
185 | " 'RB': 7611,\n",
186 | " 'RBR': 301,\n",
187 | " 'RBS': 252,\n",
188 | " 'ROOT': 9587,\n",
189 | " 'RP': 454,\n",
190 | " 'RRC': 19,\n",
191 | " 'S': 21557,\n",
192 | " 'SBAR': 5009,\n",
193 | " 'SBARQ': 6,\n",
194 | " 'SINV': 135,\n",
195 | " 'SQ': 5,\n",
196 | " 'SYM': 17,\n",
197 | " 'TO': 5167,\n",
198 | " 'UCP': 143,\n",
199 | " 'UH': 15,\n",
200 | " 'VB': 4197,\n",
201 | " 'VBD': 8377,\n",
202 | " 'VBG': 3570,\n",
203 | " 'VBN': 7218,\n",
204 | " 'VBP': 2897,\n",
205 | " 'VBZ': 4146,\n",
206 | " 'VP': 33696,\n",
207 | " 'WDT': 1368,\n",
208 | " 'WHADJP': 5,\n",
209 | " 'WHADVP': 439,\n",
210 | " 'WHNP': 1927,\n",
211 | " 'WHPP': 153,\n",
212 | " 'WP': 482,\n",
213 | " 'WP$': 50,\n",
214 | " 'WRB': 442,\n",
215 | " 'X': 23,\n",
216 | " '``': 1269}"
217 | ]
218 | },
219 | "execution_count": 13,
220 | "metadata": {},
221 | "output_type": "execute_result"
222 | }
223 | ],
224 | "source": [
225 | "d['pos_counter']"
226 | ]
227 | },
228 | {
229 | "cell_type": "code",
230 | "execution_count": 3,
231 | "metadata": {
232 | "collapsed": false
233 | },
234 | "outputs": [
235 | {
236 | "name": "stdout",
237 | "output_type": "stream",
238 | "text": [
239 | "[[False False False False]\n",
240 | " [False True False False]\n",
241 | " [False False False False]]\n",
242 | "[[0 2 2 0]\n",
243 | " [2 2 0 2]\n",
244 | " [2 0 0 0]]\n"
245 | ]
246 | }
247 | ],
248 | "source": [
249 | "from my.nltk_utils import tree2matrix, load_compressed_tree, find_max_f1_subtree, set_span\n",
250 | "string = \"(ROOT(S(NP (PRP I))(VP (VBP am)(NP (NNP Sam)))(. .)))\"\n",
251 | "tree = load_compressed_tree(string)\n",
252 | "span = (1, 3)\n",
253 | "set_span(tree)\n",
254 | "subtree = find_max_f1_subtree(tree, span)\n",
255 | "f = lambda t: t == subtree\n",
256 | "g = lambda t: 1 if isinstance(t, str) else 2\n",
257 | "a, b = tree2matrix(tree, f, dtype='bool')\n",
258 | "c, d = tree2matrix(tree, g, dtype='int32')\n",
259 | "print(a)\n",
260 | "print(c)"
261 | ]
262 | },
263 | {
264 | "cell_type": "code",
265 | "execution_count": null,
266 | "metadata": {
267 | "collapsed": true
268 | },
269 | "outputs": [],
270 | "source": []
271 | }
272 | ],
273 | "metadata": {
274 | "kernelspec": {
275 | "display_name": "Python 3",
276 | "language": "python",
277 | "name": "python3"
278 | },
279 | "language_info": {
280 | "codemirror_mode": {
281 | "name": "ipython",
282 | "version": 3
283 | },
284 | "file_extension": ".py",
285 | "mimetype": "text/x-python",
286 | "name": "python",
287 | "nbconvert_exporter": "python",
288 | "pygments_lexer": "ipython3",
289 | "version": "3.5.1"
290 | }
291 | },
292 | "nbformat": 4,
293 | "nbformat_minor": 0
294 | }
295 |
--------------------------------------------------------------------------------
/tensorflow/SQuAD/squad/eda_aug_dev.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import json\n",
12 | "\n",
13 | "aug_data_path = \"/Users/minjoons/data/squad/dev-v1.0-aug.json\"\n",
14 | "aug_data = json.load(open(aug_data_path, 'r'))"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": 17,
20 | "metadata": {
21 | "collapsed": false
22 | },
23 | "outputs": [
24 | {
25 | "name": "stdout",
26 | "output_type": "stream",
27 | "text": [
28 | "(['Denver', 'Broncos'], 'Denver Broncos')\n",
29 | "(['Denver', 'Broncos'], 'Denver Broncos')\n",
30 | "(['Denver', 'Broncos'], 'Denver Broncos ')\n",
31 | "(['Carolina', 'Panthers'], 'Carolina Panthers')\n"
32 | ]
33 | }
34 | ],
35 | "source": [
36 | "def compare_answers():\n",
37 | " for article in aug_data['data']:\n",
38 | " for para in article['paragraphs']:\n",
39 | " deps = para['deps']\n",
40 | " nodess = []\n",
41 | " for dep in deps:\n",
42 | " nodes, edges = dep\n",
43 | " if dep is not None:\n",
44 | " nodess.append(nodes)\n",
45 | " else:\n",
46 | " nodess.append([])\n",
47 | " wordss = [[node[0] for node in nodes] for nodes in nodess]\n",
48 | " for qa in para['qas']:\n",
49 | " for answer in qa['answers']:\n",
50 | " text = answer['text']\n",
51 | " word_start = answer['answer_word_start']\n",
52 | " word_stop = answer['answer_word_stop']\n",
53 | " answer_words = wordss[word_start[0]][word_start[1]:word_stop[1]]\n",
54 | " yield answer_words, text\n",
55 | "\n",
56 | "ca = compare_answers()\n",
57 | "print(next(ca))\n",
58 | "print(next(ca))\n",
59 | "print(next(ca))\n",
60 | "print(next(ca))"
61 | ]
62 | },
63 | {
64 | "cell_type": "code",
65 | "execution_count": 18,
66 | "metadata": {
67 | "collapsed": false
68 | },
69 | "outputs": [
70 | {
71 | "name": "stdout",
72 | "output_type": "stream",
73 | "text": [
74 | "8\n"
75 | ]
76 | }
77 | ],
78 | "source": [
79 | "def counter():\n",
80 | " count = 0\n",
81 | " for article in aug_data['data']:\n",
82 | " for para in article['paragraphs']:\n",
83 | " deps = para['deps']\n",
84 | " nodess = []\n",
85 | " for dep in deps:\n",
86 | " if dep is None:\n",
87 | " count += 1\n",
88 | " print(count)\n",
89 | "counter()\n"
90 | ]
91 | },
92 | {
93 | "cell_type": "code",
94 | "execution_count": 19,
95 | "metadata": {
96 | "collapsed": false
97 | },
98 | "outputs": [
99 | {
100 | "name": "stdout",
101 | "output_type": "stream",
102 | "text": [
103 | "0\n"
104 | ]
105 | }
106 | ],
107 | "source": [
108 | "def bad_node_counter():\n",
109 | " count = 0\n",
110 | " for article in aug_data['data']:\n",
111 | " for para in article['paragraphs']:\n",
112 | " sents = para['sents']\n",
113 | " deps = para['deps']\n",
114 | " nodess = []\n",
115 | " for dep in deps:\n",
116 | " if dep is not None:\n",
117 | " nodes, edges = dep\n",
118 | " for node in nodes:\n",
119 | " if len(node) != 5:\n",
120 | " count += 1\n",
121 | " print(count)\n",
122 | "bad_node_counter() "
123 | ]
124 | },
125 | {
126 | "cell_type": "code",
127 | "execution_count": 20,
128 | "metadata": {
129 | "collapsed": false
130 | },
131 | "outputs": [
132 | {
133 | "name": "stdout",
134 | "output_type": "stream",
135 | "text": [
136 | "7\n"
137 | ]
138 | }
139 | ],
140 | "source": [
141 | "def noanswer_counter():\n",
142 | " count = 0\n",
143 | " for article in aug_data['data']:\n",
144 | " for para in article['paragraphs']:\n",
145 | " deps = para['deps']\n",
146 | " nodess = []\n",
147 | " for dep in deps:\n",
148 | " if dep is not None:\n",
149 | " nodes, edges = dep\n",
150 | " nodess.append(nodes)\n",
151 | " else:\n",
152 | " nodess.append([])\n",
153 | " wordss = [[node[0] for node in nodes] for nodes in nodess]\n",
154 | " for qa in para['qas']:\n",
155 | " for answer in qa['answers']:\n",
156 | " text = answer['text']\n",
157 | " word_start = answer['answer_word_start']\n",
158 | " word_stop = answer['answer_word_stop']\n",
159 | " if word_start is None:\n",
160 | " count += 1\n",
161 | " print(count)\n",
162 | "noanswer_counter()"
163 | ]
164 | },
165 | {
166 | "cell_type": "code",
167 | "execution_count": 22,
168 | "metadata": {
169 | "collapsed": false
170 | },
171 | "outputs": [
172 | {
173 | "name": "stdout",
174 | "output_type": "stream",
175 | "text": [
176 | "10600\n"
177 | ]
178 | }
179 | ],
180 | "source": [
181 | "print(sum(len(para['qas']) for a in aug_data['data'] for para in a['paragraphs']))"
182 | ]
183 | },
184 | {
185 | "cell_type": "code",
186 | "execution_count": 5,
187 | "metadata": {
188 | "collapsed": false
189 | },
190 | "outputs": [
191 | {
192 | "name": "stdout",
193 | "output_type": "stream",
194 | "text": [
195 | "10348\n"
196 | ]
197 | }
198 | ],
199 | "source": [
200 | "import nltk\n",
201 | "\n",
202 | "def _set_span(t, i):\n",
203 | " if isinstance(t[0], str):\n",
204 | " t.span = (i, i+len(t))\n",
205 | " else:\n",
206 | " first = True\n",
207 | " for c in t:\n",
208 | " cur_span = _set_span(c, i)\n",
209 | " i = cur_span[1]\n",
210 | " if first:\n",
211 | " min_ = cur_span[0]\n",
212 | " first = False\n",
213 | " max_ = cur_span[1]\n",
214 | " t.span = (min_, max_)\n",
215 | " return t.span\n",
216 | "\n",
217 | "\n",
218 | "def set_span(t):\n",
219 | " assert isinstance(t, nltk.tree.Tree)\n",
220 | " try:\n",
221 | " return _set_span(t, 0)\n",
222 | " except:\n",
223 | " print(t)\n",
224 | " exit()\n",
225 | "\n",
226 | "def same_span_counter():\n",
227 | " count = 0\n",
228 | " for article in aug_data['data']:\n",
229 | " for para in article['paragraphs']:\n",
230 | " consts = para['consts']\n",
231 | " for const in consts:\n",
232 | " tree = nltk.tree.Tree.fromstring(const)\n",
233 | " set_span(tree)\n",
234 | " if len(list(tree.subtrees())) > len(set(t.span for t in tree.subtrees())):\n",
235 | " count += 1\n",
236 | " print(count)\n",
237 | "same_span_counter()"
238 | ]
239 | },
240 | {
241 | "cell_type": "code",
242 | "execution_count": null,
243 | "metadata": {
244 | "collapsed": true
245 | },
246 | "outputs": [],
247 | "source": []
248 | }
249 | ],
250 | "metadata": {
251 | "kernelspec": {
252 | "display_name": "Python 3",
253 | "language": "python",
254 | "name": "python3"
255 | },
256 | "language_info": {
257 | "codemirror_mode": {
258 | "name": "ipython",
259 | "version": 3
260 | },
261 | "file_extension": ".py",
262 | "mimetype": "text/x-python",
263 | "name": "python",
264 | "nbconvert_exporter": "python",
265 | "pygments_lexer": "ipython3",
266 | "version": "3.5.1"
267 | }
268 | },
269 | "nbformat": 4,
270 | "nbformat_minor": 0
271 | }
272 |
--------------------------------------------------------------------------------
/tensorflow/SQuAD/my/tensorflow/nn.py:
--------------------------------------------------------------------------------
1 | from tensorflow.python.ops.rnn_cell_impl import _linear
2 | from tensorflow.python.util import nest
3 | import tensorflow as tf
4 |
5 | from my.tensorflow import flatten, reconstruct, add_wd, exp_mask
6 |
7 |
8 | def linear(args, output_size, bias, bias_start=0.0, scope=None, squeeze=False, wd=0.0, input_keep_prob=1.0,
9 | is_train=None):
10 | if args is None or (nest.is_sequence(args) and not args):
11 | raise ValueError("`args` must be specified")
12 | if not nest.is_sequence(args):
13 | args = [args]
14 |
15 | flat_args = [flatten(arg, 1) for arg in args]
16 | if input_keep_prob < 1.0:
17 | assert is_train is not None
18 | flat_args = [tf.cond(is_train, lambda: tf.nn.dropout(arg, input_keep_prob), lambda: arg)
19 | for arg in flat_args]
20 | with tf.variable_scope(scope or 'Linear'):
21 | flat_out = _linear(flat_args, output_size, bias, bias_initializer=tf.constant_initializer(bias_start))
22 | out = reconstruct(flat_out, args[0], 1)
23 | if squeeze:
24 | out = tf.squeeze(out, [len(args[0].get_shape().as_list())-1])
25 | if wd:
26 | add_wd(wd)
27 |
28 | return out
29 |
30 |
31 | def dropout(x, keep_prob, is_train, noise_shape=None, seed=None, name=None):
32 | with tf.name_scope(name or "dropout"):
33 | if keep_prob < 1.0:
34 | d = tf.nn.dropout(x, keep_prob, noise_shape=noise_shape, seed=seed)
35 | out = tf.cond(is_train, lambda: d, lambda: x)
36 | return out
37 | return x
38 |
39 |
40 | def softmax(logits, mask=None, scope=None):
41 | with tf.name_scope(scope or "Softmax"):
42 | if mask is not None:
43 | logits = exp_mask(logits, mask)
44 | flat_logits = flatten(logits, 1)
45 | flat_out = tf.nn.softmax(flat_logits)
46 | out = reconstruct(flat_out, logits, 1)
47 |
48 | return out
49 |
50 |
51 | def softsel(target, logits, mask=None, scope=None):
52 | """
53 |
54 | :param target: [ ..., J, d] dtype=float
55 | :param logits: [ ..., J], dtype=float
56 | :param mask: [ ..., J], dtype=bool
57 | :param scope:
58 | :return: [..., d], dtype=float
59 | """
60 | with tf.name_scope(scope or "Softsel"):
61 | a = softmax(logits, mask=mask)
62 | target_rank = len(target.get_shape().as_list())
63 | out = tf.reduce_sum(tf.expand_dims(a, -1) * target, target_rank - 2)
64 | return out
65 |
66 |
67 | def double_linear_logits(args, size, bias, bias_start=0.0, scope=None, mask=None, wd=0.0, input_keep_prob=1.0, is_train=None):
68 | with tf.variable_scope(scope or "Double_Linear_Logits"):
69 | first = tf.tanh(linear(args, size, bias, bias_start=bias_start, scope='first',
70 | wd=wd, input_keep_prob=input_keep_prob, is_train=is_train))
71 | second = linear(first, 1, bias, bias_start=bias_start, squeeze=True, scope='second',
72 | wd=wd, input_keep_prob=input_keep_prob, is_train=is_train)
73 | if mask is not None:
74 | second = exp_mask(second, mask)
75 | return second
76 |
77 |
78 | def linear_logits(args, bias, bias_start=0.0, scope=None, mask=None, wd=0.0, input_keep_prob=1.0, is_train=None):
79 | with tf.variable_scope(scope or "Linear_Logits"):
80 | logits = linear(args, 1, bias, bias_start=bias_start, squeeze=True, scope='first',
81 | wd=wd, input_keep_prob=input_keep_prob, is_train=is_train)
82 | if mask is not None:
83 | logits = exp_mask(logits, mask)
84 | return logits
85 |
86 |
87 | def sum_logits(args, mask=None, name=None):
88 | with tf.name_scope(name or "sum_logits"):
89 | if args is None or (nest.is_sequence(args) and not args):
90 | raise ValueError("`args` must be specified")
91 | if not nest.is_sequence(args):
92 | args = [args]
93 | rank = len(args[0].get_shape())
94 | logits = sum(tf.reduce_sum(arg, rank-1) for arg in args)
95 | if mask is not None:
96 | logits = exp_mask(logits, mask)
97 | return logits
98 |
99 |
100 | def get_logits(args, size, bias, bias_start=0.0, scope=None, mask=None, wd=0.0, input_keep_prob=1.0, is_train=None, func=None):
101 | if func is None:
102 | func = "sum"
103 | if func == 'sum':
104 | return sum_logits(args, mask=mask, name=scope)
105 | elif func == 'linear':
106 | return linear_logits(args, bias, bias_start=bias_start, scope=scope, mask=mask, wd=wd, input_keep_prob=input_keep_prob,
107 | is_train=is_train)
108 | elif func == 'double':
109 | return double_linear_logits(args, size, bias, bias_start=bias_start, scope=scope, mask=mask, wd=wd, input_keep_prob=input_keep_prob,
110 | is_train=is_train)
111 | elif func == 'dot':
112 | assert len(args) == 2
113 | arg = args[0] * args[1]
114 | return sum_logits([arg], mask=mask, name=scope)
115 | elif func == 'mul_linear':
116 | assert len(args) == 2
117 | arg = args[0] * args[1]
118 | return linear_logits([arg], bias, bias_start=bias_start, scope=scope, mask=mask, wd=wd, input_keep_prob=input_keep_prob,
119 | is_train=is_train)
120 | elif func == 'proj':
121 | assert len(args) == 2
122 | d = args[1].get_shape()[-1]
123 | proj = linear([args[0]], d, False, bias_start=bias_start, scope=scope, wd=wd, input_keep_prob=input_keep_prob,
124 | is_train=is_train)
125 | return sum_logits([proj * args[1]], mask=mask)
126 | elif func == 'tri_linear':
127 | assert len(args) == 2
128 | new_arg = args[0] * args[1]
129 | return linear_logits([args[0], args[1], new_arg], bias, bias_start=bias_start, scope=scope, mask=mask, wd=wd, input_keep_prob=input_keep_prob,
130 | is_train=is_train)
131 | else:
132 |         raise ValueError("Unknown logits function: {}".format(func))
133 |
134 |
135 | def highway_layer(arg, bias, bias_start=0.0, scope=None, wd=0.0, input_keep_prob=1.0, is_train=None):
136 | with tf.variable_scope(scope or "highway_layer"):
137 | d = arg.get_shape()[-1]
138 | trans = linear([arg], d, bias, bias_start=bias_start, scope='trans', wd=wd, input_keep_prob=input_keep_prob, is_train=is_train)
139 | trans = tf.nn.relu(trans)
140 | gate = linear([arg], d, bias, bias_start=bias_start, scope='gate', wd=wd, input_keep_prob=input_keep_prob, is_train=is_train)
141 | gate = tf.nn.sigmoid(gate)
142 | out = gate * trans + (1 - gate) * arg
143 | return out
144 |
145 |
146 | def highway_network(arg, num_layers, bias, bias_start=0.0, scope=None, wd=0.0, input_keep_prob=1.0, is_train=None):
147 | with tf.variable_scope(scope or "highway_network"):
148 | prev = arg
149 | cur = None
150 | for layer_idx in range(num_layers):
151 | cur = highway_layer(prev, bias, bias_start=bias_start, scope="layer_{}".format(layer_idx), wd=wd,
152 | input_keep_prob=input_keep_prob, is_train=is_train)
153 | prev = cur
154 | return cur
155 |
156 |
157 | def conv1d(in_, filter_size, height, padding, is_train=None, keep_prob=1.0, scope=None):
158 | with tf.variable_scope(scope or "conv1d"):
159 | num_channels = in_.get_shape()[-1]
160 | filter_ = tf.get_variable("filter", shape=[1, height, num_channels, filter_size], dtype='float')
161 | bias = tf.get_variable("bias", shape=[filter_size], dtype='float')
162 | strides = [1, 1, 1, 1]
163 | if is_train is not None and keep_prob < 1.0:
164 | in_ = dropout(in_, keep_prob, is_train)
165 | xxc = tf.nn.conv2d(in_, filter_, strides, padding) + bias # [N*M, JX, W/filter_stride, d]
166 | out = tf.reduce_max(tf.nn.relu(xxc), 2) # [-1, JX, d]
167 | return out
168 |
169 |
170 | def multi_conv1d(in_, filter_sizes, heights, padding, is_train=None, keep_prob=1.0, scope=None):
171 | with tf.variable_scope(scope or "multi_conv1d"):
172 | assert len(filter_sizes) == len(heights)
173 | outs = []
174 | for filter_size, height in zip(filter_sizes, heights):
175 | if filter_size == 0:
176 | continue
177 | out = conv1d(in_, filter_size, height, padding, is_train=is_train, keep_prob=keep_prob, scope="conv1d_{}".format(height))
178 | outs.append(out)
179 | concat_out = tf.concat(axis=2, values=outs)
180 | return concat_out
181 |
--------------------------------------------------------------------------------
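The helpers in nn.py above target the TensorFlow 1.x graph API (note the private _linear import and the use of tf.variable_scope/tf.cond). A minimal usage sketch of the character-CNN-plus-highway pattern these functions support, assuming TF 1.x and hypothetical shapes and hyperparameters:

```python
import tensorflow as tf
from my.tensorflow.nn import multi_conv1d, highway_network

# Hypothetical shapes: variable batch size, 30 words per sentence,
# 16 characters per word, 8-dimensional character embeddings.
is_train = tf.placeholder(tf.bool, [], name='is_train')
char_emb = tf.placeholder(tf.float32, [None, 30, 16, 8], name='char_emb')

# One bank of 100 filters with window height 5 over the character axis;
# multi_conv1d skips any entry whose filter_size is 0.
char_repr = multi_conv1d(char_emb, [100], [5], "VALID",
                         is_train=is_train, keep_prob=0.8)      # [batch, 30, 100]

# Two highway layers over the resulting word-level representations.
out = highway_network(char_repr, 2, True, wd=0.0,
                      input_keep_prob=0.8, is_train=is_train)   # [batch, 30, 100]
```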
/tensorflow/SQuAD/tree/evaluator.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import tensorflow as tf
3 |
4 | from tree.read_data import DataSet
5 | from my.nltk_utils import span_f1
6 |
7 |
8 | class Evaluation(object):
9 | def __init__(self, data_type, global_step, idxs, yp):
10 | self.data_type = data_type
11 | self.global_step = global_step
12 | self.idxs = idxs
13 | self.yp = yp
14 | self.num_examples = len(yp)
15 | self.dict = {'data_type': data_type,
16 | 'global_step': global_step,
17 | 'yp': yp,
18 | 'idxs': idxs,
19 | 'num_examples': self.num_examples}
20 | self.summaries = None
21 |
22 | def __repr__(self):
23 | return "{} step {}".format(self.data_type, self.global_step)
24 |
25 | def __add__(self, other):
26 | if other == 0:
27 | return self
28 | assert self.data_type == other.data_type
29 | assert self.global_step == other.global_step
30 | new_yp = self.yp + other.yp
31 | new_idxs = self.idxs + other.idxs
32 | return Evaluation(self.data_type, self.global_step, new_idxs, new_yp)
33 |
34 | def __radd__(self, other):
35 | return self.__add__(other)
36 |
37 |
38 | class LabeledEvaluation(Evaluation):
39 | def __init__(self, data_type, global_step, idxs, yp, y):
40 | super(LabeledEvaluation, self).__init__(data_type, global_step, idxs, yp)
41 | self.y = y
42 | self.dict['y'] = y
43 |
44 | def __add__(self, other):
45 | if other == 0:
46 | return self
47 | assert self.data_type == other.data_type
48 | assert self.global_step == other.global_step
49 | new_yp = self.yp + other.yp
50 | new_y = self.y + other.y
51 | new_idxs = self.idxs + other.idxs
52 | return LabeledEvaluation(self.data_type, self.global_step, new_idxs, new_yp, new_y)
53 |
54 |
55 | class AccuracyEvaluation(LabeledEvaluation):
56 | def __init__(self, data_type, global_step, idxs, yp, y, correct, loss):
57 | super(AccuracyEvaluation, self).__init__(data_type, global_step, idxs, yp, y)
58 | self.loss = loss
59 | self.correct = correct
60 | self.acc = sum(correct) / len(correct)
61 | self.dict['loss'] = loss
62 | self.dict['correct'] = correct
63 | self.dict['acc'] = self.acc
64 | loss_summary = tf.Summary(value=[tf.Summary.Value(tag='dev/loss', simple_value=self.loss)])
65 | acc_summary = tf.Summary(value=[tf.Summary.Value(tag='dev/acc', simple_value=self.acc)])
66 | self.summaries = [loss_summary, acc_summary]
67 |
68 | def __repr__(self):
69 | return "{} step {}: accuracy={}, loss={}".format(self.data_type, self.global_step, self.acc, self.loss)
70 |
71 | def __add__(self, other):
72 | if other == 0:
73 | return self
74 | assert self.data_type == other.data_type
75 | assert self.global_step == other.global_step
76 | new_idxs = self.idxs + other.idxs
77 | new_yp = self.yp + other.yp
78 | new_y = self.y + other.y
79 | new_correct = self.correct + other.correct
80 | new_loss = (self.loss * self.num_examples + other.loss * other.num_examples) / len(new_correct)
81 | return AccuracyEvaluation(self.data_type, self.global_step, new_idxs, new_yp, new_y, new_correct, new_loss)
82 |
83 |
84 | class Evaluator(object):
85 | def __init__(self, config, model):
86 | self.config = config
87 | self.model = model
88 |
89 | def get_evaluation(self, sess, batch):
90 | idxs, data_set = batch
91 | feed_dict = self.model.get_feed_dict(data_set, False, supervised=False)
92 | global_step, yp = sess.run([self.model.global_step, self.model.yp], feed_dict=feed_dict)
93 | yp = yp[:data_set.num_examples]
94 | e = Evaluation(data_set.data_type, int(global_step), idxs, yp.tolist())
95 | return e
96 |
97 | def get_evaluation_from_batches(self, sess, batches):
98 | e = sum(self.get_evaluation(sess, batch) for batch in batches)
99 | return e
100 |
101 |
102 | class LabeledEvaluator(Evaluator):
103 | def get_evaluation(self, sess, batch):
104 | idxs, data_set = batch
105 | feed_dict = self.model.get_feed_dict(data_set, False, supervised=False)
106 | global_step, yp = sess.run([self.model.global_step, self.model.yp], feed_dict=feed_dict)
107 | yp = yp[:data_set.num_examples]
108 | y = feed_dict[self.model.y]
109 | e = LabeledEvaluation(data_set.data_type, int(global_step), idxs, yp.tolist(), y.tolist())
110 | return e
111 |
112 |
113 | class AccuracyEvaluator(LabeledEvaluator):
114 | def get_evaluation(self, sess, batch):
115 | idxs, data_set = batch
116 | assert isinstance(data_set, DataSet)
117 | feed_dict = self.model.get_feed_dict(data_set, False)
118 | global_step, yp, loss = sess.run([self.model.global_step, self.model.yp, self.model.loss], feed_dict=feed_dict)
119 | y = feed_dict[self.model.y]
120 | yp = yp[:data_set.num_examples]
121 | correct = [self.__class__.compare(yi, ypi) for yi, ypi in zip(y, yp)]
122 | e = AccuracyEvaluation(data_set.data_type, int(global_step), idxs, yp.tolist(), y.tolist(), correct, float(loss))
123 | return e
124 |
125 | @staticmethod
126 | def compare(yi, ypi):
127 | return int(np.argmax(yi)) == int(np.argmax(ypi))
128 |
129 |
130 | class AccuracyEvaluator2(AccuracyEvaluator):
131 | @staticmethod
132 | def compare(yi, ypi):
133 | i = int(np.argmax(yi.flatten()))
134 | j = int(np.argmax(ypi.flatten()))
135 | # print(i, j, i == j)
136 | return i == j
137 |
138 |
139 | class TempEvaluation(AccuracyEvaluation):
140 | def __init__(self, data_type, global_step, idxs, yp, yp2, y, y2, correct, loss, f1s):
141 | super(TempEvaluation, self).__init__(data_type, global_step, idxs, yp, y, correct, loss)
142 | self.y2 = y2
143 | self.yp2 = yp2
144 | self.f1s = f1s
145 | self.f1 = float(np.mean(f1s))
146 | self.dict['y2'] = y2
147 | self.dict['yp2'] = yp2
148 | self.dict['f1s'] = f1s
149 | self.dict['f1'] = self.f1
150 | f1_summary = tf.Summary(value=[tf.Summary.Value(tag='dev/f1', simple_value=self.f1)])
151 | self.summaries.append(f1_summary)
152 |
153 | def __add__(self, other):
154 | if other == 0:
155 | return self
156 | assert self.data_type == other.data_type
157 | assert self.global_step == other.global_step
158 | new_idxs = self.idxs + other.idxs
159 | new_yp = self.yp + other.yp
160 | new_yp2 = self.yp2 + other.yp2
161 | new_y = self.y + other.y
162 | new_y2 = self.y2 + other.y2
163 | new_correct = self.correct + other.correct
164 | new_f1s = self.f1s + other.f1s
165 | new_loss = (self.loss * self.num_examples + other.loss * other.num_examples) / len(new_correct)
166 | return TempEvaluation(self.data_type, self.global_step, new_idxs, new_yp, new_yp2, new_y, new_y2, new_correct, new_loss, new_f1s)
167 |
168 |
169 | class TempEvaluator(LabeledEvaluator):
170 | def get_evaluation(self, sess, batch):
171 | idxs, data_set = batch
172 | assert isinstance(data_set, DataSet)
173 | feed_dict = self.model.get_feed_dict(data_set, False)
174 | global_step, yp, yp2, loss = sess.run([self.model.global_step, self.model.yp, self.model.yp2, self.model.loss], feed_dict=feed_dict)
175 | y, y2 = feed_dict[self.model.y], feed_dict[self.model.y2]
176 | yp, yp2 = yp[:data_set.num_examples], yp2[:data_set.num_examples]
177 | correct = [self.__class__.compare(yi, y2i, ypi, yp2i) for yi, y2i, ypi, yp2i in zip(y, y2, yp, yp2)]
178 | f1s = [self.__class__.span_f1(yi, y2i, ypi, yp2i) for yi, y2i, ypi, yp2i in zip(y, y2, yp, yp2)]
179 | e = TempEvaluation(data_set.data_type, int(global_step), idxs, yp.tolist(), yp2.tolist(), y.tolist(), y2.tolist(), correct, float(loss), f1s)
180 | return e
181 |
182 | @staticmethod
183 | def compare(yi, y2i, ypi, yp2i):
184 | i = int(np.argmax(yi.flatten()))
185 | j = int(np.argmax(ypi.flatten()))
186 | k = int(np.argmax(y2i.flatten()))
187 | l = int(np.argmax(yp2i.flatten()))
188 | # print(i, j, i == j)
189 | return i == j and k == l
190 |
191 | @staticmethod
192 | def span_f1(yi, y2i, ypi, yp2i):
193 | true_span = (np.argmax(yi.flatten()), np.argmax(y2i.flatten())+1)
194 | pred_span = (np.argmax(ypi.flatten()), np.argmax(yp2i.flatten())+1)
195 | f1 = span_f1(true_span, pred_span)
196 | return f1
197 |
198 |
--------------------------------------------------------------------------------
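The evaluation classes above are written so that per-batch results can be folded with the built-in sum(): __add__ treats 0 as the identity and __radd__ delegates to it, which is what get_evaluation_from_batches relies on. A minimal illustration with hand-made values (all numbers hypothetical):

```python
from tree.evaluator import Evaluation

# Two fake per-batch evaluations taken at the same global step.
e1 = Evaluation("dev", 100, [0, 1], [[0.1], [0.9]])
e2 = Evaluation("dev", 100, [2], [[0.5]])

# sum() starts from 0, so Evaluation.__radd__(0) fires first and simply
# returns e1; subsequent additions concatenate idxs and yp.
total = sum([e1, e2])
print(total.num_examples)  # 3
print(total)               # "dev step 100"
```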
/tensorflow/SQuAD/squad/eda_aug_train.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import json\n",
12 | "\n",
13 | "aug_data_path = \"/Users/minjoons/data/squad/train-v1.0-aug.json\"\n",
14 | "aug_data = json.load(open(aug_data_path, 'r'))"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": 2,
20 | "metadata": {
21 | "collapsed": false
22 | },
23 | "outputs": [
24 | {
25 | "name": "stdout",
26 | "output_type": "stream",
27 | "text": [
28 | "(['Saint', 'Bernadette', 'Soubirous'], 'Saint Bernadette Soubirous')\n",
29 | "(['a', 'copper', 'statue', 'of', 'Christ'], 'a copper statue of Christ')\n",
30 | "(['the', 'Main', 'Building'], 'the Main Building')\n",
31 | "(['a', 'Marian', 'place', 'of', 'prayer', 'and', 'reflection'], 'a Marian place of prayer and reflection')\n"
32 | ]
33 | }
34 | ],
35 | "source": [
36 | "def compare_answers():\n",
37 | " for article in aug_data['data']:\n",
38 | " for para in article['paragraphs']:\n",
39 | " deps = para['deps']\n",
40 | " nodess = []\n",
41 | " for dep in deps:\n",
42 | " nodes, edges = dep\n",
43 | " if dep is not None:\n",
44 | " nodess.append(nodes)\n",
45 | " else:\n",
46 | " nodess.append([])\n",
47 | " wordss = [[node[0] for node in nodes] for nodes in nodess]\n",
48 | " for qa in para['qas']:\n",
49 | " for answer in qa['answers']:\n",
50 | " text = answer['text']\n",
51 | " word_start = answer['answer_word_start']\n",
52 | " word_stop = answer['answer_word_stop']\n",
53 | " answer_words = wordss[word_start[0]][word_start[1]:word_stop[1]]\n",
54 | " yield answer_words, text\n",
55 | "\n",
56 | "ca = compare_answers()\n",
57 | "print(next(ca))\n",
58 | "print(next(ca))\n",
59 | "print(next(ca))\n",
60 | "print(next(ca))"
61 | ]
62 | },
63 | {
64 | "cell_type": "code",
65 | "execution_count": 11,
66 | "metadata": {
67 | "collapsed": false
68 | },
69 | "outputs": [
70 | {
71 | "name": "stdout",
72 | "output_type": "stream",
73 | "text": [
74 | "x: .\n",
75 | "x: .\n",
76 | "x: .\n",
77 | "x: .\n",
78 | "x: .\n",
79 | "x: .\n",
80 | "x: .\n",
81 | "x: .\n",
82 | "q: k\n",
83 | "q: j\n",
84 | "q: n\n",
85 | "q: b\n",
86 | "q: v\n",
87 | "x: .\n",
88 | "x: :208\n",
89 | "x: .\n",
90 | "x: .\n",
91 | "x: .\n",
92 | "x: .\n",
93 | "x: .\n",
94 | "x: .\n",
95 | "x: .\n",
96 | "x: .\n",
97 | "x: .\n",
98 | "x: .\n",
99 | "x: .\n",
100 | "q: dd\n",
101 | "q: dd\n",
102 | "q: dd\n",
103 | "q: dd\n",
104 | "q: d\n",
105 | "x: .\n",
106 | "x: .\n",
107 | "x: .\n",
108 | "x: .\n",
109 | "x: .\n",
110 | "x: .\n",
111 | "x: .\n",
112 | "x: .\n",
113 | "x: :411\n",
114 | "x: .\n",
115 | "x: .\n",
116 | "x: .\n",
117 | "x: .\n",
118 | "x: .\n",
119 | "x: .\n",
120 | "x: :40\n",
121 | "x: .\n",
122 | "x: *\n",
123 | "x: :14\n",
124 | "x: .\n",
125 | "x: .\n",
126 | "x: .\n",
127 | "x: :131\n",
128 | "x: .\n",
129 | "x: .\n",
130 | "x: .\n",
131 | "x: .\n",
132 | "x: .\n",
133 | "x: .\n",
134 | "x: .\n",
135 | "x: .\n",
136 | "x: .\n",
137 | "53 10\n"
138 | ]
139 | }
140 | ],
141 | "source": [
142 | "def nodep_counter():\n",
143 | " x_count = 0\n",
144 | " q_count = 0\n",
145 | " for article in aug_data['data']:\n",
146 | " for para in article['paragraphs']:\n",
147 | " deps = para['deps']\n",
148 | " nodess = []\n",
149 | " for sent, dep in zip(para['sents'], deps):\n",
150 | " if dep is None:\n",
151 | " print(\"x:\", sent)\n",
152 | " x_count += 1\n",
153 | " for qa in para['qas']:\n",
154 | " if qa['dep'] is None:\n",
155 | " print(\"q:\", qa['question'])\n",
156 | " q_count += 1\n",
157 | " print(x_count, q_count)\n",
158 | "nodep_counter()\n"
159 | ]
160 | },
161 | {
162 | "cell_type": "code",
163 | "execution_count": 4,
164 | "metadata": {
165 | "collapsed": false
166 | },
167 | "outputs": [
168 | {
169 | "name": "stdout",
170 | "output_type": "stream",
171 | "text": [
172 | "0\n"
173 | ]
174 | }
175 | ],
176 | "source": [
177 | "def bad_node_counter():\n",
178 | " count = 0\n",
179 | " for article in aug_data['data']:\n",
180 | " for para in article['paragraphs']:\n",
181 | " sents = para['sents']\n",
182 | " deps = para['deps']\n",
183 | " nodess = []\n",
184 | " for dep in deps:\n",
185 | " if dep is not None:\n",
186 | " nodes, edges = dep\n",
187 | " for node in nodes:\n",
188 | " if len(node) != 5:\n",
189 | " count += 1\n",
190 | " print(count)\n",
191 | "bad_node_counter() "
192 | ]
193 | },
194 | {
195 | "cell_type": "code",
196 | "execution_count": 5,
197 | "metadata": {
198 | "collapsed": false
199 | },
200 | "outputs": [
201 | {
202 | "name": "stdout",
203 | "output_type": "stream",
204 | "text": [
205 | "36\n"
206 | ]
207 | }
208 | ],
209 | "source": [
210 | "def noanswer_counter():\n",
211 | " count = 0\n",
212 | " for article in aug_data['data']:\n",
213 | " for para in article['paragraphs']:\n",
214 | " deps = para['deps']\n",
215 | " nodess = []\n",
216 | " for dep in deps:\n",
217 | " if dep is not None:\n",
218 | " nodes, edges = dep\n",
219 | " nodess.append(nodes)\n",
220 | " else:\n",
221 | " nodess.append([])\n",
222 | " wordss = [[node[0] for node in nodes] for nodes in nodess]\n",
223 | " for qa in para['qas']:\n",
224 | " for answer in qa['answers']:\n",
225 | " text = answer['text']\n",
226 | " word_start = answer['answer_word_start']\n",
227 | " word_stop = answer['answer_word_stop']\n",
228 | " if word_start is None:\n",
229 | " count += 1\n",
230 | " print(count)\n",
231 | "noanswer_counter()"
232 | ]
233 | },
234 | {
235 | "cell_type": "code",
236 | "execution_count": 14,
237 | "metadata": {
238 | "collapsed": false
239 | },
240 | "outputs": [
241 | {
242 | "name": "stdout",
243 | "output_type": "stream",
244 | "text": [
245 | "106\n"
246 | ]
247 | }
248 | ],
249 | "source": [
250 | "def mult_sent_answer_counter():\n",
251 | " count = 0\n",
252 | " for article in aug_data['data']:\n",
253 | " for para in article['paragraphs']:\n",
254 | " for qa in para['qas']:\n",
255 | " for answer in qa['answers']:\n",
256 | " text = answer['text']\n",
257 | " word_start = answer['answer_word_start']\n",
258 | " word_stop = answer['answer_word_stop']\n",
259 | " if word_start is not None and word_start[0] != word_stop[0]:\n",
260 | " count += 1\n",
261 | " print(count)\n",
262 | "mult_sent_answer_counter()"
263 | ]
264 | },
265 | {
266 | "cell_type": "code",
267 | "execution_count": null,
268 | "metadata": {
269 | "collapsed": true
270 | },
271 | "outputs": [],
272 | "source": []
273 | },
274 | {
275 | "cell_type": "code",
276 | "execution_count": null,
277 | "metadata": {
278 | "collapsed": true
279 | },
280 | "outputs": [],
281 | "source": []
282 | },
283 | {
284 | "cell_type": "code",
285 | "execution_count": null,
286 | "metadata": {
287 | "collapsed": true
288 | },
289 | "outputs": [],
290 | "source": []
291 | }
292 | ],
293 | "metadata": {
294 | "kernelspec": {
295 | "display_name": "Python 3",
296 | "language": "python",
297 | "name": "python3"
298 | },
299 | "language_info": {
300 | "codemirror_mode": {
301 | "name": "ipython",
302 | "version": 3
303 | },
304 | "file_extension": ".py",
305 | "mimetype": "text/x-python",
306 | "name": "python",
307 | "nbconvert_exporter": "python",
308 | "pygments_lexer": "ipython3",
309 | "version": "3.5.1"
310 | }
311 | },
312 | "nbformat": 4,
313 | "nbformat_minor": 0
314 | }
315 |
--------------------------------------------------------------------------------