├── tensorflow
│   ├── SQuAD
│   │   ├── my
│   │   │   ├── __init__.py
│   │   │   ├── tensorflow
│   │   │   │   ├── __init__.py
│   │   │   │   ├── rnn.py
│   │   │   │   ├── general.py
│   │   │   │   └── nn.py
│   │   │   ├── utils.py
│   │   │   ├── corenlp_interface.py
│   │   │   ├── zip_save.py
│   │   │   └── nltk_utils.py
│   │   ├── tree
│   │   │   ├── __init__.py
│   │   │   ├── trainer.py
│   │   │   ├── graph_handler.py
│   │   │   ├── templates
│   │   │   │   └── visualizer.html
│   │   │   ├── cli.py
│   │   │   ├── visualizer.py
│   │   │   ├── read_data.py
│   │   │   ├── main.py
│   │   │   ├── test.ipynb
│   │   │   └── evaluator.py
│   │   ├── basic
│   │   │   ├── __init__.py
│   │   │   ├── run_single.sh
│   │   │   ├── run_ensemble.sh
│   │   │   ├── get_pr.py
│   │   │   ├── ensemble_fast.py
│   │   │   ├── templates
│   │   │   │   └── visualizer.html
│   │   │   ├── trainer.py
│   │   │   ├── graph_handler.py
│   │   │   ├── ensemble.py
│   │   │   ├── visualizer.py
│   │   │   └── cli.py
│   │   ├── basic_cnn
│   │   │   ├── __init__.py
│   │   │   ├── superhighway.py
│   │   │   ├── templates
│   │   │   │   └── visualizer.html
│   │   │   ├── graph_handler.py
│   │   │   ├── trainer.py
│   │   │   ├── visualizer.py
│   │   │   └── cli.py
│   │   ├── cnn_dm
│   │   │   ├── __init__.py
│   │   │   ├── evaluate.py
│   │   │   └── prepro.py
│   │   ├── squad
│   │   │   ├── __init__.py
│   │   │   ├── neg_squad.py
│   │   │   ├── evaluate-v1.1.py
│   │   │   ├── evaluate.py
│   │   │   ├── utils.py
│   │   │   ├── aug_squad.py
│   │   │   ├── prepro_aug.py
│   │   │   ├── eda_aug_dev.ipynb
│   │   │   └── eda_aug_train.ipynb
│   │   ├── requirements.txt
│   │   ├── .gitignore
│   │   ├── run_training.sh
│   │   ├── download.sh
│   │   └── README.md
│   └── CIFAR10
│       ├── README.md
│       ├── time_inference.py
│       ├── eval_checkpoints.py
│       └── resnet
│           ├── README.md
│           └── cifar_input.py
├── pytorch
│   ├── CIFAR10
│   │   ├── benchmark
│   │   │   ├── __init__.py
│   │   │   ├── cifar10
│   │   │   │   ├── __init__.py
│   │   │   │   ├── __main__.py
│   │   │   │   ├── models
│   │   │   │   │   └── densenet.py
│   │   │   │   ├── infer.py
│   │   │   │   └── results.py
│   │   │   ├── imagenet
│   │   │   │   └── __main__.py
│   │   │   └── utils.py
│   │   ├── .gitignore
│   │   ├── setup.py
│   │   └── README.md
│   └── .gitignore
├── .gitignore
└── README.md
/tensorflow/SQuAD/my/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tensorflow/SQuAD/tree/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/pytorch/CIFAR10/benchmark/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tensorflow/SQuAD/basic/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tensorflow/SQuAD/basic_cnn/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tensorflow/SQuAD/cnn_dm/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tensorflow/SQuAD/squad/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/pytorch/CIFAR10/benchmark/cifar10/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tensorflow/SQuAD/requirements.txt:
--------------------------------------------------------------------------------
1 | nltk
2 | tqdm
3 | jinja2
4 |
--------------------------------------------------------------------------------
/tensorflow/SQuAD/.gitignore:
--------------------------------------------------------------------------------
1 | out/
2 | data/
3 | */__pycache__/
4 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | __pycache__/
3 | .eggs/
4 | *.egg-info/
5 | .cache
6 |
--------------------------------------------------------------------------------
/tensorflow/SQuAD/my/tensorflow/__init__.py:
--------------------------------------------------------------------------------
1 | from my.tensorflow.general import *
--------------------------------------------------------------------------------
/pytorch/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | __pycache__/
3 | .eggs/
4 | *.egg-info/
5 | .cache
6 | data/
7 |
--------------------------------------------------------------------------------
/pytorch/CIFAR10/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | __pycache__/
3 | .eggs/
4 | *.egg-info/
5 | .cache
6 | data/
7 |
--------------------------------------------------------------------------------
/tensorflow/SQuAD/run_training.sh:
--------------------------------------------------------------------------------
1 | python3 -m basic.cli --mode train --noload --len_opt --cluster
2 |
--------------------------------------------------------------------------------
/pytorch/CIFAR10/benchmark/imagenet/__main__.py:
--------------------------------------------------------------------------------
1 | import click
2 |
3 | from benchmark.imagenet.train import train
4 |
5 |
6 | @click.group()
7 | def cli():
8 | pass
9 |
10 |
11 | cli.add_command(train, name='train')
12 |
13 | if __name__ == '__main__':
14 | cli()
15 |
--------------------------------------------------------------------------------
/pytorch/CIFAR10/benchmark/cifar10/__main__.py:
--------------------------------------------------------------------------------
1 | import click
2 |
3 | from benchmark.cifar10.train import train
4 | from benchmark.cifar10.infer import infer
5 |
6 |
7 | @click.group()
8 | def cli():
9 | pass
10 |
11 |
12 | cli.add_command(train, name='train')
13 | cli.add_command(infer, name='infer')
14 |
15 |
16 | if __name__ == '__main__':
17 | cli()
18 |
--------------------------------------------------------------------------------
/pytorch/CIFAR10/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup
2 |
3 | setup(
4 | name='benchmark',
5 | version='0.0.0',
6 | url='http://www.codycoleman.com',
7 | author='Cody Austun Coleman',
8 | author_email='cody.coleman@cs.stanford.edu',
9 | packages=['benchmark'],
10 | entry_points={
11 | 'console_scripts': [
12 | 'cifar10 = benchmark.cifar10.__main__:cli',
13 | 'imagenet = benchmark.imagenet.__main__:cli'
14 | ]
15 | },
16 | install_requires=[
17 | 'tqdm',
18 | 'torchvision',
19 | 'click',
20 | ]
21 | )
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # DAWNBench Models
2 |
3 | This repository contains implementations of various models listed on the DAWNBench leaderboard:
4 | - ResNet models for CIFAR10, implemented in TensorFlow, located at
5 | [`tensorflow/CIFAR10`](https://github.com/stanford-futuredata/dawn-bench-models/tree/master/tensorflow/CIFAR10)
6 | - ResNet models for CIFAR10, implemented in PyTorch, located at
7 | [`pytorch/CIFAR10`](https://github.com/stanford-futuredata/dawn-bench-models/tree/master/pytorch/CIFAR10)
8 | - BiDAF model for SQuAD, implemented in TensorFlow, located at
9 | [`tensorflow/SQuAD`](https://github.com/stanford-futuredata/dawn-bench-models/tree/master/tensorflow/SQuAD)
10 |
11 | You can email us at [dawn-benchmark@lists.stanford.edu](mailto:dawn-benchmark@lists.stanford.edu) with any
12 | questions.
13 |
--------------------------------------------------------------------------------
/tensorflow/SQuAD/download.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | DATA_DIR=$HOME/data
4 | mkdir -p "$DATA_DIR"
5 |
6 | # Download SQuAD
7 | SQUAD_DIR=$DATA_DIR/squad
8 | mkdir -p "$SQUAD_DIR"
9 | wget https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json -O $SQUAD_DIR/train-v1.1.json
10 | wget https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json -O $SQUAD_DIR/dev-v1.1.json
11 |
12 |
13 | # Download CNN and DailyMail
14 | # Download at: http://cs.nyu.edu/~kcho/DMQA/
15 |
16 |
17 | # Download GloVe
18 | GLOVE_DIR=$DATA_DIR/glove
19 | mkdir -p "$GLOVE_DIR"
20 | wget http://nlp.stanford.edu/data/glove.6B.zip -O $GLOVE_DIR/glove.6B.zip
21 | unzip $GLOVE_DIR/glove.6B.zip -d $GLOVE_DIR
22 |
23 | # Download NLTK (for tokenizer)
24 | # Make sure that nltk is installed!
25 | python3 -m nltk.downloader -d $HOME/nltk_data punkt
26 |
--------------------------------------------------------------------------------
/tensorflow/CIFAR10/README.md:
--------------------------------------------------------------------------------
1 | # ResNets on TensorFlow
2 |
3 | To train a ResNet, run:
4 |
5 | ```bash
6 | python3 resnet/resnet_main.py --train_data_path=cifar10/data_batch* --log_root=data/resnet20/log_root \
7 | --train_dir=data/resnet20/log_root/train --dataset='cifar10' --model=resnet20 \
8 | --num_gpus=1 --checkpoint_dir=data/resnet20/checkpoints --data_format=NCHW
9 | ```
10 |
11 | To evaluate the resulting checkpoints, run:
12 |
13 | ```bash
14 | python3 eval_checkpoints.py -i data/resnet20/checkpoints \
15 | -c "python3 resnet/resnet_main.py --mode=eval --eval_data_path=cifar10/test_batch.bin --eval_dir=data/resnet20/log_root/eval --dataset='cifar10' --model=resnet20 --num_gpus=1 --eval_batch_count=100 --eval_once=True --data_format=NCHW"
16 | ```
17 |
18 | Make sure to first follow the instructions in `resnet/README.md` to download the dataset and set up the expected directory layout.
19 |
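`eval_checkpoints.py` expects the checkpoints directory to contain zero-padded, numbered subdirectories (`00001`, `00002`, ...) plus a `times.log` file with one tab-separated `step: N` / `time: SECONDS` line per checkpoint. A minimal sketch of the parsing it performs (the example line is hypothetical):

```python
# Accumulate wall-clock time per checkpoint step, mirroring eval_checkpoints.py.
times, cum_time = {}, 0.0
with open("data/resnet20/checkpoints/times.log") as f:
    for line in f:  # e.g. "step: 1000\ttime: 37.2"
        step_field, time_field = line.strip().split("\t")
        cum_time += float(time_field.split(": ")[1])
        times[int(step_field.split(": ")[1])] = cum_time
```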
--------------------------------------------------------------------------------
/tensorflow/SQuAD/basic/run_single.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | source_path=$1
3 | target_path=$2
4 | inter_dir="inter_single"
5 | root_dir="save"
6 |
7 | parg=""
8 | marg=""
9 | if [ "$3" = "debug" ]
10 | then
11 | parg="-d"
12 | marg="--debug"
13 | fi
14 |
15 | # Preprocess data
16 | python3 -m squad.prepro --mode single --single_path $source_path $parg --target_dir $inter_dir --glove_dir .
17 |
18 | num=37
19 | load_path="$root_dir/$num/save"
20 | shared_path="$root_dir/$num/shared.json"
21 | eval_path="$inter_dir/eval.pklz"
22 | python3 -m basic.cli --data_dir $inter_dir --eval_path $eval_path --nodump_answer --load_path $load_path --shared_path $shared_path $marg --eval_num_batches 0 --mode forward --batch_size 1 --len_opt --cluster --cpu_opt --load_ema
23 |
24 | # Ensemble (for single run, just one input)
25 | python3 -m basic.ensemble --data_path $inter_dir/data_single.json --shared_path $inter_dir/shared_single.json -o $target_path $eval_path
26 |
27 |
28 |
--------------------------------------------------------------------------------
/tensorflow/SQuAD/basic/run_ensemble.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | source_path=$1
3 | target_path=$2
4 | inter_dir="inter_ensemble"
5 | root_dir="save"
6 |
7 | parg=""
8 | marg=""
9 | if [ "$3" = "debug" ]
10 | then
11 | parg="-d"
12 | marg="--debug"
13 | fi
14 |
15 | # Preprocess data
16 | python3 -m squad.prepro --mode single --single_path $source_path $parg --target_dir $inter_dir --glove_dir .
17 |
18 | eargs=""
19 | for num in 31 33 34 35 36 37 40 41 43 44 45 46; do
20 | load_path="$root_dir/$num/save"
21 | shared_path="$root_dir/$num/shared.json"
22 | eval_path="$inter_dir/eval-$num.pklz"
23 | eargs="$eargs $eval_path"
24 | python3 -m basic.cli --data_dir $inter_dir --eval_path $eval_path --nodump_answer --load_path $load_path --shared_path $shared_path $marg --eval_num_batches 0 --mode forward --batch_size 1 --len_opt --cluster --cpu_opt --load_ema &
25 | done
26 | wait
27 |
28 | # Ensemble
29 | python3 -m basic.ensemble --data_path $inter_dir/data_single.json --shared_path $inter_dir/shared_single.json -o $target_path $eargs
30 |
--------------------------------------------------------------------------------
/tensorflow/SQuAD/basic/get_pr.py:
--------------------------------------------------------------------------------
1 | import json
2 | import argparse
3 |
4 |
5 | def get_args():
6 | parser = argparse.ArgumentParser()
7 | parser.add_argument("path")
8 | parser.add_argument("-t", "--th", type=float, default=0.5)
9 | # TODO : put more args here
10 | return parser.parse_args()
11 |
12 |
13 | def get_pr(args):
14 | with open(args.path, 'r') as fp:
15 | answers = json.load(fp)
16 |
17 | na = answers['na']
18 |
19 | tp = sum(int(not id_.startswith("neg") and score < args.th) for id_, score in na.items())
20 | fp = sum(int(id_.startswith("neg") and score < args.th) for id_, score in na.items())
21 | tn = sum(int(id_.startswith("neg") and score >= args.th) for id_, score in na.items())
22 | fn = sum(int(not id_.startswith("neg") and score >= args.th) for id_, score in na.items())
23 |
24 | p = tp / (tp + fp)
25 | r = tp / (tp + fn)
26 | print("p={:.3f}, r={:.3f}".format(p, r))
27 |
28 |
29 | def main():
30 | args = get_args()
31 | get_pr(args)
32 |
33 | if __name__ == "__main__":
34 | main()
35 |
36 |
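# Expected input (illustrative values): the answer JSON carries an 'na' map from question id
# to a no-answer score, e.g. {"na": {"some_id": 0.12, "neg_some_id": 0.93}, ...}.
# Ids produced by squad/neg_squad.py are prefixed with "neg"; scores below the threshold (-t)
# are treated as "answered".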
--------------------------------------------------------------------------------
/tensorflow/SQuAD/cnn_dm/evaluate.py:
--------------------------------------------------------------------------------
1 | import json
2 | import os
3 | import sys
4 |
5 | root_dir = sys.argv[1]
6 | answer_path = sys.argv[2]
7 | file_names = os.listdir(root_dir)
8 |
9 | num_correct = 0
10 | num_wrong = 0
11 |
12 | with open(answer_path, 'r') as fh:
13 | id2answer_dict = json.load(fh)
14 |
15 | for file_name in file_names:
16 | if not file_name.endswith(".question"):
17 | continue
18 | with open(os.path.join(root_dir, file_name), 'r') as fh:
19 | url = fh.readline().strip()
20 | _ = fh.readline()
21 | para = fh.readline().strip()
22 | _ = fh.readline()
23 | ques = fh.readline().strip()
24 | _ = fh.readline()
25 | answer = fh.readline().strip()
26 | _ = fh.readline()
27 | if file_name in id2answer_dict:
28 | pred = id2answer_dict[file_name]
29 | if pred == answer:
30 | num_correct += 1
31 | else:
32 | num_wrong += 1
33 | else:
34 | num_wrong += 1
35 |
36 | total = num_correct + num_wrong
37 | acc = float(num_correct) / total
38 | print("{} = {} / {}".format(acc, num_correct, total))
--------------------------------------------------------------------------------
/tensorflow/SQuAD/basic/ensemble_fast.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import json
3 | from collections import Counter, defaultdict
4 | import re
5 |
6 | def key_func(pair):
7 | return pair[1]
8 |
9 |
10 | def get_func(vals, probs):
11 | counter = Counter(vals)
12 | # return max(zip(vals, probs), key=lambda pair: pair[1])[0]
13 | # return max(zip(vals, probs), key=lambda pair: pair[1] * counter[pair[0]] / len(counter) - 999 * (len(pair[0]) == 0) )[0]
14 | # return max(zip(vals, probs), key=lambda pair: pair[1] + 0.7 * counter[pair[0]] / len(counter) - 999 * (len(pair[0]) == 0) )[0]
15 | d = defaultdict(float)
16 | for val, prob in zip(vals, probs):
17 | d[val] += prob
18 | d[''] = 0
19 | return max(d.items(), key=lambda pair: pair[1])[0]
20 |
21 | third_path = sys.argv[1]
22 | other_paths = sys.argv[2:]
23 |
24 | others = [json.load(open(path, 'r')) for path in other_paths]
25 |
26 |
27 | c = {}
28 |
29 | assert min(map(len, others)) == max(map(len, others)), list(map(len, others))
30 |
31 | for key in others[0].keys():
32 | if key == 'scores':
33 | continue
34 | probs = [other['scores'][key] for other in others]
35 | vals = [other[key] for other in others]
36 | largest_val = get_func(vals, probs)
37 | c[key] = largest_val
38 |
39 | json.dump(c, open(third_path, 'w'))
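# Ensembling rule implemented above: for each question id, sum every candidate answer
# string's probability across the per-model prediction files, pin the empty answer's score
# to 0, and keep the candidate with the highest summed probability.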
--------------------------------------------------------------------------------
/tensorflow/SQuAD/tree/trainer.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 |
3 | from tree.model import Model
4 |
5 |
6 | class Trainer(object):
7 | def __init__(self, config, model):
8 | assert isinstance(model, Model)
9 | self.config = config
10 | self.model = model
11 | self.opt = tf.train.AdagradOptimizer(config.init_lr)
12 | self.loss = model.get_loss()
13 | self.var_list = model.get_var_list()
14 | self.global_step = model.get_global_step()
15 | self.ema_op = model.ema_op
16 | self.summary = model.summary
17 | self.grads = self.opt.compute_gradients(self.loss, var_list=self.var_list)
18 | opt_op = self.opt.apply_gradients(self.grads, global_step=self.global_step)
19 |
20 | # Define train op
21 | with tf.control_dependencies([opt_op]):
22 | self.train_op = tf.group(self.ema_op)
23 |
24 | def get_train_op(self):
25 | return self.train_op
26 |
27 | def step(self, sess, batch, get_summary=False):
28 | assert isinstance(sess, tf.Session)
29 | feed_dict = self.model.get_feed_dict(batch, True)
30 | if get_summary:
31 | loss, summary, train_op = \
32 | sess.run([self.loss, self.summary, self.train_op], feed_dict=feed_dict)
33 | else:
34 | loss, train_op = sess.run([self.loss, self.train_op], feed_dict=feed_dict)
35 | summary = None
36 | return loss, summary, train_op
37 |
--------------------------------------------------------------------------------
/tensorflow/SQuAD/my/utils.py:
--------------------------------------------------------------------------------
1 | import json
2 | from collections import deque
3 |
4 | import numpy as np
5 | from tqdm import tqdm
6 |
7 |
8 | def mytqdm(list_, desc="", show=True):
9 | if show:
10 | pbar = tqdm(list_)
11 | pbar.set_description(desc)
12 | return pbar
13 | return list_
14 |
15 |
16 | def json_pretty_dump(obj, fh):
17 | return json.dump(obj, fh, sort_keys=True, indent=2, separators=(',', ': '))
18 |
19 |
20 | def index(l, i):
21 | return index(l[i[0]], i[1:]) if len(i) > 1 else l[i[0]]
22 |
23 |
24 | def fill(l, shape, dtype=None):
25 | out = np.zeros(shape, dtype=dtype)
26 | stack = deque()
27 | stack.appendleft(((), l))
28 | while len(stack) > 0:
29 | indices, cur = stack.pop()
30 |         if len(indices) < out.ndim:
31 | for i, sub in enumerate(cur):
32 | stack.appendleft([indices + (i,), sub])
33 | else:
34 | out[indices] = cur
35 | return out
36 |
37 |
38 | def short_floats(o, precision):
39 | class ShortFloat(float):
40 | def __repr__(self):
41 | return '%.{}g'.format(precision) % self
42 |
43 | def _short_floats(obj):
44 | if isinstance(obj, float):
45 | return ShortFloat(obj)
46 | elif isinstance(obj, dict):
47 | return dict((k, _short_floats(v)) for k, v in obj.items())
48 | elif isinstance(obj, (list, tuple)):
49 | return tuple(map(_short_floats, obj))
50 | return obj
51 |
52 | return _short_floats(o)
53 |
54 |
55 | def argmax(x):
56 | return np.unravel_index(x.argmax(), x.shape)
57 |
58 |
59 |
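# Illustrative self-check (not part of the training pipeline): `fill` pads a ragged
# nested list into a fixed-shape array, and `short_floats` shortens float reprs so the
# eval JSON written by graph_handler.dump_eval stays small.
if __name__ == "__main__":
    print(fill([[1, 2], [3]], (2, 3), dtype='int32'))    # pads to [[1, 2, 0], [3, 0, 0]]
    print(short_floats({'f1': 0.123456, 'em': 0.5}, 3))  # {'f1': 0.123, 'em': 0.5}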
--------------------------------------------------------------------------------
/tensorflow/SQuAD/my/corenlp_interface.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | import requests
4 | import nltk
5 | import json
6 | import networkx as nx
7 | import time
8 |
9 |
10 | class CoreNLPInterface(object):
11 | def __init__(self, url, port):
12 | self._url = url
13 | self._port = port
14 |
15 | def get(self, type_, in_, num_max_requests=100):
16 | in_ = in_.encode("utf-8")
17 | url = "http://{}:{}/{}".format(self._url, self._port, type_)
18 | out = None
19 | for _ in range(num_max_requests):
20 | try:
21 | r = requests.post(url, data=in_)
22 | out = r.content.decode('utf-8')
23 | if out == 'error':
24 | out = None
25 | break
26 |             except requests.exceptions.RequestException:
27 | time.sleep(1)
28 | return out
29 |
30 | def split_doc(self, doc):
31 | out = self.get("doc", doc)
32 | return out if out is None else json.loads(out)
33 |
34 | def split_sent(self, sent):
35 | out = self.get("sent", sent)
36 | return out if out is None else json.loads(out)
37 |
38 | def get_dep(self, sent):
39 | out = self.get("dep", sent)
40 | return out if out is None else json.loads(out)
41 |
42 | def get_const(self, sent):
43 | out = self.get("const", sent)
44 | return out
45 |
46 | def get_const_tree(self, sent):
47 | out = self.get_const(sent)
48 | return out if out is None else nltk.tree.Tree.fromstring(out)
49 |
50 | @staticmethod
51 | def dep2tree(dep):
52 | tree = nx.DiGraph()
53 |         for dep_node, i, gov, j, label in dep:
54 |             tree.add_edge(gov, dep_node, label=label)
55 | return tree
56 |
--------------------------------------------------------------------------------
/tensorflow/SQuAD/squad/neg_squad.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import json
3 | import os
4 | # data: q, cq, (dq), (pq), y, *x, *cx
5 | # shared: x, cx, (dx), (px), word_counter, char_counter, word2vec
6 | # no metadata
7 | import random
8 | from collections import Counter
9 |
10 | from tqdm import tqdm
11 |
12 | from squad.utils import get_word_span, get_word_idx, process_tokens
13 |
14 |
15 | def main():
16 | args = get_args()
17 | neg_squad(args)
18 |
19 |
20 | def get_args():
21 | parser = argparse.ArgumentParser()
22 | home = os.path.expanduser("~")
23 | parser.add_argument("source_path")
24 | parser.add_argument("target_path")
25 | parser.add_argument('-d', "--debug", action='store_true')
26 | parser.add_argument('-r', "--aug_ratio", default=1, type=int)
27 | # TODO : put more args here
28 | return parser.parse_args()
29 |
30 |
31 | def neg_squad(args):
32 | with open(args.source_path, 'r') as fp:
33 | squad = json.load(fp)
34 | with open(args.source_path, 'r') as fp:
35 | ref_squad = json.load(fp)
36 |
37 | for ai, article in enumerate(ref_squad['data']):
38 | for pi, para in enumerate(article['paragraphs']):
39 | cands = list(range(pi)) + list(range(pi+1, len(article['paragraphs'])))
40 | samples = random.sample(cands, args.aug_ratio)
41 | for sample in samples:
42 | for qi, ques in enumerate(article['paragraphs'][sample]['qas']):
43 | new_ques = {'question': ques['question'], 'answers': [], 'answer_start': 0, 'id': "neg_" + ques['id']}
44 | squad['data'][ai]['paragraphs'][pi]['qas'].append(new_ques)
45 |
46 | with open(args.target_path, 'w') as fp:
47 | json.dump(squad, fp)
48 |
49 | if __name__ == "__main__":
50 | main()
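# Example invocation (run from the SQuAD root so the `squad` package is importable; the
# output file name is illustrative):
#   python3 -m squad.neg_squad data/squad/train-v1.1.json data/squad/train-v1.1-neg.json -r 1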
--------------------------------------------------------------------------------
/tensorflow/SQuAD/basic_cnn/superhighway.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | from tensorflow.python.ops.rnn_cell import RNNCell
3 |
4 | from my.tensorflow.nn import linear
5 |
6 |
7 | class SHCell(RNNCell):
8 | """
9 | Super-Highway Cell
10 | """
11 | def __init__(self, input_size, logit_func='tri_linear', scalar=False):
12 | self._state_size = input_size
13 | self._output_size = input_size
14 | self._logit_func = logit_func
15 | self._scalar = scalar
16 |
17 | @property
18 | def state_size(self):
19 | return self._state_size
20 |
21 | @property
22 | def output_size(self):
23 | return self._output_size
24 |
25 | def __call__(self, inputs, state, scope=None):
26 | with tf.variable_scope(scope or "SHCell"):
27 | a_size = 1 if self._scalar else self._state_size
28 | h, u = tf.split(axis=1, num_or_size_splits=2, value=inputs)
29 | if self._logit_func == 'mul_linear':
30 | args = [h * u, state * u]
31 | a = tf.nn.sigmoid(linear(args, a_size, True))
32 | elif self._logit_func == 'linear':
33 | args = [h, u, state]
34 | a = tf.nn.sigmoid(linear(args, a_size, True))
35 | elif self._logit_func == 'tri_linear':
36 | args = [h, u, state, h * u, state * u]
37 | a = tf.nn.sigmoid(linear(args, a_size, True))
38 | elif self._logit_func == 'double':
39 | args = [h, u, state]
40 | a = tf.nn.sigmoid(linear(tf.tanh(linear(args, a_size, True)), self._state_size, True))
41 |
42 | else:
43 |                 raise ValueError("unknown logit_func: {}".format(self._logit_func))
44 | new_state = a * state + (1 - a) * h
45 | outputs = state
46 | return outputs, new_state
47 |
48 |
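# Gating summary: each step splits the input into (h, u), computes a gate
# a = sigmoid(linear(...)) from h, u and the previous state (per the chosen logit_func),
# updates new_state = a * state + (1 - a) * h, and emits the previous state as the output.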
--------------------------------------------------------------------------------
/tensorflow/SQuAD/my/zip_save.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 |
4 | import shutil
5 | from zipfile import ZipFile
6 |
7 | from tqdm import tqdm
8 |
9 |
10 | def get_args():
11 | parser = argparse.ArgumentParser()
12 | parser.add_argument('paths', nargs='+')
13 | parser.add_argument('-o', '--out', default='save.zip')
14 | args = parser.parse_args()
15 | return args
16 |
17 |
18 | def zip_save(args):
19 | temp_dir = "."
20 | save_dir = os.path.join(temp_dir, "save")
21 | if not os.path.exists(save_dir):
22 | os.makedirs(save_dir)
23 | for save_source_path in tqdm(args.paths):
24 | # path = "out/basic/30/save/basic-18000"
25 | # target_path = "save_dir/30/save"
26 |         # also output full path name to "save_dir/30/readme.txt"
27 | # need to also extract "out/basic/30/shared.json"
28 | temp, _ = os.path.split(save_source_path) # "out/basic/30/save", _
29 | model_dir, _ = os.path.split(temp) # "out/basic/30, _
30 | _, model_name = os.path.split(model_dir)
31 | cur_dir = os.path.join(save_dir, model_name)
32 | if not os.path.exists(cur_dir):
33 | os.makedirs(cur_dir)
34 | save_target_path = os.path.join(cur_dir, "save")
35 | shared_target_path = os.path.join(cur_dir, "shared.json")
36 | readme_path = os.path.join(cur_dir, "readme.txt")
37 | shared_source_path = os.path.join(model_dir, "shared.json")
38 | shutil.copy(save_source_path, save_target_path)
39 | shutil.copy(shared_source_path, shared_target_path)
40 | with open(readme_path, 'w') as fh:
41 | fh.write(save_source_path)
42 |
43 | os.system("zip {} -r {}".format(args.out, save_dir))
44 |
45 | def main():
46 | args = get_args()
47 | zip_save(args)
48 |
49 | if __name__ == "__main__":
50 | main()
51 |
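# Example invocation (paths are illustrative, following the comments above):
#   python3 -m my.zip_save out/basic/30/save/basic-18000 -o save.zip
# Each checkpoint and its shared.json are copied into ./save/<run id>/ and zipped into --out.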
--------------------------------------------------------------------------------
/pytorch/CIFAR10/benchmark/utils.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 | import re
4 | from functools import reduce
5 |
6 |
7 | class AverageMeter(object):
8 | """Computes and stores the average and current value"""
9 | def __init__(self):
10 | self.reset()
11 |
12 | def reset(self):
13 | self.val = 0
14 | self.avg = 0
15 | self.sum = 0
16 | self.count = 0
17 |
18 | def update(self, val, n=1):
19 | self.val = val
20 | self.sum += val * n
21 | self.count += n
22 | self.avg = self.sum / self.count
23 |
24 |
25 | def count_parameters(model):
26 | c = map(lambda p: reduce(lambda x, y: x * y, p.size()), model.parameters())
27 | return sum(c)
28 |
29 |
30 | def latest_file(model):
31 | restore = f'./run/{model}'
32 | timestamps = sorted(os.listdir(restore))
33 | assert len(timestamps) > 0
34 | run_dir = os.path.join(restore, timestamps[-1])
35 | files = os.listdir(run_dir)
36 | max_checkpoint = -1
37 | for filename in files:
38 |         if re.search(r'checkpoint_\d+\.t7', filename):
39 |             num = int(re.search(r'\d+', filename).group())
40 |
41 | if num > max_checkpoint:
42 | max_checkpoint = num
43 | max_checkpoint_file = filename
44 |
45 | assert max_checkpoint != -1
46 | return os.path.join(run_dir, max_checkpoint_file)
47 |
48 |
49 | def save_result(result, path):
50 | write_heading = not os.path.exists(path)
51 | with open(path, mode='a') as out:
52 | if write_heading:
53 | out.write(",".join([str(k) for k, v in result.items()]) + '\n')
54 | out.write(",".join([str(v) for k, v in result.items()]) + '\n')
55 |
56 |
57 | def save_config(config, run_dir):
58 | path = os.path.join(run_dir, "config_{}.json".format(config['timestamp']))
59 | with open(path, 'w') as config_file:
60 | json.dump(config, config_file)
61 | config_file.write('\n')
62 |
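# Illustrative usage (hypothetical values, assuming the ./run layout described in the README):
#   latest_file('preact164')
#     -> './run/preact164/<newest timestamp>/checkpoint_<highest step>.t7'
#   save_result({'epoch': 1, 'loss': 0.42}, 'train_results.csv')
#     appends one CSV row, writing the "epoch,loss" header only if the file did not exist yet.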
--------------------------------------------------------------------------------
/pytorch/CIFAR10/README.md:
--------------------------------------------------------------------------------
1 | # Install
2 |
3 | 1. Install PyTorch v0.1.12. If you don't already have it set up, [please follow the official install instructions](http://pytorch.org/).
4 | 2. Clone this repo and go to this directory
5 |
6 | ```bash
7 | git clone git@github.com:stanford-futuredata/dawn-bench-models.git
8 | cd dawn-bench-models/pytorch/CIFAR10
9 | ```
10 |
11 | 3. Install this package
12 |
13 | ```bash
14 | pip install -e .
15 | ```
16 |
17 | # Quick start
18 |
19 | This package adds `cifar10` and `imagenet` command-line interfaces.
20 | Both include a `train` subcommand for learning a model from scratch.
21 | As an example, here is how to train ResNet164 with preactivation on CIFAR10:
22 |
23 | ```bash
24 | cifar10 train -c last --augmentation --tracking -b 128 --optimizer sgd --arch preact164 -e 5 -l 0.01
25 | cifar10 train -c last --augmentation --tracking -b 128 --optimizer sgd --arch preact164 -e 90 -l 0.1 --restore latest
26 | cifar10 train -c last --augmentation --tracking -b 128 --optimizer sgd --arch preact164 -e 45 -l 0.01 --restore latest
27 | cifar10 train -c last --augmentation --tracking -b 128 --optimizer sgd --arch preact164 -e 45 -l 0.001 --restore latest
28 | ```
29 |
30 | The first command creates a new run of ResNet164 with preactivation (`--arch preact164`) in the `./run/preact164/[TIMESTAMP]` directory and starts a warm-up of 5 epochs (`-e 5`) with SGD (`--optimizer sgd`) and a learning rate of 0.01 (`-l 0.01`).
31 | `-c last` indicates that we only want to save a checkpoint after the last epoch of the warm-up.
32 | `-b 128` sets the batch size to 128.
33 | `--augmentation` turns on standard data augmentation, i.e. random crop and flip.
34 | `--tracking` saves training and validation results to CSV files at `./run/preact164/[TIMESTAMP]/[train|valid]_results.csv`.
35 |
36 | The second command resumes the run from the first command (`--restore latest`) for another 90 epochs (`-e 90`) but with a new learning rate (`-l 0.1`). The third and fourth commands function similarly to the second command, changing the learning rate and running for more epochs.
37 |
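For reference, `--restore latest` is resolved by `latest_file` in `benchmark/utils.py`: it picks the newest timestamped run directory under `./run/preact164` and the highest-numbered `checkpoint_<step>.t7` inside it. A rough sketch of that lookup (illustrative only):

```python
import os
import re

def latest_checkpoint(arch, run_root='./run'):
    # Mirrors latest_file() in benchmark/utils.py: newest timestamp, highest checkpoint number.
    arch_dir = os.path.join(run_root, arch)
    run_dir = os.path.join(arch_dir, sorted(os.listdir(arch_dir))[-1])
    ckpts = [f for f in os.listdir(run_dir) if re.search(r'checkpoint_\d+\.t7', f)]
    return os.path.join(run_dir, max(ckpts, key=lambda f: int(re.search(r'\d+', f).group())))
```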
--------------------------------------------------------------------------------
/tensorflow/SQuAD/tree/graph_handler.py:
--------------------------------------------------------------------------------
1 | import json
2 | from json import encoder
3 | import os
4 |
5 | import tensorflow as tf
6 |
7 | from tree.evaluator import Evaluation
8 | from my.utils import short_floats
9 |
10 |
11 | class GraphHandler(object):
12 | def __init__(self, config):
13 | self.config = config
14 | self.saver = tf.train.Saver()
15 | self.writer = None
16 | self.save_path = os.path.join(config.save_dir, config.model_name)
17 |
18 | def initialize(self, sess):
19 | if self.config.load:
20 | self._load(sess)
21 | else:
22 | sess.run(tf.global_variables_initializer())
23 |
24 | if self.config.mode == 'train':
25 | self.writer = tf.summary.FileWriter(self.config.log_dir, graph=tf.get_default_graph())
26 |
27 | def save(self, sess, global_step=None):
28 | self.saver.save(sess, self.save_path, global_step=global_step)
29 |
30 | def _load(self, sess):
31 | config = self.config
32 | if config.load_step > 0:
33 | save_path = os.path.join(config.save_dir, "{}-{}".format(config.model_name, config.load_step))
34 | else:
35 | save_dir = config.save_dir
36 | checkpoint = tf.train.get_checkpoint_state(save_dir)
37 | assert checkpoint is not None, "cannot load checkpoint at {}".format(save_dir)
38 | save_path = checkpoint.model_checkpoint_path
39 | print("Loading saved model from {}".format(save_path))
40 | self.saver.restore(sess, save_path)
41 |
42 | def add_summary(self, summary, global_step):
43 | self.writer.add_summary(summary, global_step)
44 |
45 | def add_summaries(self, summaries, global_step):
46 | for summary in summaries:
47 | self.add_summary(summary, global_step)
48 |
49 | def dump_eval(self, e, precision=2):
50 | assert isinstance(e, Evaluation)
51 | path = os.path.join(self.config.eval_dir, "{}-{}.json".format(e.data_type, str(e.global_step).zfill(6)))
52 | with open(path, 'w') as fh:
53 | json.dump(short_floats(e.dict, precision), fh)
54 |
55 |
--------------------------------------------------------------------------------
/tensorflow/CIFAR10/time_inference.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 | import subprocess
4 | import sys
5 |
6 | def main(checkpoint_path, model, use_bottleneck):
7 | print("Number of images\tInference time")
8 | num_trials = 10
9 | for batch_size in [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192]:
10 | command = ("python3 resnet/resnet_main.py --mode=eval --eval_data_path=cifar10/test_batch.bin "
11 | "--eval_dir=data/%(model)s/log_root/eval --dataset='cifar10' --model=%(model)s "
12 | "--use_bottleneck=%(use_bottleneck)s --eval_batch_count=%(num_trials)d --eval_once=True --num_gpus=1 "
13 | "--data_format=NHWC --time_inference=True --eval_batch_count=1 --batch_size=%(batch_size)d" %
14 | {"model": model, "use_bottleneck": "True" if use_bottleneck else "False", "batch_size": batch_size,
15 | "num_trials": num_trials})
16 | full_command = command + " --log_root=%s 2>/dev/null" % checkpoint_path
17 | try:
18 | output = subprocess.check_output(full_command, shell=True)
19 | output = output.decode('utf8').strip()
20 | for line in output.split('\n'):
21 | if "Time for inference" in line:
22 | line = line.strip()
23 | inference_time = float(line.split(": ")[1]) / num_trials
24 | stats = [batch_size, inference_time]
25 | print("\t".join([str(stat) for stat in stats]))
26 | sys.stdout.flush()
27 |         except Exception:
28 | stats = [batch_size, ""]
29 | print("\t".join([str(stat) for stat in stats]))
30 | sys.stdout.flush()
31 |
32 |
33 | if __name__ == '__main__':
34 | parser = argparse.ArgumentParser(
35 |         description=("Time inference for a trained model over a range of batch sizes")
36 | )
37 | parser.add_argument('-i', "--checkpoint_path", type=str, required=True,
38 | help="Path to dumped model checkpoints")
39 | parser.add_argument('-m', "--model", type=str, required=True,
40 | help="Model name")
41 |     parser.add_argument('-b', "--use_bottleneck", action='store_true',
42 |                         help="Use bottleneck")
43 |
44 | cmdline_args = parser.parse_args()
45 | opt_dict = vars(cmdline_args)
46 |
47 | checkpoint_path = opt_dict["checkpoint_path"]
48 | model = opt_dict["model"]
49 | use_bottleneck = opt_dict["use_bottleneck"]
50 |
51 | main(checkpoint_path, model, use_bottleneck)
52 |
--------------------------------------------------------------------------------
/tensorflow/CIFAR10/eval_checkpoints.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 | import subprocess
4 | import sys
5 |
6 | def main(checkpoints_path, command, start_cnt):
7 | cnt = start_cnt
8 |
9 | times = {}
10 | cum_time = 0.0
11 | with open(os.path.join(checkpoints_path, "times.log"), 'r') as f:
12 | output = f.read().strip()
13 | output_lines = output.split('\n')
14 | for output_line in output_lines:
15 | [step, time] = output_line.split('\t')
16 | step = int(step.split(': ')[1])
17 | time = float(time.split(': ')[1])
18 | cum_time += time
19 | times[step] = cum_time
20 |
21 | print("Time (in secs)\tNumber of minibatches\tTop 1 accuracy\tTop 5 accuracy")
22 | while True:
23 | ckpt_path = ("%5d" % cnt).replace(' ', '0')
24 | full_ckpt_path = os.path.join(checkpoints_path, ckpt_path)
25 | if not os.path.exists(full_ckpt_path):
26 | break
27 | if len(os.listdir(full_ckpt_path)) <= 2:
28 | cnt += 1
29 | continue
30 | full_command = command + " --log_root=%s 2>/dev/null" % full_ckpt_path
31 | output = subprocess.check_output(full_command, shell=True)
32 | output = output.decode('utf8').strip()
33 | for line in output.split('\n'):
34 | if "Precision" in line and "Recall" in line:
35 | tokens = line.split(", ") # TODO: Nasty hack, make more robust.
36 | precision_at_1 = float(tokens[0].split()[-1])
37 | recall_at_5 = float(tokens[1].split()[-1])
38 | step = int(tokens[2].split()[3])
39 | stats = [times[step], step, precision_at_1, recall_at_5]
40 | print("\t".join([str(stat) for stat in stats]))
41 | sys.stdout.flush()
42 | cnt += 1
43 |
44 |
45 | if __name__ == '__main__':
46 | parser = argparse.ArgumentParser(
47 |         description=("Evaluate a directory of model checkpoints and report accuracy over time")
48 | )
49 | parser.add_argument('-i', "--checkpoints_path", type=str, required=True,
50 | help="Path to dumped model checkpoints")
51 | parser.add_argument('-c', "--command", type=str, required=True,
52 | help="Command to evaluate each individual checkpoint")
53 | parser.add_argument('-s', "--start_cnt", type=int, default=1,
54 | help="Count to start evaluating checkpoints from")
55 |
56 | cmdline_args = parser.parse_args()
57 | opt_dict = vars(cmdline_args)
58 |
59 | main(opt_dict["checkpoints_path"], opt_dict["command"], opt_dict["start_cnt"])
60 |
--------------------------------------------------------------------------------
/tensorflow/SQuAD/tree/templates/visualizer.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | {{ title }}
6 |
7 |
8 |
19 |
20 |
23 |
24 | {{ title }}
25 |
26 |
27 | | ID |
28 | Question |
29 | Answer |
30 | Paragraph |
31 |
32 | {% for row in rows %}
33 |
34 | | {{ row.id }} |
35 |
36 | {% for qj in row.ques %}
37 | {{ qj }}
38 | {% endfor %}
39 | |
40 | {{ row.a }} |
41 |
42 |
43 | {% for xj, yj, y2j, ypj, yp2j in zip(row.para, row.y, row.y2, row.yp, row.yp2) %}
44 |
45 | {% for xjk, yjk, y2jk, ypjk in zip(xj, yj, y2j, ypj) %}
46 | |
47 | {% if yjk or y2jk %}
48 | {{ xjk }}
49 | {% else %}
50 | {{ xjk }}
51 | {% endif %}
52 | |
53 | {% endfor %}
54 |
55 |
56 | {% for xjk, yp2jk in zip(xj, yp2j) %}
57 | | - |
58 | {% endfor %}
59 |
60 | {% endfor %}
61 |
62 | |
63 |
64 | {% endfor %}
65 |
66 |
67 |
--------------------------------------------------------------------------------
/tensorflow/SQuAD/tree/cli.py:
--------------------------------------------------------------------------------
1 | import os
2 | from pprint import pprint
3 |
4 | import tensorflow as tf
5 |
6 | from tree.main import main as m
7 |
8 | flags = tf.app.flags
9 |
10 | flags.DEFINE_string("model_name", "tree", "Model name [tree]")
11 | flags.DEFINE_string("data_dir", "data/squad", "Data dir [data/squad]")
12 | flags.DEFINE_integer("run_id", 0, "Run ID [0]")
13 |
14 | flags.DEFINE_integer("batch_size", 128, "Batch size [128]")
15 | flags.DEFINE_float("init_lr", 0.5, "Initial learning rate [0.5]")
16 | flags.DEFINE_integer("num_epochs", 50, "Total number of epochs for training [50]")
17 | flags.DEFINE_integer("num_steps", 0, "Number of steps [0]")
18 | flags.DEFINE_integer("eval_num_batches", 100, "eval num batches [100]")
19 | flags.DEFINE_integer("load_step", 0, "load step [0]")
20 | flags.DEFINE_integer("early_stop", 4, "early stop [4]")
21 |
22 | flags.DEFINE_string("mode", "test", "train | test | forward [test]")
23 | flags.DEFINE_boolean("load", True, "load saved data? [True]")
24 | flags.DEFINE_boolean("progress", True, "Show progress? [True]")
25 | flags.DEFINE_integer("log_period", 100, "Log period [100]")
26 | flags.DEFINE_integer("eval_period", 1000, "Eval period [1000]")
27 | flags.DEFINE_integer("save_period", 1000, "Save Period [1000]")
28 | flags.DEFINE_float("decay", 0.9, "Exponential moving average decay [0.9]")
29 |
30 | flags.DEFINE_boolean("draft", False, "Draft for quick testing? [False]")
31 |
32 | flags.DEFINE_integer("hidden_size", 32, "Hidden size [32]")
33 | flags.DEFINE_float("input_keep_prob", 0.5, "Input keep prob [0.5]")
34 | flags.DEFINE_integer("char_emb_size", 8, "Char emb size [8]")
35 | flags.DEFINE_integer("char_filter_height", 5, "Char filter height [5]")
36 | flags.DEFINE_float("wd", 0.0001, "Weight decay [0.0001]")
37 | flags.DEFINE_bool("lower_word", True, "lower word [True]")
38 | flags.DEFINE_bool("dump_eval", True, "dump eval? [True]")
39 |
40 | flags.DEFINE_integer("word_count_th", 100, "word count th [100]")
41 | flags.DEFINE_integer("char_count_th", 500, "char count th [500]")
42 | flags.DEFINE_integer("sent_size_th", 64, "sent size th [64]")
43 | flags.DEFINE_integer("num_sents_th", 8, "num sents th [8]")
44 | flags.DEFINE_integer("ques_size_th", 64, "ques size th [64]")
45 | flags.DEFINE_integer("word_size_th", 16, "word size th [16]")
46 | flags.DEFINE_integer("tree_height_th", 16, "tree height th [16]")
47 |
48 |
49 | def main(_):
50 | config = flags.FLAGS
51 |
52 | config.out_dir = os.path.join("out", config.model_name, str(config.run_id).zfill(2))
53 |
54 | m(config)
55 |
56 | if __name__ == "__main__":
57 | tf.app.run()
58 |
--------------------------------------------------------------------------------
/tensorflow/SQuAD/basic/templates/visualizer.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | {{ title }}
6 |
7 |
8 |
19 |
20 |
23 |
24 | {{ title }}
25 |
26 |
27 | | ID |
28 | Question |
29 | Answers |
30 | Predicted |
31 | Score |
32 | Paragraph |
33 |
34 | {% for row in rows %}
35 |
36 | | {{ row.id }} |
37 |
38 | {% for qj in row.ques %}
39 | {{ qj }}
40 | {% endfor %}
41 | |
42 |
43 | {% for aa in row.a %}
44 | {{ aa }}
45 | {% endfor %}
46 | |
47 | {{ row.ap }} |
48 | {{ row.score }} |
49 |
50 |
51 | {% for xj, ypj, yp2j in zip(row.para, row.yp, row.yp2) %}
52 |
53 | {% set rowloop = loop %}
54 | {% for xjk, ypjk in zip(xj, ypj) %}
55 | |
56 | {% if row.y[0][0] == rowloop.index0 and row.y[0][1] <= loop.index0 <= row.y[1][1] %}
57 | {{ xjk }}
58 | {% else %}
59 | {{ xjk }}
60 | {% endif %}
61 | |
62 | {% endfor %}
63 |
64 |
65 | {% for xjk, yp2jk in zip(xj, yp2j) %}
66 | | - |
67 | {% endfor %}
68 |
69 | {% endfor %}
70 |
71 | |
72 |
73 | {% endfor %}
74 |
75 |
76 |
--------------------------------------------------------------------------------
/tensorflow/SQuAD/basic_cnn/templates/visualizer.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | {{ title }}
6 |
7 |
8 |
19 |
20 |
23 |
24 | {{ title }}
25 |
26 |
27 | | ID |
28 | Question |
29 | Answers |
30 | Predicted |
31 | Score |
32 | Paragraph |
33 |
34 | {% for row in rows %}
35 |
36 | | {{ row.id }} |
37 |
38 | {% for qj in row.ques %}
39 | {{ qj }}
40 | {% endfor %}
41 | |
42 |
43 | {% for aa in row.a %}
44 | {{ aa }}
45 | {% endfor %}
46 | |
47 | {{ row.ap }} |
48 | {{ row.score }} |
49 |
50 |
51 | {% for xj, ypj, yp2j in zip(row.para, row.yp, row.yp2) %}
52 |
53 | {% set rowloop = loop %}
54 | {% for xjk, ypjk in zip(xj, ypj) %}
55 | |
56 | {% if row.y[0][0] == rowloop.index0 and row.y[0][1] <= loop.index0 <= row.y[1][1] %}
57 | {{ xjk }}
58 | {% else %}
59 | {{ xjk }}
60 | {% endif %}
61 | |
62 | {% endfor %}
63 |
64 |
65 | {% for xjk, yp2jk in zip(xj, yp2j) %}
66 | | - |
67 | {% endfor %}
68 |
69 | {% endfor %}
70 |
71 | |
72 |
73 | {% endfor %}
74 |
75 |
76 |
--------------------------------------------------------------------------------
/tensorflow/SQuAD/basic_cnn/graph_handler.py:
--------------------------------------------------------------------------------
1 | import gzip
2 | import json
3 | from json import encoder
4 | import os
5 |
6 | import tensorflow as tf
7 |
8 | from basic_cnn.evaluator import Evaluation, F1Evaluation
9 | from my.utils import short_floats
10 |
11 | import pickle
12 |
13 |
14 | class GraphHandler(object):
15 | def __init__(self, config):
16 | self.config = config
17 | self.saver = tf.train.Saver(max_to_keep=config.max_to_keep)
18 | self.writer = None
19 | self.save_path = os.path.join(config.save_dir, config.model_name)
20 |
21 | def initialize(self, sess):
22 | if self.config.load:
23 | self._load(sess)
24 | else:
25 | sess.run(tf.global_variables_initializer())
26 |
27 | if self.config.mode == 'train':
28 | self.writer = tf.summary.FileWriter(self.config.log_dir, graph=tf.get_default_graph())
29 |
30 | def save(self, sess, global_step=None):
31 | self.saver.save(sess, self.save_path, global_step=global_step)
32 |
33 | def _load(self, sess):
34 | config = self.config
35 | if config.load_path:
36 | save_path = config.load_path
37 | elif config.load_step > 0:
38 | save_path = os.path.join(config.save_dir, "{}-{}".format(config.model_name, config.load_step))
39 | else:
40 | save_dir = config.save_dir
41 | checkpoint = tf.train.get_checkpoint_state(save_dir)
42 | assert checkpoint is not None, "cannot load checkpoint at {}".format(save_dir)
43 | save_path = checkpoint.model_checkpoint_path
44 | print("Loading saved model from {}".format(save_path))
45 | self.saver.restore(sess, save_path)
46 |
47 | def add_summary(self, summary, global_step):
48 | self.writer.add_summary(summary, global_step)
49 |
50 | def add_summaries(self, summaries, global_step):
51 | for summary in summaries:
52 | self.add_summary(summary, global_step)
53 |
54 | def dump_eval(self, e, precision=2, path=None):
55 | assert isinstance(e, Evaluation)
56 | if self.config.dump_pickle:
57 | path = path or os.path.join(self.config.eval_dir, "{}-{}.pklz".format(e.data_type, str(e.global_step).zfill(6)))
58 | with gzip.open(path, 'wb', compresslevel=3) as fh:
59 | pickle.dump(e.dict, fh)
60 | else:
61 | path = path or os.path.join(self.config.eval_dir, "{}-{}.json".format(e.data_type, str(e.global_step).zfill(6)))
62 | with open(path, 'w') as fh:
63 | json.dump(short_floats(e.dict, precision), fh)
64 |
65 | def dump_answer(self, e, path=None):
66 | assert isinstance(e, Evaluation)
67 | path = path or os.path.join(self.config.answer_dir, "{}-{}.json".format(e.data_type, str(e.global_step).zfill(6)))
68 | with open(path, 'w') as fh:
69 | json.dump(e.id2answer_dict, fh)
70 |
71 |
--------------------------------------------------------------------------------
/tensorflow/SQuAD/basic_cnn/trainer.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 |
3 | from basic_cnn.model import Model
4 | from my.tensorflow import average_gradients
5 |
6 |
7 | class Trainer(object):
8 | def __init__(self, config, model):
9 | assert isinstance(model, Model)
10 | self.config = config
11 | self.model = model
12 | self.opt = tf.train.AdadeltaOptimizer(config.init_lr)
13 | self.loss = model.get_loss()
14 | self.var_list = model.get_var_list()
15 | self.global_step = model.get_global_step()
16 | self.summary = model.summary
17 | self.grads = self.opt.compute_gradients(self.loss, var_list=self.var_list)
18 | self.train_op = self.opt.apply_gradients(self.grads, global_step=self.global_step)
19 |
20 | def get_train_op(self):
21 | return self.train_op
22 |
23 | def step(self, sess, batch, get_summary=False):
24 | assert isinstance(sess, tf.Session)
25 | _, ds = batch
26 | feed_dict = self.model.get_feed_dict(ds, True)
27 | if get_summary:
28 | loss, summary, train_op = \
29 | sess.run([self.loss, self.summary, self.train_op], feed_dict=feed_dict)
30 | else:
31 | loss, train_op = sess.run([self.loss, self.train_op], feed_dict=feed_dict)
32 | summary = None
33 | return loss, summary, train_op
34 |
35 |
36 | class MultiGPUTrainer(object):
37 | def __init__(self, config, models):
38 | model = models[0]
39 | assert isinstance(model, Model)
40 | self.config = config
41 | self.model = model
42 | self.opt = tf.train.AdadeltaOptimizer(config.init_lr)
43 | self.var_list = model.get_var_list()
44 | self.global_step = model.get_global_step()
45 | self.summary = model.summary
46 | self.models = models
47 | losses = []
48 | grads_list = []
49 | for gpu_idx, model in enumerate(models):
50 | with tf.name_scope("grads_{}".format(gpu_idx)), tf.device("/gpu:{}".format(gpu_idx)):
51 | loss = model.get_loss()
52 | grads = self.opt.compute_gradients(loss, var_list=self.var_list)
53 | losses.append(loss)
54 | grads_list.append(grads)
55 |
56 | self.loss = tf.add_n(losses)/len(losses)
57 | self.grads = average_gradients(grads_list)
58 | self.train_op = self.opt.apply_gradients(self.grads, global_step=self.global_step)
59 |
60 | def step(self, sess, batches, get_summary=False):
61 | assert isinstance(sess, tf.Session)
62 | feed_dict = {}
63 | for batch, model in zip(batches, self.models):
64 | _, ds = batch
65 | feed_dict.update(model.get_feed_dict(ds, True))
66 |
67 | if get_summary:
68 | loss, summary, train_op = \
69 | sess.run([self.loss, self.summary, self.train_op], feed_dict=feed_dict)
70 | else:
71 | loss, train_op = sess.run([self.loss, self.train_op], feed_dict=feed_dict)
72 | summary = None
73 | return loss, summary, train_op
74 |
--------------------------------------------------------------------------------
/tensorflow/SQuAD/basic/trainer.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 |
3 | from basic.model import Model
4 | from my.tensorflow import average_gradients
5 |
6 |
7 | class Trainer(object):
8 | def __init__(self, config, model):
9 | assert isinstance(model, Model)
10 | self.config = config
11 | self.model = model
12 | self.opt = tf.train.AdamOptimizer(config.init_lr)
13 | self.loss = model.get_loss()
14 | self.var_list = model.get_var_list()
15 | self.global_step = model.get_global_step()
16 | self.summary = model.summary
17 | self.grads = self.opt.compute_gradients(self.loss, var_list=self.var_list)
18 | self.train_op = self.opt.apply_gradients(self.grads, global_step=self.global_step)
19 |
20 | def get_train_op(self):
21 | return self.train_op
22 |
23 | def step(self, sess, batch, get_summary=False):
24 | assert isinstance(sess, tf.Session)
25 | _, ds = batch
26 | feed_dict = self.model.get_feed_dict(ds, True)
27 | if get_summary:
28 | loss, summary, train_op = \
29 | sess.run([self.loss, self.summary, self.train_op], feed_dict=feed_dict)
30 | else:
31 | loss, train_op = sess.run([self.loss, self.train_op], feed_dict=feed_dict)
32 | summary = None
33 | return loss, summary, train_op
34 |
35 |
36 | class MultiGPUTrainer(object):
37 | def __init__(self, config, models):
38 | model = models[0]
39 | assert isinstance(model, Model)
40 | self.config = config
41 | self.model = model
42 | self.opt = tf.train.AdamOptimizer(config.init_lr)
43 | self.var_list = model.get_var_list()
44 | self.global_step = model.get_global_step()
45 | self.summary = model.summary
46 | self.models = models
47 | losses = []
48 | grads_list = []
49 | for gpu_idx, model in enumerate(models):
50 | with tf.name_scope("grads_{}".format(gpu_idx)), tf.device("/{}:{}".format(config.device_type, gpu_idx)):
51 | loss = model.get_loss()
52 | grads = self.opt.compute_gradients(loss, var_list=self.var_list)
53 | losses.append(loss)
54 | grads_list.append(grads)
55 |
56 | self.loss = tf.add_n(losses)/len(losses)
57 | self.grads = average_gradients(grads_list)
58 | self.train_op = self.opt.apply_gradients(self.grads, global_step=self.global_step)
59 |
60 | def step(self, sess, batches, get_summary=False):
61 | assert isinstance(sess, tf.Session)
62 | feed_dict = {}
63 | for batch, model in zip(batches, self.models):
64 | _, ds = batch
65 | feed_dict.update(model.get_feed_dict(ds, True))
66 |
67 | if get_summary:
68 | loss, summary, train_op = \
69 | sess.run([self.loss, self.summary, self.train_op], feed_dict=feed_dict)
70 | else:
71 | loss, train_op = sess.run([self.loss, self.train_op], feed_dict=feed_dict)
72 | summary = None
73 | return loss, summary, train_op
74 |
--------------------------------------------------------------------------------
/tensorflow/CIFAR10/resnet/README.md:
--------------------------------------------------------------------------------
1 | # ResNet on CIFAR10 and CIFAR100
2 |
3 | (Borrowed from the tensorflow/models repository)
4 |
5 | ## Dataset
6 |
7 | https://www.cs.toronto.edu/~kriz/cifar.html
8 |
9 | ## Related papers
10 |
11 | - [Identity Mappings in Deep Residual Networks](https://arxiv.org/pdf/1603.05027v2.pdf)
12 | - [Deep Residual Learning for Image Recognition](https://arxiv.org/pdf/1512.03385v1.pdf)
13 | - [Wide Residual Networks](https://arxiv.org/pdf/1605.07146v1.pdf)
14 |
15 | ## Setting
16 |
17 | * Pad to 36x36 and random crop. Horizontal flip. Per-image whitening.
18 | * Momentum optimizer (momentum = 0.9).
19 | * Learning rate schedule: 0.01 (1 epoch), 0.1 (90 epochs), 0.01 (45 epochs), 0.001 (45 epochs).
20 | * L2 weight decay: 0.005.
21 | * Batch size: 128. (28-10 wide and 1001 layer bottleneck use 64)
22 |
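As a reading aid, the schedule above amounts to the following piecewise function of the epoch (a sketch only; the boundaries are just the cumulative epoch counts listed above):

```python
def learning_rate(epoch):
    # 0.01 for 1 warm-up epoch, then 0.1 / 0.01 / 0.001 for 90 / 45 / 45 epochs.
    if epoch < 1:
        return 0.01
    elif epoch < 91:
        return 0.1
    elif epoch < 136:
        return 0.01
    else:
        return 0.001
```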
23 | ## Results
24 |
25 | CIFAR-10 Model|Best Precision|Steps
26 | --------------|--------------|------
27 | 32 layer|92.5%|~80k
28 | 110 layer|93.6%|~80k
29 | 164 layer bottleneck|94.5%|~80k
30 | 1001 layer bottleneck|94.9%|~80k
31 | 28-10 wide|95%|~90k
32 |
33 | CIFAR-100 Model|Best Precision|Steps
34 | ---------------|--------------|-----
35 | 32 layer|68.1%|~45k
36 | 110 layer|71.3%|~60k
37 | 164 layer bottleneck|75.7%|~50k
38 | 1001 layer bottleneck|78.2%|~70k
39 | 28-10 wide|78.3%|~70k
40 |
41 | ## Prerequisites
42 |
43 | 1. Install TensorFlow 1.2 (preferably from source for higher performance) and Python 3.6.2.
44 |
45 | 2. Download CIFAR-10/CIFAR-100 dataset.
46 |
47 | ```shell
48 | curl -o cifar-10-binary.tar.gz https://www.cs.toronto.edu/~kriz/cifar-10-binary.tar.gz
49 | curl -o cifar-100-binary.tar.gz https://www.cs.toronto.edu/~kriz/cifar-100-binary.tar.gz
50 | ```
51 |
52 | ## How to run
53 |
54 | ```shell
55 | # cd to the models repository and run with bash. Expected command output shown.
56 | # The directory should contain an empty WORKSPACE file, the resnet code, and the cifar10 dataset.
57 | # Note: you can split 5k images off the training set to use as an eval set.
58 | $ ls -R
59 | .:
60 | cifar10 resnet WORKSPACE
61 |
62 | ./cifar10:
63 | data_batch_1.bin data_batch_2.bin data_batch_3.bin data_batch_4.bin
64 | data_batch_5.bin test_batch.bin
65 |
66 | ./resnet:
67 | cifar_input.py README.md resnet_main.py resnet_model.py
68 |
69 | # Train the model.
70 | $ python3 resnet/resnet_main.py --train_data_path=cifar10/data_batch* \
71 | --log_root=/tmp/resnet_model \
72 | --train_dir=/tmp/resnet_model/train \
73 | --dataset='cifar10' \
74 | --num_gpus=1
75 |
76 | # While the model is training, you can also check on its progress using tensorboard:
77 | $ tensorboard --logdir=/tmp/resnet_model
78 |
79 | # Evaluate the model.
80 | # Avoid running on the same GPU as the training job at the same time,
81 | # otherwise, you might run out of memory.
82 | $ python3 resnet/resnet_main.py --eval_data_path=cifar10/test_batch.bin \
83 | --log_root=/tmp/resnet_model \
84 | --eval_dir=/tmp/resnet_model/test \
85 | --mode=eval \
86 | --dataset='cifar10' \
87 | --num_gpus=0
88 | ```
89 |
--------------------------------------------------------------------------------
/tensorflow/SQuAD/basic/graph_handler.py:
--------------------------------------------------------------------------------
1 | import gzip
2 | import json
3 | from json import encoder
4 | import os
5 |
6 | import tensorflow as tf
7 |
8 | from basic.evaluator import Evaluation, F1Evaluation
9 | from my.utils import short_floats
10 |
11 | import pickle
12 |
13 |
14 | class GraphHandler(object):
15 | def __init__(self, config, model):
16 | self.config = config
17 | self.model = model
18 | self.saver = tf.train.Saver(max_to_keep=config.max_to_keep)
19 | self.writer = None
20 | self.save_path = os.path.join(config.save_dir, config.model_name)
21 |
22 | def initialize(self, sess):
23 | sess.run(tf.global_variables_initializer())
24 | if self.config.load:
25 | self._load(sess)
26 |
27 | if self.config.mode == 'train':
28 | self.writer = tf.summary.FileWriter(self.config.log_dir, graph=tf.get_default_graph())
29 |
30 | def save(self, sess, global_step=None):
31 | saver = tf.train.Saver(max_to_keep=self.config.max_to_keep)
32 | saver.save(sess, self.save_path, global_step=global_step)
33 |
34 | def _load(self, sess):
35 | config = self.config
36 | vars_ = {var.name.split(":")[0]: var for var in tf.global_variables()}
37 | if config.load_ema:
38 | ema = self.model.var_ema
39 | for var in tf.trainable_variables():
40 | del vars_[var.name.split(":")[0]]
41 | vars_[ema.average_name(var)] = var
42 | saver = tf.train.Saver(vars_, max_to_keep=config.max_to_keep)
43 |
44 | if config.load_path:
45 | save_path = config.load_path
46 | elif config.load_step > 0:
47 | save_path = os.path.join(config.save_dir, "{}-{}".format(config.model_name, config.load_step))
48 | else:
49 | save_dir = config.save_dir
50 | checkpoint = tf.train.get_checkpoint_state(save_dir)
51 | assert checkpoint is not None, "cannot load checkpoint at {}".format(save_dir)
52 | save_path = checkpoint.model_checkpoint_path
53 | print("Loading saved model from {}".format(save_path))
54 | saver.restore(sess, save_path)
55 |
56 | def add_summary(self, summary, global_step):
57 | self.writer.add_summary(summary, global_step)
58 |
59 | def add_summaries(self, summaries, global_step):
60 | for summary in summaries:
61 | self.add_summary(summary, global_step)
62 |
63 | def dump_eval(self, e, precision=2, path=None):
64 | assert isinstance(e, Evaluation)
65 | if self.config.dump_pickle:
66 | path = path or os.path.join(self.config.eval_dir, "{}-{}.pklz".format(e.data_type, str(e.global_step).zfill(6)))
67 | with gzip.open(path, 'wb', compresslevel=3) as fh:
68 | pickle.dump(e.dict, fh)
69 | else:
70 | path = path or os.path.join(self.config.eval_dir, "{}-{}.json".format(e.data_type, str(e.global_step).zfill(6)))
71 | with open(path, 'w') as fh:
72 | json.dump(short_floats(e.dict, precision), fh)
73 |
74 | def dump_answer(self, e, path=None):
75 | assert isinstance(e, Evaluation)
76 | path = path or os.path.join(self.config.answer_dir, "{}-{}.json".format(e.data_type, str(e.global_step).zfill(6)))
77 | with open(path, 'w') as fh:
78 | json.dump(e.id2answer_dict, fh)
79 |
80 |
--------------------------------------------------------------------------------
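
A minimal sketch of how `GraphHandler` is typically driven under TF 1.x. The `config` namespace below is hypothetical; it only supplies the attributes the class actually reads (`max_to_keep`, `save_dir`, `model_name`, `load`, `mode`, `log_dir`), and the single dummy variable stands in for a real model graph:

```python
import os
from types import SimpleNamespace

import tensorflow as tf

from basic.graph_handler import GraphHandler

# Hypothetical config carrying just the attributes GraphHandler reads.
config = SimpleNamespace(
    max_to_keep=3, save_dir="out/basic/00/save", model_name="basic",
    load=False, mode="train", log_dir="out/basic/00/log",
    dump_pickle=False, eval_dir="out/basic/00/eval", answer_dir="out/basic/00/answer",
)
os.makedirs(config.save_dir, exist_ok=True)

_ = tf.get_variable("w", shape=[1])       # stand-in for the real model's variables
model = SimpleNamespace(var_ema=None)     # only consulted when config.load_ema is set

handler = GraphHandler(config, model)
with tf.Session() as sess:
    handler.initialize(sess)              # init variables, optionally restore, open summary writer
    # ... run training steps, calling handler.add_summary(summary, step) along the way ...
    handler.save(sess, global_step=1000)  # writes out/basic/00/save/basic-1000.*
```
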
/tensorflow/SQuAD/squad/evaluate-v1.1.py:
--------------------------------------------------------------------------------
1 | """ Official evaluation script for v1.1 of the SQuAD dataset. """
2 | from __future__ import print_function
3 | from collections import Counter
4 | import string
5 | import re
6 | import argparse
7 | import json
8 | import sys
9 |
10 |
11 | def normalize_answer(s):
12 | """Lower text and remove punctuation, articles and extra whitespace."""
13 | def remove_articles(text):
14 | return re.sub(r'\b(a|an|the)\b', ' ', text)
15 |
16 | def white_space_fix(text):
17 | return ' '.join(text.split())
18 |
19 | def remove_punc(text):
20 | exclude = set(string.punctuation)
21 | return ''.join(ch for ch in text if ch not in exclude)
22 |
23 | def lower(text):
24 | return text.lower()
25 |
26 | return white_space_fix(remove_articles(remove_punc(lower(s))))
27 |
28 |
29 | def f1_score(prediction, ground_truth):
30 | prediction_tokens = normalize_answer(prediction).split()
31 | ground_truth_tokens = normalize_answer(ground_truth).split()
32 | common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
33 | num_same = sum(common.values())
34 | if num_same == 0:
35 | return 0
36 | precision = 1.0 * num_same / len(prediction_tokens)
37 | recall = 1.0 * num_same / len(ground_truth_tokens)
38 | f1 = (2 * precision * recall) / (precision + recall)
39 | return f1
40 |
41 |
42 | def exact_match_score(prediction, ground_truth):
43 | return (normalize_answer(prediction) == normalize_answer(ground_truth))
44 |
45 |
46 | def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
47 | scores_for_ground_truths = []
48 | for ground_truth in ground_truths:
49 | score = metric_fn(prediction, ground_truth)
50 | scores_for_ground_truths.append(score)
51 | return max(scores_for_ground_truths)
52 |
53 |
54 | def evaluate(dataset, predictions):
55 | f1 = exact_match = total = 0
56 | for article in dataset:
57 | for paragraph in article['paragraphs']:
58 | for qa in paragraph['qas']:
59 | total += 1
60 | if qa['id'] not in predictions:
61 | message = 'Unanswered question ' + qa['id'] + \
62 | ' will receive score 0.'
63 | print(message, file=sys.stderr)
64 | continue
65 | ground_truths = list(map(lambda x: x['text'], qa['answers']))
66 | prediction = predictions[qa['id']]
67 | exact_match += metric_max_over_ground_truths(
68 | exact_match_score, prediction, ground_truths)
69 | f1 += metric_max_over_ground_truths(
70 | f1_score, prediction, ground_truths)
71 |
72 | exact_match = 100.0 * exact_match / total
73 | f1 = 100.0 * f1 / total
74 |
75 | return {'exact_match': exact_match, 'f1': f1}
76 |
77 |
78 | if __name__ == '__main__':
79 | expected_version = '1.1'
80 | parser = argparse.ArgumentParser(
81 | description='Evaluation for SQuAD ' + expected_version)
82 | parser.add_argument('dataset_file', help='Dataset file')
83 | parser.add_argument('prediction_file', help='Prediction File')
84 | args = parser.parse_args()
85 | with open(args.dataset_file) as dataset_file:
86 | dataset_json = json.load(dataset_file)
87 | if (dataset_json['version'] != expected_version):
88 | print('Evaluation expects v-' + expected_version +
89 | ', but got dataset with v-' + dataset_json['version'],
90 | file=sys.stderr)
91 | dataset = dataset_json['data']
92 | with open(args.prediction_file) as prediction_file:
93 | predictions = json.load(prediction_file)
94 | print(json.dumps(evaluate(dataset, predictions)))
95 |
--------------------------------------------------------------------------------
/tensorflow/SQuAD/squad/evaluate.py:
--------------------------------------------------------------------------------
1 | """ Official evaluation script for v1.1 of the SQuAD dataset. [Changed name for external importing]"""
2 | from __future__ import print_function
3 | from collections import Counter
4 | import string
5 | import re
6 | import argparse
7 | import json
8 | import sys
9 |
10 |
11 | def normalize_answer(s):
12 | """Lower text and remove punctuation, articles and extra whitespace."""
13 | def remove_articles(text):
14 | return re.sub(r'\b(a|an|the)\b', ' ', text)
15 |
16 | def white_space_fix(text):
17 | return ' '.join(text.split())
18 |
19 | def remove_punc(text):
20 | exclude = set(string.punctuation)
21 | return ''.join(ch for ch in text if ch not in exclude)
22 |
23 | def lower(text):
24 | return text.lower()
25 |
26 | return white_space_fix(remove_articles(remove_punc(lower(s))))
27 |
28 |
29 | def f1_score(prediction, ground_truth):
30 | prediction_tokens = normalize_answer(prediction).split()
31 | ground_truth_tokens = normalize_answer(ground_truth).split()
32 | common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
33 | num_same = sum(common.values())
34 | if num_same == 0:
35 | return 0
36 | precision = 1.0 * num_same / len(prediction_tokens)
37 | recall = 1.0 * num_same / len(ground_truth_tokens)
38 | f1 = (2 * precision * recall) / (precision + recall)
39 | return f1
40 |
41 |
42 | def exact_match_score(prediction, ground_truth):
43 | return (normalize_answer(prediction) == normalize_answer(ground_truth))
44 |
45 |
46 | def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
47 | scores_for_ground_truths = []
48 | for ground_truth in ground_truths:
49 | score = metric_fn(prediction, ground_truth)
50 | scores_for_ground_truths.append(score)
51 | return max(scores_for_ground_truths)
52 |
53 |
54 | def evaluate(dataset, predictions):
55 | f1 = exact_match = total = 0
56 | for article in dataset:
57 | for paragraph in article['paragraphs']:
58 | for qa in paragraph['qas']:
59 | total += 1
60 | if qa['id'] not in predictions:
61 | message = 'Unanswered question ' + qa['id'] + \
62 | ' will receive score 0.'
63 | print(message, file=sys.stderr)
64 | continue
65 | ground_truths = list(map(lambda x: x['text'], qa['answers']))
66 | prediction = predictions[qa['id']]
67 | exact_match += metric_max_over_ground_truths(
68 | exact_match_score, prediction, ground_truths)
69 | f1 += metric_max_over_ground_truths(
70 | f1_score, prediction, ground_truths)
71 |
72 | exact_match = 100.0 * exact_match / total
73 | f1 = 100.0 * f1 / total
74 |
75 | return {'exact_match': exact_match, 'f1': f1}
76 |
77 |
78 | if __name__ == '__main__':
79 | expected_version = '1.1'
80 | parser = argparse.ArgumentParser(
81 | description='Evaluation for SQuAD ' + expected_version)
82 | parser.add_argument('dataset_file', help='Dataset file')
83 | parser.add_argument('prediction_file', help='Prediction File')
84 | args = parser.parse_args()
85 | with open(args.dataset_file) as dataset_file:
86 | dataset_json = json.load(dataset_file)
87 | if (dataset_json['version'] != expected_version):
88 | print('Evaluation expects v-' + expected_version +
89 | ', but got dataset with v-' + dataset_json['version'],
90 | file=sys.stderr)
91 | dataset = dataset_json['data']
92 | with open(args.prediction_file) as prediction_file:
93 | predictions = json.load(prediction_file)
94 | print(json.dumps(evaluate(dataset, predictions)))
95 |
--------------------------------------------------------------------------------
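
The metric functions above are self-contained, so a quick sanity check on toy strings (not part of the repository) shows how normalization, exact match, and F1 interact:

```python
from squad.evaluate import exact_match_score, f1_score, metric_max_over_ground_truths

prediction = "the Eiffel Tower"
ground_truths = ["Eiffel Tower", "The Eiffel Tower, Paris"]

# Case, punctuation, and articles are stripped before comparison, so the first
# ground truth is an exact match after normalization.
print(metric_max_over_ground_truths(exact_match_score, prediction, ground_truths))  # True
print(metric_max_over_ground_truths(f1_score, prediction, ground_truths))           # 1.0
```
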
/tensorflow/SQuAD/my/nltk_utils.py:
--------------------------------------------------------------------------------
1 | import nltk
2 | import numpy as np
3 |
4 |
5 | def _set_span(t, i):
6 | if isinstance(t[0], str):
7 | t.span = (i, i+len(t))
8 | else:
9 | first = True
10 | for c in t:
11 | cur_span = _set_span(c, i)
12 | i = cur_span[1]
13 | if first:
14 | min_ = cur_span[0]
15 | first = False
16 | max_ = cur_span[1]
17 | t.span = (min_, max_)
18 | return t.span
19 |
20 |
21 | def set_span(t):
22 | assert isinstance(t, nltk.tree.Tree)
23 | try:
24 | return _set_span(t, 0)
25 | except:
26 | print(t)
27 | exit()
28 |
29 |
30 | def tree_contains_span(tree, span):
31 | """
32 | Assumes that tree span has been set with set_span
33 | Returns true if any subtree of t has exact span as the given span
34 | :param t:
35 | :param span:
36 | :return bool:
37 | """
38 | return span in set(t.span for t in tree.subtrees())
39 |
40 |
41 | def span_len(span):
42 | return span[1] - span[0]
43 |
44 |
45 | def span_overlap(s1, s2):
46 | start = max(s1[0], s2[0])
47 | stop = min(s1[1], s2[1])
48 | if stop > start:
49 | return start, stop
50 | return None
51 |
52 |
53 | def span_prec(true_span, pred_span):
54 | overlap = span_overlap(true_span, pred_span)
55 | if overlap is None:
56 | return 0
57 | return span_len(overlap) / span_len(pred_span)
58 |
59 |
60 | def span_recall(true_span, pred_span):
61 | overlap = span_overlap(true_span, pred_span)
62 | if overlap is None:
63 | return 0
64 | return span_len(overlap) / span_len(true_span)
65 |
66 |
67 | def span_f1(true_span, pred_span):
68 | p = span_prec(true_span, pred_span)
69 | r = span_recall(true_span, pred_span)
70 | if p == 0 or r == 0:
71 | return 0.0
72 | return 2 * p * r / (p + r)
73 |
74 |
75 | def find_max_f1_span(tree, span):
76 | return find_max_f1_subtree(tree, span).span
77 |
78 |
79 | def find_max_f1_subtree(tree, span):
80 | return max(((t, span_f1(span, t.span)) for t in tree.subtrees()), key=lambda p: p[1])[0]
81 |
82 |
83 | def tree2matrix(tree, node2num, row_size=None, col_size=None, dtype='int32'):
84 | set_span(tree)
85 | D = tree.height() - 1
86 | B = len(tree.leaves())
87 | row_size = row_size or D
88 | col_size = col_size or B
89 | matrix = np.zeros([row_size, col_size], dtype=dtype)
90 | mask = np.zeros([row_size, col_size, col_size], dtype='bool')
91 |
92 | for subtree in tree.subtrees():
93 | row = subtree.height() - 2
94 | col = subtree.span[0]
95 | matrix[row, col] = node2num(subtree)
96 | for subsub in subtree.subtrees():
97 | if isinstance(subsub, nltk.tree.Tree):
98 | mask[row, col, subsub.span[0]] = True
99 | if not isinstance(subsub[0], nltk.tree.Tree):
100 | c = subsub.span[0]
101 | for r in range(row):
102 | mask[r, c, c] = True
103 | else:
104 | mask[row, col, col] = True
105 |
106 | return matrix, mask
107 |
108 |
109 | def load_compressed_tree(s):
110 |
111 | def compress_tree(tree):
112 | assert not isinstance(tree, str)
113 | if len(tree) == 1:
114 | if isinstance(tree[0], nltk.tree.Tree):
115 | return compress_tree(tree[0])
116 | else:
117 | return tree
118 | else:
119 | for i, t in enumerate(tree):
120 | if isinstance(t, nltk.tree.Tree):
121 | tree[i] = compress_tree(t)
122 | else:
123 | tree[i] = t
124 | return tree
125 |
126 | return compress_tree(nltk.tree.Tree.fromstring(s))
127 |
128 |
129 |
130 |
--------------------------------------------------------------------------------
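
The span metrics at the bottom of the file operate on plain `(start, stop)` token-index tuples, so they can be exercised without building a parse tree. A small hand-checked example:

```python
from my.nltk_utils import span_overlap, span_prec, span_recall, span_f1

true_span = (0, 5)   # tokens 0..4
pred_span = (2, 7)   # tokens 2..6

print(span_overlap(true_span, pred_span))   # (2, 5) -> 3 overlapping tokens
print(span_prec(true_span, pred_span))      # 3 / 5 = 0.6
print(span_recall(true_span, pred_span))    # 3 / 5 = 0.6
print(round(span_f1(true_span, pred_span), 6))  # 0.6
```
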
/tensorflow/SQuAD/basic/ensemble.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import functools
3 | import gzip
4 | import json
5 | import pickle
6 | from collections import defaultdict
7 | from operator import mul
8 |
9 | from tqdm import tqdm
10 | from squad.utils import get_phrase, get_best_span, get_span_score_pairs
11 |
12 |
13 | def get_args():
14 | parser = argparse.ArgumentParser()
15 | parser.add_argument('paths', nargs='+')
16 | parser.add_argument('-o', '--out', default='ensemble.json')
17 | parser.add_argument("--data_path", default="data/squad/data_test.json")
18 | parser.add_argument("--shared_path", default="data/squad/shared_test.json")
19 | args = parser.parse_args()
20 | return args
21 |
22 |
23 | def ensemble(args):
24 | e_list = []
25 | for path in tqdm(args.paths):
26 | with gzip.open(path, 'r') as fh:
27 | e = pickle.load(fh)
28 | e_list.append(e)
29 |
30 | with open(args.data_path, 'r') as fh:
31 | data = json.load(fh)
32 |
33 | with open(args.shared_path, 'r') as fh:
34 | shared = json.load(fh)
35 |
36 | out = {}
37 | for idx, (id_, rx) in tqdm(enumerate(zip(data['ids'], data['*x'])), total=len(e['yp'])):
38 | if idx >= len(e['yp']):
39 | # for debugging purpose
40 | break
41 | context = shared['p'][rx[0]][rx[1]]
42 | wordss = shared['x'][rx[0]][rx[1]]
43 | yp_list = [e['yp'][idx] for e in e_list]
44 | yp2_list = [e['yp2'][idx] for e in e_list]
45 | answer = ensemble4(context, wordss, yp_list, yp2_list)
46 | out[id_] = answer
47 |
48 | with open(args.out, 'w') as fh:
49 | json.dump(out, fh)
50 |
51 |
52 | def ensemble1(context, wordss, y1_list, y2_list):
53 | """
54 |
55 | :param context: Original context
56 | :param wordss: tokenized words (nested 2D list)
57 |     :param y1_list: list of start index probs (each element corresponds to probs from a single model)
58 | :param y2_list: list of stop index probs
59 | :return:
60 | """
61 | sum_y1 = combine_y_list(y1_list)
62 | sum_y2 = combine_y_list(y2_list)
63 | span, score = get_best_span(sum_y1, sum_y2)
64 | return get_phrase(context, wordss, span)
65 |
66 |
67 | def ensemble2(context, wordss, y1_list, y2_list):
68 | start_dict = defaultdict(float)
69 | stop_dict = defaultdict(float)
70 | for y1, y2 in zip(y1_list, y2_list):
71 | span, score = get_best_span(y1, y2)
72 | start_dict[span[0]] += y1[span[0][0]][span[0][1]]
73 | stop_dict[span[1]] += y2[span[1][0]][span[1][1]]
74 | start = max(start_dict.items(), key=lambda pair: pair[1])[0]
75 | stop = max(stop_dict.items(), key=lambda pair: pair[1])[0]
76 | best_span = (start, stop)
77 | return get_phrase(context, wordss, best_span)
78 |
79 |
80 | def ensemble3(context, wordss, y1_list, y2_list):
81 | d = defaultdict(float)
82 | for y1, y2 in zip(y1_list, y2_list):
83 | span, score = get_best_span(y1, y2)
84 | phrase = get_phrase(context, wordss, span)
85 | d[phrase] += score
86 | return max(d.items(), key=lambda pair: pair[1])[0]
87 |
88 |
89 | def ensemble4(context, wordss, y1_list, y2_list):
90 | d = defaultdict(lambda: 0.0)
91 | for y1, y2 in zip(y1_list, y2_list):
92 | for span, score in get_span_score_pairs(y1, y2):
93 | d[span] += score
94 | span = max(d.items(), key=lambda pair: pair[1])[0]
95 | phrase = get_phrase(context, wordss, span)
96 | return phrase
97 |
98 |
99 | def combine_y_list(y_list, op='*'):
100 | if op == '+':
101 | func = sum
102 | elif op == '*':
103 | def func(l): return functools.reduce(mul, l)
104 | else:
105 | func = op
106 | return [[func(yij_list) for yij_list in zip(*yi_list)] for yi_list in zip(*y_list)]
107 |
108 |
109 | def main():
110 | args = get_args()
111 | ensemble(args)
112 |
113 | if __name__ == "__main__":
114 | main()
115 |
116 |
117 |
--------------------------------------------------------------------------------
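
`combine_y_list` merges per-model probability grids element-wise (product by default, sum with `op='+'`). A toy example with two "models" and one two-word sentence, just to show the shapes involved:

```python
from basic.ensemble import combine_y_list

# Each y is [num_sentences][num_words] of start (or stop) probabilities.
y_model_a = [[0.25, 0.75]]
y_model_b = [[0.50, 0.50]]

print(combine_y_list([y_model_a, y_model_b]))           # [[0.125, 0.375]] (element-wise product)
print(combine_y_list([y_model_a, y_model_b], op='+'))   # [[0.75, 1.25]]   (element-wise sum)
```
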
/pytorch/CIFAR10/benchmark/cifar10/models/densenet.py:
--------------------------------------------------------------------------------
1 | '''DenseNet in PyTorch.'''
2 | import math
3 |
4 | import torch
5 | import torch.nn as nn
6 | import torch.nn.functional as F
7 |
8 |
9 | class Bottleneck(nn.Module):
10 | def __init__(self, in_planes, growth_rate):
11 | super(Bottleneck, self).__init__()
12 | self.bn1 = nn.BatchNorm2d(in_planes)
13 | self.conv1 = nn.Conv2d(in_planes, 4 * growth_rate, kernel_size=1, bias=False)
14 | self.bn2 = nn.BatchNorm2d(4 * growth_rate)
15 | self.conv2 = nn.Conv2d(4 * growth_rate, growth_rate, kernel_size=3, padding=1, bias=False)
16 |
17 | def forward(self, x):
18 | out = self.conv1(F.relu(self.bn1(x)))
19 | out = self.conv2(F.relu(self.bn2(out)))
20 | out = torch.cat([out, x], 1)
21 | return out
22 |
23 |
24 | class Transition(nn.Module):
25 | def __init__(self, in_planes, out_planes, last=False, pool_size=2):
26 | super(Transition, self).__init__()
27 | self.last = last
28 | self.pool_size = pool_size
29 | self.bn = nn.BatchNorm2d(in_planes)
30 | if not self.last:
31 | self.conv = nn.Conv2d(in_planes, out_planes, kernel_size=1, bias=False)
32 |
33 | def forward(self, x):
34 | out = F.relu(self.bn(x))
35 | if not self.last:
36 | out = self.conv(out)
37 | out = F.avg_pool2d(out, self.pool_size)
38 | return out
39 |
40 |
41 | class DenseNet(nn.Module):
42 | def __init__(self, block, nblocks, growth_rate=12, reduction=0.5, num_classes=10):
43 | super(DenseNet, self).__init__()
44 | # TODO: Add drop for CIFAR10 without data augmentation
45 | self.growth_rate = growth_rate
46 |
47 | num_planes = 2 * growth_rate
48 | self.conv1 = nn.Conv2d(3, num_planes, kernel_size=3, padding=1, bias=False)
49 |
50 | self.dense1 = self._make_dense_layers(block, num_planes, nblocks[0])
51 | num_planes += nblocks[0] * growth_rate
52 | out_planes = int(math.floor(num_planes*reduction))
53 | self.trans1 = Transition(num_planes, out_planes)
54 | num_planes = out_planes
55 |
56 | self.dense2 = self._make_dense_layers(block, num_planes, nblocks[1])
57 | num_planes += nblocks[1] * growth_rate
58 | out_planes = int(math.floor(num_planes*reduction))
59 | self.trans2 = Transition(num_planes, out_planes)
60 | num_planes = out_planes
61 |
62 | self.dense3 = self._make_dense_layers(block, num_planes, nblocks[2])
63 | num_planes += nblocks[2] * growth_rate
64 | self.trans3 = Transition(num_planes, num_planes, last=True, pool_size=8)
65 |
66 | self.linear = nn.Linear(num_planes, num_classes)
67 |
68 | for m in self.modules():
69 | if isinstance(m, nn.Conv2d):
70 | n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
71 | m.weight.data.normal_(0, math.sqrt(2. / n))
72 | elif isinstance(m, nn.BatchNorm2d):
73 | m.weight.data.fill_(1)
74 | m.bias.data.zero_()
75 |
76 | def _make_dense_layers(self, block, in_planes, nblock):
77 | layers = []
78 | for i in range(nblock):
79 | layers.append(block(in_planes, self.growth_rate))
80 | in_planes += self.growth_rate
81 | return nn.Sequential(*layers)
82 |
83 | def forward(self, x):
84 | out = self.conv1(x)
85 | out = self.trans1(self.dense1(out))
86 | out = self.trans2(self.dense2(out))
87 | out = self.trans3(self.dense3(out))
88 | out = out.view(out.size(0), -1)
89 | out = self.linear(out)
90 | return out
91 |
92 |
93 | def DenseNetBC(L, k):
94 | assert (L - 4) % 6 == 0
95 | num_blocks = int((L - 4) / 6)
96 | return DenseNet(Bottleneck, [num_blocks] * 3, growth_rate=k, reduction=0.5)
97 |
98 |
99 | def DenseNetBC100():
100 | return DenseNetBC(100, 12)
101 |
102 |
103 | def DenseNetBC250():
104 | return DenseNetBC(250, 24)
105 |
106 |
107 | def DenseNetBC190():
108 | return DenseNetBC(190, 40)
109 |
--------------------------------------------------------------------------------
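
A quick shape check of the DenseNet-BC constructors on a CIFAR-sized batch (illustrative only; assumes the `benchmark` package is importable and a reasonably recent PyTorch):

```python
import torch

from benchmark.cifar10.models.densenet import DenseNetBC100

model = DenseNetBC100()            # depth 100, growth rate 12
x = torch.randn(2, 3, 32, 32)      # two CIFAR-10 sized images
with torch.no_grad():
    out = model(x)
print(out.shape)                   # torch.Size([2, 10]) - one logit per class
```
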
/tensorflow/SQuAD/my/tensorflow/rnn.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | from tensorflow.python.ops.rnn import dynamic_rnn as _dynamic_rnn, \
3 | bidirectional_dynamic_rnn as _bidirectional_dynamic_rnn
4 |
5 | from my.tensorflow import flatten, reconstruct
6 |
7 |
8 | def dynamic_rnn(cell, inputs, sequence_length=None, initial_state=None,
9 | dtype=None, parallel_iterations=None, swap_memory=False,
10 | time_major=False, scope=None):
11 | assert not time_major # TODO : to be implemented later!
12 | flat_inputs = flatten(inputs, 2) # [-1, J, d]
13 | flat_len = None if sequence_length is None else tf.cast(flatten(sequence_length, 0), 'int64')
14 |
15 | flat_outputs, final_state = _dynamic_rnn(cell, flat_inputs, sequence_length=flat_len,
16 | initial_state=initial_state, dtype=dtype,
17 | parallel_iterations=parallel_iterations, swap_memory=swap_memory,
18 | time_major=time_major, scope=scope)
19 |
20 | outputs = reconstruct(flat_outputs, inputs, 2)
21 | return outputs, final_state
22 |
23 |
24 | def bw_dynamic_rnn(cell, inputs, sequence_length=None, initial_state=None,
25 | dtype=None, parallel_iterations=None, swap_memory=False,
26 | time_major=False, scope=None):
27 | assert not time_major # TODO : to be implemented later!
28 |
29 | flat_inputs = flatten(inputs, 2) # [-1, J, d]
30 | flat_len = None if sequence_length is None else tf.cast(flatten(sequence_length, 0), 'int64')
31 |
32 | flat_inputs = tf.reverse(flat_inputs, 1) if sequence_length is None \
33 | else tf.reverse_sequence(flat_inputs, sequence_length, 1)
34 | flat_outputs, final_state = _dynamic_rnn(cell, flat_inputs, sequence_length=flat_len,
35 | initial_state=initial_state, dtype=dtype,
36 | parallel_iterations=parallel_iterations, swap_memory=swap_memory,
37 | time_major=time_major, scope=scope)
38 | flat_outputs = tf.reverse(flat_outputs, 1) if sequence_length is None \
39 | else tf.reverse_sequence(flat_outputs, sequence_length, 1)
40 |
41 | outputs = reconstruct(flat_outputs, inputs, 2)
42 | return outputs, final_state
43 |
44 |
45 | def bidirectional_dynamic_rnn(cell_fw, cell_bw, inputs, sequence_length=None,
46 | initial_state_fw=None, initial_state_bw=None,
47 | dtype=None, parallel_iterations=None,
48 | swap_memory=False, time_major=False, scope=None):
49 | assert not time_major
50 |
51 | flat_inputs = flatten(inputs, 2) # [-1, J, d]
52 | flat_len = None if sequence_length is None else tf.cast(flatten(sequence_length, 0), 'int64')
53 |
54 | (flat_fw_outputs, flat_bw_outputs), final_state = \
55 | _bidirectional_dynamic_rnn(cell_fw, cell_bw, flat_inputs, sequence_length=flat_len,
56 | initial_state_fw=initial_state_fw, initial_state_bw=initial_state_bw,
57 | dtype=dtype, parallel_iterations=parallel_iterations, swap_memory=swap_memory,
58 | time_major=time_major, scope=scope)
59 |
60 | fw_outputs = reconstruct(flat_fw_outputs, inputs, 2)
61 | bw_outputs = reconstruct(flat_bw_outputs, inputs, 2)
62 | # FIXME : final state is not reshaped!
63 | return (fw_outputs, bw_outputs), final_state
64 |
65 |
66 | def bidirectional_rnn(cell_fw, cell_bw, inputs,
67 | initial_state_fw=None, initial_state_bw=None,
68 | dtype=None, sequence_length=None, scope=None):
69 |
70 | flat_inputs = flatten(inputs, 2) # [-1, J, d]
71 | flat_len = None if sequence_length is None else tf.cast(flatten(sequence_length, 0), 'int64')
72 |
73 | (flat_fw_outputs, flat_bw_outputs), final_state = \
74 | tf.nn.bidirectional_dynamic_rnn(cell_fw, cell_bw, flat_inputs, sequence_length=flat_len,
75 | initial_state_fw=initial_state_fw, initial_state_bw=initial_state_bw,
76 | dtype=dtype, scope=scope)
77 |
78 | fw_outputs = reconstruct(flat_fw_outputs, inputs, 2)
79 | bw_outputs = reconstruct(flat_bw_outputs, inputs, 2)
80 | # FIXME : final state is not reshaped!
81 | return (fw_outputs, bw_outputs), final_state
82 |
--------------------------------------------------------------------------------
/tensorflow/SQuAD/tree/visualizer.py:
--------------------------------------------------------------------------------
1 | import shutil
2 | from collections import OrderedDict
3 | import http.server
4 | import socketserver
5 | import argparse
6 | import json
7 | import os
8 | import numpy as np
9 | from tqdm import tqdm
10 |
11 | from jinja2 import Environment, FileSystemLoader
12 |
13 |
14 | def bool_(string):
15 | if string == 'True':
16 | return True
17 | elif string == 'False':
18 | return False
19 | else:
20 | raise Exception()
21 |
22 | def get_args():
23 | parser = argparse.ArgumentParser()
24 | parser.add_argument("--model_name", type=str, default='basic')
25 | parser.add_argument("--data_type", type=str, default='dev')
26 | parser.add_argument("--step", type=int, default=5000)
27 | parser.add_argument("--template_name", type=str, default="visualizer.html")
28 | parser.add_argument("--num_per_page", type=int, default=100)
29 | parser.add_argument("--data_dir", type=str, default="data/squad")
30 | parser.add_argument("--port", type=int, default=8000)
31 | parser.add_argument("--host", type=str, default="0.0.0.0")
32 | parser.add_argument("--open", type=str, default='False')
33 | parser.add_argument("--run_id", type=str, default="0")
34 |
35 | args = parser.parse_args()
36 | return args
37 |
38 |
39 | def _decode(decoder, sent):
40 | return " ".join(decoder[idx] for idx in sent)
41 |
42 |
43 | def accuracy2_visualizer(args):
44 | model_name = args.model_name
45 | data_type = args.data_type
46 | num_per_page = args.num_per_page
47 | data_dir = args.data_dir
48 | run_id = args.run_id.zfill(2)
49 | step = args.step
50 |
51 |     eval_path = os.path.join("out", model_name, run_id, "eval", "{}-{}.json".format(data_type, str(step).zfill(6)))
52 | eval_ = json.load(open(eval_path, 'r'))
53 |
54 | _id = 0
55 | html_dir = "/tmp/list_results%d" % _id
56 | while os.path.exists(html_dir):
57 | _id += 1
58 | html_dir = "/tmp/list_results%d" % _id
59 |
60 | if os.path.exists(html_dir):
61 | shutil.rmtree(html_dir)
62 | os.mkdir(html_dir)
63 |
64 | cur_dir = os.path.dirname(os.path.realpath(__file__))
65 | templates_dir = os.path.join(cur_dir, 'templates')
66 | env = Environment(loader=FileSystemLoader(templates_dir))
67 | env.globals.update(zip=zip, reversed=reversed)
68 | template = env.get_template(args.template_name)
69 |
70 | data_path = os.path.join(data_dir, "data_{}.json".format(data_type))
71 | shared_path = os.path.join(data_dir, "shared_{}.json".format(data_type))
72 | data = json.load(open(data_path, 'r'))
73 | shared = json.load(open(shared_path, 'r'))
74 |
75 | rows = []
76 | for i, (idx, yi, ypi) in enumerate(zip(*[eval_[key] for key in ('idxs', 'y', 'yp')])):
77 | id_, q, rx = (data[key][idx] for key in ('ids', 'q', '*x'))
78 | x = shared['x'][rx[0]][rx[1]]
79 | ques = [" ".join(q)]
80 | para = [[word for word in sent] for sent in x]
81 | row = {
82 | 'id': id_,
83 | 'title': "Hello world!",
84 | 'ques': ques,
85 | 'para': para,
86 | 'y': yi,
87 | 'y2': yi,
88 | 'yp': ypi,
89 | 'yp2': ypi,
90 | 'a': ""
91 | }
92 | rows.append(row)
93 |
94 | if i % num_per_page == 0:
95 | html_path = os.path.join(html_dir, "%s.html" % str(i).zfill(8))
96 |
97 | if (i + 1) % num_per_page == 0 or (i + 1) == len(eval_['y']):
98 | var_dict = {'title': "Accuracy Visualization",
99 | 'rows': rows
100 | }
101 | with open(html_path, "wb") as f:
102 | f.write(template.render(**var_dict).encode('UTF-8'))
103 | rows = []
104 |
105 | os.chdir(html_dir)
106 | port = args.port
107 | host = args.host
108 | # Overriding to suppress log message
109 | class MyHandler(http.server.SimpleHTTPRequestHandler):
110 | def log_message(self, format, *args):
111 | pass
112 | handler = MyHandler
113 | httpd = socketserver.TCPServer((host, port), handler)
114 | if args.open == 'True':
115 | os.system("open http://%s:%d" % (args.host, args.port))
116 | print("serving at %s:%d" % (host, port))
117 | httpd.serve_forever()
118 |
119 |
120 | if __name__ == "__main__":
121 | ARGS = get_args()
122 | accuracy2_visualizer(ARGS)
--------------------------------------------------------------------------------
/tensorflow/SQuAD/basic_cnn/visualizer.py:
--------------------------------------------------------------------------------
1 | import shutil
2 | from collections import OrderedDict
3 | import http.server
4 | import socketserver
5 | import argparse
6 | import json
7 | import os
8 | import numpy as np
9 | from tqdm import tqdm
10 |
11 | from jinja2 import Environment, FileSystemLoader
12 |
13 | from basic_cnn.evaluator import get_span_score_pairs, get_best_span
14 |
15 |
16 | def bool_(string):
17 | if string == 'True':
18 | return True
19 | elif string == 'False':
20 | return False
21 | else:
22 | raise Exception()
23 |
24 | def get_args():
25 | parser = argparse.ArgumentParser()
26 | parser.add_argument("--model_name", type=str, default='basic')
27 | parser.add_argument("--data_type", type=str, default='dev')
28 | parser.add_argument("--step", type=int, default=5000)
29 | parser.add_argument("--template_name", type=str, default="visualizer.html")
30 | parser.add_argument("--num_per_page", type=int, default=100)
31 | parser.add_argument("--data_dir", type=str, default="data/squad")
32 | parser.add_argument("--port", type=int, default=8000)
33 | parser.add_argument("--host", type=str, default="0.0.0.0")
34 | parser.add_argument("--open", type=str, default='False')
35 | parser.add_argument("--run_id", type=str, default="0")
36 |
37 | args = parser.parse_args()
38 | return args
39 |
40 |
41 | def _decode(decoder, sent):
42 | return " ".join(decoder[idx] for idx in sent)
43 |
44 |
45 | def accuracy2_visualizer(args):
46 | model_name = args.model_name
47 | data_type = args.data_type
48 | num_per_page = args.num_per_page
49 | data_dir = args.data_dir
50 | run_id = args.run_id.zfill(2)
51 | step = args.step
52 |
53 |     eval_path = os.path.join("out", model_name, run_id, "eval", "{}-{}.json".format(data_type, str(step).zfill(6)))
54 | print("loading {}".format(eval_path))
55 | eval_ = json.load(open(eval_path, 'r'))
56 |
57 | _id = 0
58 | html_dir = "/tmp/list_results%d" % _id
59 | while os.path.exists(html_dir):
60 | _id += 1
61 | html_dir = "/tmp/list_results%d" % _id
62 |
63 | if os.path.exists(html_dir):
64 | shutil.rmtree(html_dir)
65 | os.mkdir(html_dir)
66 |
67 | cur_dir = os.path.dirname(os.path.realpath(__file__))
68 | templates_dir = os.path.join(cur_dir, 'templates')
69 | env = Environment(loader=FileSystemLoader(templates_dir))
70 | env.globals.update(zip=zip, reversed=reversed)
71 | template = env.get_template(args.template_name)
72 |
73 | data_path = os.path.join(data_dir, "data_{}.json".format(data_type))
74 | shared_path = os.path.join(data_dir, "shared_{}.json".format(data_type))
75 | print("loading {}".format(data_path))
76 | data = json.load(open(data_path, 'r'))
77 | print("loading {}".format(shared_path))
78 | shared = json.load(open(shared_path, 'r'))
79 |
80 | rows = []
81 | for i, (idx, yi, ypi, yp2i) in tqdm(enumerate(zip(*[eval_[key] for key in ('idxs', 'y', 'yp', 'yp2')])), total=len(eval_['idxs'])):
82 | id_, q, rx, answers = (data[key][idx] for key in ('ids', 'q', '*x', 'answerss'))
83 | x = shared['x'][rx[0]][rx[1]]
84 | ques = [" ".join(q)]
85 | para = [[word for word in sent] for sent in x]
86 | span = get_best_span(ypi, yp2i)
87 | ap = get_segment(para, span)
88 | score = "{:.3f}".format(ypi[span[0][0]][span[0][1]] * yp2i[span[1][0]][span[1][1]-1])
89 |
90 | row = {
91 | 'id': id_,
92 | 'title': "Hello world!",
93 | 'ques': ques,
94 | 'para': para,
95 | 'y': yi[0][0],
96 | 'y2': yi[0][1],
97 | 'yp': ypi,
98 | 'yp2': yp2i,
99 | 'a': answers,
100 | 'ap': ap,
101 | 'score': score
102 | }
103 | rows.append(row)
104 |
105 | if i % num_per_page == 0:
106 | html_path = os.path.join(html_dir, "%s.html" % str(i).zfill(8))
107 |
108 | if (i + 1) % num_per_page == 0 or (i + 1) == len(eval_['y']):
109 | var_dict = {'title': "Accuracy Visualization",
110 | 'rows': rows
111 | }
112 | with open(html_path, "wb") as f:
113 | f.write(template.render(**var_dict).encode('UTF-8'))
114 | rows = []
115 |
116 | os.chdir(html_dir)
117 | port = args.port
118 | host = args.host
119 | # Overriding to suppress log message
120 | class MyHandler(http.server.SimpleHTTPRequestHandler):
121 | def log_message(self, format, *args):
122 | pass
123 | handler = MyHandler
124 | httpd = socketserver.TCPServer((host, port), handler)
125 | if args.open == 'True':
126 | os.system("open http://%s:%d" % (args.host, args.port))
127 | print("serving at %s:%d" % (host, port))
128 | httpd.serve_forever()
129 |
130 |
131 | def get_segment(para, span):
132 | return " ".join(para[span[0][0]][span[0][1]:span[1][1]])
133 |
134 |
135 | if __name__ == "__main__":
136 | ARGS = get_args()
137 | accuracy2_visualizer(ARGS)
--------------------------------------------------------------------------------
/tensorflow/SQuAD/squad/utils.py:
--------------------------------------------------------------------------------
1 | import re
2 | import numpy as np
3 |
4 |
5 | def get_2d_spans(text, tokenss):
6 | spanss = []
7 | cur_idx = 0
8 | for tokens in tokenss:
9 | spans = []
10 | for token in tokens:
11 | if text.find(token, cur_idx) < 0:
12 | print(tokens)
13 | print("{} {} {}".format(token, cur_idx, text))
14 | raise Exception()
15 | cur_idx = text.find(token, cur_idx)
16 | spans.append((cur_idx, cur_idx + len(token)))
17 | cur_idx += len(token)
18 | spanss.append(spans)
19 | return spanss
20 |
21 |
22 | def get_word_span(context, wordss, start, stop):
23 | spanss = get_2d_spans(context, wordss)
24 | idxs = []
25 | for sent_idx, spans in enumerate(spanss):
26 | for word_idx, span in enumerate(spans):
27 | if not (stop <= span[0] or start >= span[1]):
28 | idxs.append((sent_idx, word_idx))
29 |
30 | assert len(idxs) > 0, "{} {} {} {}".format(context, spanss, start, stop)
31 | return idxs[0], (idxs[-1][0], idxs[-1][1] + 1)
32 |
33 |
34 | def get_phrase(context, wordss, span):
35 | """
36 |     Obtain the phrase as a substring of context, given a word-level (start, stop) span
37 |     :param context:
38 |     :param wordss:
39 |     :param span: ((start_sent_idx, start_word_idx),
40 |                   (stop_sent_idx, stop_word_idx)) - word-level start/stop
41 | :return:
42 | """
43 | start, stop = span
44 | flat_start = get_flat_idx(wordss, start)
45 | flat_stop = get_flat_idx(wordss, stop)
46 | words = sum(wordss, [])
47 | char_idx = 0
48 | char_start, char_stop = None, None
49 | for word_idx, word in enumerate(words):
50 | char_idx = context.find(word, char_idx)
51 | assert char_idx >= 0
52 | if word_idx == flat_start:
53 | char_start = char_idx
54 | char_idx += len(word)
55 | if word_idx == flat_stop - 1:
56 | char_stop = char_idx
57 | assert char_start is not None
58 | assert char_stop is not None
59 | return context[char_start:char_stop]
60 |
61 |
62 | def get_flat_idx(wordss, idx):
63 | return sum(len(words) for words in wordss[:idx[0]]) + idx[1]
64 |
65 |
66 | def get_word_idx(context, wordss, idx):
67 | spanss = get_2d_spans(context, wordss)
68 | return spanss[idx[0]][idx[1]][0]
69 |
70 |
71 | def process_tokens(temp_tokens):
72 | tokens = []
73 | for token in temp_tokens:
74 | flag = False
75 | l = ("-", "\u2212", "\u2014", "\u2013", "/", "~", '"', "'", "\u201C", "\u2019", "\u201D", "\u2018", "\u00B0")
76 |         # \u2013 is en-dash. Used for number to number
77 | # l = ("-", "\u2212", "\u2014", "\u2013")
78 | # l = ("\u2013",)
79 | tokens.extend(re.split("([{}])".format("".join(l)), token))
80 | return tokens
81 |
82 |
83 | def get_best_span(ypi, yp2i):
84 | max_val = 0
85 | best_word_span = (0, 1)
86 | best_sent_idx = 0
87 | for f, (ypif, yp2if) in enumerate(zip(ypi, yp2i)):
88 | argmax_j1 = 0
89 | for j in range(len(ypif)):
90 | val1 = ypif[argmax_j1]
91 | if val1 < ypif[j]:
92 | val1 = ypif[j]
93 | argmax_j1 = j
94 |
95 | val2 = yp2if[j]
96 | if val1 * val2 > max_val:
97 | best_word_span = (argmax_j1, j)
98 | best_sent_idx = f
99 | max_val = val1 * val2
100 | return ((best_sent_idx, best_word_span[0]), (best_sent_idx, best_word_span[1] + 1)), float(max_val)
101 |
102 |
103 | def get_best_span_wy(wypi, th):
104 | chunk_spans = []
105 | scores = []
106 | chunk_start = None
107 | score = 0
108 | l = 0
109 | th = min(th, np.max(wypi))
110 | for f, wypif in enumerate(wypi):
111 | for j, wypifj in enumerate(wypif):
112 | if wypifj >= th:
113 | if chunk_start is None:
114 | chunk_start = f, j
115 | score += wypifj
116 | l += 1
117 | else:
118 | if chunk_start is not None:
119 | chunk_stop = f, j
120 | chunk_spans.append((chunk_start, chunk_stop))
121 | scores.append(score/l)
122 | score = 0
123 | l = 0
124 | chunk_start = None
125 | if chunk_start is not None:
126 | chunk_stop = f, j+1
127 | chunk_spans.append((chunk_start, chunk_stop))
128 | scores.append(score/l)
129 | score = 0
130 | l = 0
131 | chunk_start = None
132 |
133 | return max(zip(chunk_spans, scores), key=lambda pair: pair[1])
134 |
135 |
136 | def get_span_score_pairs(ypi, yp2i):
137 | span_score_pairs = []
138 | for f, (ypif, yp2if) in enumerate(zip(ypi, yp2i)):
139 | for j in range(len(ypif)):
140 | for k in range(j, len(yp2if)):
141 | span = ((f, j), (f, k+1))
142 | score = ypif[j] * yp2if[k]
143 | span_score_pairs.append((span, score))
144 | return span_score_pairs
145 |
146 |
147 |
--------------------------------------------------------------------------------
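
The helpers above move between word-level spans and character-level substrings of the original context. A toy, hand-checked walk-through of `get_best_span` and `get_phrase`:

```python
from squad.utils import get_best_span, get_phrase

context = "The cat sat on the mat"
wordss = [["The", "cat", "sat", "on", "the", "mat"]]  # one tokenized sentence

# Per-sentence, per-word start and stop probabilities (made-up numbers).
ypi = [[0.05, 0.60, 0.20, 0.05, 0.05, 0.05]]   # start: peaks at "cat"
yp2i = [[0.05, 0.05, 0.60, 0.10, 0.10, 0.10]]  # stop:  peaks at "sat"

span, score = get_best_span(ypi, yp2i)
print(span)                               # ((0, 1), (0, 3)) - words 1..2 of sentence 0
print(round(score, 2))                    # 0.36 = 0.6 * 0.6
print(get_phrase(context, wordss, span))  # cat sat
```
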
/tensorflow/CIFAR10/resnet/cifar_input.py:
--------------------------------------------------------------------------------
1 | # Copyright 2016 The TensorFlow Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 |
16 | """CIFAR dataset input module.
17 | """
18 |
19 | import tensorflow as tf
20 |
21 | def build_input(dataset, data_path, batch_size, mode, data_format):
22 | """Build CIFAR image and labels.
23 |
24 | Args:
25 | dataset: Either 'cifar10' or 'cifar100'.
26 | data_path: Filename for data.
27 | batch_size: Input batch size.
28 | mode: Either 'train' or 'eval'.
29 | data_format: Either 'NCHW' or 'NHWC'.
30 | Returns:
31 | images: Batches of images. [batch_size, image_size, image_size, 3]
32 | labels: Batches of labels. [batch_size, num_classes]
33 | Raises:
34 | ValueError: when the specified dataset is not supported.
35 | """
36 | with tf.device('/cpu:0'):
37 | image_size = 32
38 | if dataset == 'cifar10':
39 | label_bytes = 1
40 | label_offset = 0
41 | num_classes = 10
42 | elif dataset == 'cifar100':
43 | label_bytes = 1
44 | label_offset = 1
45 | num_classes = 100
46 | else:
47 |       raise ValueError('Not supported dataset %s' % dataset)
48 |
49 | depth = 3
50 | image_bytes = image_size * image_size * depth
51 | record_bytes = label_bytes + label_offset + image_bytes
52 |
53 | data_files = tf.gfile.Glob(data_path)
54 | file_queue = tf.train.string_input_producer(data_files, shuffle=True)
55 | # Read examples from files in the filename queue.
56 | reader = tf.FixedLengthRecordReader(record_bytes=record_bytes)
57 | _, value = reader.read(file_queue)
58 |
59 | # Convert these examples to dense labels and processed images.
60 | record = tf.reshape(tf.decode_raw(value, tf.uint8), [record_bytes])
61 | label = tf.cast(tf.slice(record, [label_offset], [label_bytes]), tf.int32)
62 |     # Convert from a string to [depth * height * width], then to [depth, height, width].
63 | depth_major = tf.reshape(tf.slice(record, [label_bytes], [image_bytes]),
64 | [depth, image_size, image_size])
65 | # Convert from [depth, height, width] to [height, width, depth].
66 | image = tf.cast(tf.transpose(depth_major, [1, 2, 0]), tf.float32)
67 |
68 | if mode == 'train':
69 | image = tf.image.resize_image_with_crop_or_pad(
70 | image, image_size+4, image_size+4)
71 | image = tf.random_crop(image, [image_size, image_size, 3])
72 | image = tf.image.random_flip_left_right(image)
73 |       # Brightness/saturation/contrast provides small gains .2%~.5% on cifar.
74 | # image = tf.image.random_brightness(image, max_delta=63. / 255.)
75 | # image = tf.image.random_saturation(image, lower=0.5, upper=1.5)
76 | # image = tf.image.random_contrast(image, lower=0.2, upper=1.8)
77 | image = tf.image.per_image_standardization(image)
78 |
79 | example_queue = tf.RandomShuffleQueue(
80 | capacity=16 * batch_size,
81 | min_after_dequeue=8 * batch_size,
82 | dtypes=[tf.float32, tf.int32],
83 | shapes=[[image_size, image_size, depth], [1]])
84 | num_threads = 16
85 | else:
86 | image = tf.image.resize_image_with_crop_or_pad(
87 | image, image_size, image_size)
88 | image = tf.image.per_image_standardization(image)
89 |
90 | example_queue = tf.FIFOQueue(
91 | 3 * batch_size,
92 | dtypes=[tf.float32, tf.int32],
93 | shapes=[[image_size, image_size, depth], [1]])
94 | num_threads = 1
95 |
96 | example_enqueue_op = example_queue.enqueue([image, label])
97 | tf.train.add_queue_runner(tf.train.queue_runner.QueueRunner(
98 | example_queue, [example_enqueue_op] * num_threads))
99 |
100 | # Read 'batch' labels + images from the example queue.
101 | images, labels = example_queue.dequeue_many(batch_size)
102 | labels = tf.reshape(labels, [batch_size, 1])
103 | indices = tf.reshape(tf.range(0, batch_size, 1), [batch_size, 1])
104 | labels = tf.sparse_to_dense(
105 | tf.concat(values=[indices, labels], axis=1),
106 | [batch_size, num_classes], 1.0, 0.0)
107 |
108 | if data_format == 'NCHW':
109 | images = tf.transpose(images, [0, 3, 1, 2])
110 |
111 | assert len(images.get_shape()) == 4
112 | assert images.get_shape()[0] == batch_size
113 | if data_format == 'NCHW':
114 | assert images.get_shape()[1] == 3
115 | else:
116 | assert images.get_shape()[-1] == 3
117 | assert len(labels.get_shape()) == 2
118 | assert labels.get_shape()[0] == batch_size
119 | assert labels.get_shape()[1] == num_classes
120 |
121 | return images, labels
122 |
--------------------------------------------------------------------------------
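
`build_input` returns queue-backed tensors, so under TF 1.x they are consumed by starting the queue runners before calling `session.run`. A minimal consumption sketch, assuming the CIFAR-10 binaries sit under `cifar10/` as in the README above and `resnet/` is importable:

```python
import tensorflow as tf

from resnet import cifar_input

images, labels = cifar_input.build_input(
    'cifar10', 'cifar10/data_batch*', batch_size=128, mode='train', data_format='NHWC')

with tf.Session() as sess:
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)  # feed the input queues
    image_batch, label_batch = sess.run([images, labels])
    print(image_batch.shape, label_batch.shape)  # (128, 32, 32, 3) (128, 10)
    coord.request_stop()
    coord.join(threads)
```
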
/tensorflow/SQuAD/basic/visualizer.py:
--------------------------------------------------------------------------------
1 | import shutil
2 | from collections import OrderedDict
3 | import http.server
4 | import socketserver
5 | import argparse
6 | import json
7 | import os
8 | import numpy as np
9 | from tqdm import tqdm
10 | import pickle
11 | import gzip
12 |
13 | from jinja2 import Environment, FileSystemLoader
14 |
15 | from squad.utils import get_best_span, get_best_span_wy
16 |
17 |
18 | def bool_(string):
19 | if string == 'True':
20 | return True
21 | elif string == 'False':
22 | return False
23 | else:
24 | raise Exception()
25 |
26 | def get_args():
27 | parser = argparse.ArgumentParser()
28 | parser.add_argument("--model_name", type=str, default='basic')
29 | parser.add_argument("--data_type", type=str, default='dev')
30 | parser.add_argument("--step", type=int, default=5000)
31 | parser.add_argument("--template_name", type=str, default="visualizer.html")
32 | parser.add_argument("--num_per_page", type=int, default=100)
33 | parser.add_argument("--data_dir", type=str, default="data/squad")
34 | parser.add_argument("--port", type=int, default=8000)
35 | parser.add_argument("--host", type=str, default="0.0.0.0")
36 | parser.add_argument("--open", type=str, default='False')
37 | parser.add_argument("--run_id", type=str, default="0")
38 | parser.add_argument("-w", "--wy", action='store_true')
39 |
40 | args = parser.parse_args()
41 | return args
42 |
43 |
44 | def _decode(decoder, sent):
45 | return " ".join(decoder[idx] for idx in sent)
46 |
47 |
48 | def accuracy2_visualizer(args):
49 | model_name = args.model_name
50 | data_type = args.data_type
51 | num_per_page = args.num_per_page
52 | data_dir = args.data_dir
53 | run_id = args.run_id.zfill(2)
54 | step = args.step
55 |
56 |     eval_path = os.path.join("out", model_name, run_id, "eval", "{}-{}.pklz".format(data_type, str(step).zfill(6)))
57 | print("loading {}".format(eval_path))
58 | eval_ = pickle.load(gzip.open(eval_path, 'r'))
59 |
60 | _id = 0
61 | html_dir = "/tmp/list_results%d" % _id
62 | while os.path.exists(html_dir):
63 | _id += 1
64 | html_dir = "/tmp/list_results%d" % _id
65 |
66 | if os.path.exists(html_dir):
67 | shutil.rmtree(html_dir)
68 | os.mkdir(html_dir)
69 |
70 | cur_dir = os.path.dirname(os.path.realpath(__file__))
71 | templates_dir = os.path.join(cur_dir, 'templates')
72 | env = Environment(loader=FileSystemLoader(templates_dir))
73 | env.globals.update(zip=zip, reversed=reversed)
74 | template = env.get_template(args.template_name)
75 |
76 | data_path = os.path.join(data_dir, "data_{}.json".format(data_type))
77 | shared_path = os.path.join(data_dir, "shared_{}.json".format(data_type))
78 | print("loading {}".format(data_path))
79 | data = json.load(open(data_path, 'r'))
80 | print("loading {}".format(shared_path))
81 | shared = json.load(open(shared_path, 'r'))
82 |
83 | rows = []
84 | for i, (idx, yi, ypi, yp2i, wypi) in tqdm(enumerate(zip(*[eval_[key] for key in ('idxs', 'y', 'yp', 'yp2', 'wyp')])), total=len(eval_['idxs'])):
85 | id_, q, rx, answers = (data[key][idx] for key in ('ids', 'q', '*x', 'answerss'))
86 | x = shared['x'][rx[0]][rx[1]]
87 | ques = [" ".join(q)]
88 | para = [[word for word in sent] for sent in x]
89 | span, score = get_best_span_wy(wypi, 0.5) if args.wy else get_best_span(ypi, yp2i)
90 | ap = get_segment(para, span)
91 | # score = "{:.3f}".format(ypi[span[0][0]][span[0][1]] * yp2i[span[1][0]][span[1][1]-1])
92 |
93 | row = {
94 | 'id': id_,
95 | 'title': "Hello world!",
96 | 'ques': ques,
97 | 'para': para,
98 | 'y': yi[0][0],
99 | 'y2': yi[0][1],
100 | 'yp': wypi if args.wy else ypi,
101 | 'yp2': wypi if args.wy else yp2i,
102 | 'a': answers,
103 | 'ap': ap,
104 | 'score': score
105 | }
106 | rows.append(row)
107 |
108 | if i % num_per_page == 0:
109 | html_path = os.path.join(html_dir, "%s.html" % str(i).zfill(8))
110 |
111 | if (i + 1) % num_per_page == 0 or (i + 1) == len(eval_['y']):
112 | var_dict = {'title': "Accuracy Visualization",
113 | 'rows': rows
114 | }
115 | with open(html_path, "wb") as f:
116 | f.write(template.render(**var_dict).encode('UTF-8'))
117 | rows = []
118 |
119 | os.chdir(html_dir)
120 | port = args.port
121 | host = args.host
122 | # Overriding to suppress log message
123 | class MyHandler(http.server.SimpleHTTPRequestHandler):
124 | def log_message(self, format, *args):
125 | pass
126 | handler = MyHandler
127 | httpd = socketserver.TCPServer((host, port), handler)
128 | if args.open == 'True':
129 | os.system("open http://%s:%d" % (args.host, args.port))
130 | print("serving at %s:%d" % (host, port))
131 | httpd.serve_forever()
132 |
133 |
134 | def get_segment(para, span):
135 | return " ".join(para[span[0][0]][span[0][1]:span[1][1]])
136 |
137 |
138 | if __name__ == "__main__":
139 | ARGS = get_args()
140 | accuracy2_visualizer(ARGS)
--------------------------------------------------------------------------------
/pytorch/CIFAR10/benchmark/cifar10/infer.py:
--------------------------------------------------------------------------------
1 | import os
2 | import timeit
3 | from glob import glob
4 | from collections import OrderedDict
5 |
6 | import click
7 | import torch
8 | import numpy as np
9 | from torch.autograd import Variable
10 | from torchvision import transforms
11 | from torchvision import datasets
12 |
13 | from benchmark.utils import save_result
14 | from benchmark.cifar10.train import MEAN, STD, MODELS
15 |
16 |
17 | class PyTorchEngine:
18 | def __init__(self, path, arch, use_cuda=False):
19 | self.path = path
20 | self.use_cuda = use_cuda
21 | self.arch = arch
22 | model = MODELS[self.arch]()
23 | restored_state = torch.load(path)
24 |         model.load_state_dict(restored_state['model'])  # load_state_dict mutates the model in place
25 | accuracy = restored_state['accuracy']
26 | epoch = restored_state['epoch'] + 1
27 |
28 | if self.use_cuda:
29 | self.model = model.cuda()
30 | else:
31 | self.model = model.cpu()
32 | self.epoch = epoch
33 | self.accuracy = accuracy
34 |
35 | def pred(self, inputs):
36 | inputs = Variable(inputs, requires_grad=False, volatile=True)
37 |
38 | if self.use_cuda:
39 | inputs = inputs.cuda()
40 | return self.model(inputs).data.cpu().numpy()
41 | else:
42 | return self.model(inputs).data.numpy()
43 |
44 |
45 | def time_batch_size(dataset, batch_size, pred, use_cuda, repeat=100, bestof=3):
46 | loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size,
47 | shuffle=False, pin_memory=use_cuda)
48 |     inputs, targets = next(iter(loader))
49 | assert inputs.size(0) == batch_size
50 |
51 | times = timeit.repeat('pred(inputs)', globals=locals(),
52 | repeat=repeat, number=1)
53 |
54 | return times
55 |
56 |
57 | def infer_cifar10(dataset, engine, start=1, end=128, repeat=100, log2=True,
58 | output=None):
59 | if log2:
60 | start = int(np.floor(np.log2(start)))
61 | end = int(np.ceil(np.log2(end)))
62 | assert start >= 0
63 | assert end >= start
64 | batch_sizes = map(lambda x: 2**x, range(start, end + 1))
65 | else:
66 | batch_sizes = range(start, end + 1)
67 | results = []
68 | for batch_size in batch_sizes:
69 | times = time_batch_size(dataset, batch_size, engine.pred,
70 | engine.use_cuda, repeat=repeat)
71 |
72 | result = OrderedDict()
73 | result['nodename'] = os.uname().nodename
74 | result['model'] = engine.arch
75 | result['use_cuda'] = engine.use_cuda
76 | result['batch_size'] = batch_size
77 | result['mean'] = np.mean(times)
78 | result['std'] = np.std(times)
79 | result['throughput'] = batch_size / np.mean(times)
80 | result['path'] = engine.path
81 | if output is not None:
82 | save_result(result, output)
83 |
84 | print('batch_size: {batch_size:4d}'
85 | ' - mean: {mean:.4f}'
86 | ' - std: {std:.4f}'
87 | ' - throughput: {throughput:.4f}'.format(**result))
88 | results.append(result)
89 |
90 | return results
91 |
92 |
93 | @click.command()
94 | @click.option('--dataset-dir', default='./data/cifar10')
95 | @click.option('--run-dir', default='./run/')
96 | @click.option('--output-file', default='inference.csv')
97 | @click.option('--start', '-s', default=1)
98 | @click.option('--end', '-e', default=128)
99 | @click.option('--repeat', '-r', default=100)
100 | @click.option('--log2/--no-log2', default=True)
101 | @click.option('--cpu/--no-cpu', default=True)
102 | @click.option('--gpu/--no-gpu', default=True)
103 | @click.option('--append', is_flag=True)
104 | @click.option('--models', '-m', type=click.Choice(MODELS.keys()),
105 | multiple=True)
106 | def infer(dataset_dir, run_dir, output_file, start, end, repeat, log2,
107 | cpu, gpu, append, models):
108 |
109 | transform_test = transforms.Compose([
110 | transforms.ToTensor(),
111 | transforms.Normalize(MEAN, STD)
112 | ])
113 |
114 | testset = datasets.CIFAR10(root=dataset_dir, train=False, download=True,
115 | transform=transform_test)
116 | models = models or os.listdir(run_dir)
117 | output_path = os.path.join(run_dir, output_file)
118 | assert not os.path.exists(output_path) or append
119 | for model in models:
120 | model_dir = os.path.join(run_dir, model)
121 | paths = glob(f"{model_dir}/*/checkpoint_best_model.t7")
122 | assert len(paths) > 0
123 | path = os.path.abspath(paths[0])
124 |
125 | print(f'Model: {model}')
126 | print(f'Path: {path}')
127 |
128 | if cpu:
129 | print('With CPU:')
130 | engine = PyTorchEngine(path, use_cuda=False, arch=model)
131 | infer_cifar10(testset, engine, start=start, end=end, log2=log2,
132 | repeat=repeat, output=output_path)
133 |
134 | if gpu and torch.cuda.is_available():
135 | print('With GPU:')
136 | engine = PyTorchEngine(path, use_cuda=True, arch=model)
137 | # Warmup
138 | time_batch_size(testset, 1, engine.pred, engine.use_cuda, repeat=1)
139 |
140 | infer_cifar10(testset, engine, start=start, end=end, log2=log2,
141 | repeat=repeat, output=output_path)
142 |
143 |
144 | if __name__ == '__main__':
145 | infer()
146 |
--------------------------------------------------------------------------------
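
The timing loop in `time_batch_size` is just `timeit.repeat` over a single fixed batch. The same pattern can be reproduced standalone with a toy model and dataset (no checkpoints needed) to see exactly what the inference benchmark measures:

```python
import timeit

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

# Toy CIFAR-shaped data and a tiny stand-in model (illustrative only).
dataset = TensorDataset(torch.randn(512, 3, 32, 32), torch.zeros(512, dtype=torch.long))
model = nn.Sequential(nn.Conv2d(3, 8, 3, padding=1), nn.AdaptiveAvgPool2d(1),
                      nn.Flatten(), nn.Linear(8, 10))
model.eval()

batch_size = 64
inputs, _ = next(iter(DataLoader(dataset, batch_size=batch_size, shuffle=False)))

with torch.no_grad():
    times = timeit.repeat(lambda: model(inputs), repeat=10, number=1)

mean = sum(times) / len(times)
print('mean {:.4f}s  throughput {:.1f} img/s'.format(mean, batch_size / mean))
```
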
/pytorch/CIFAR10/benchmark/cifar10/results.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 |
4 | import pandas as pd
5 |
6 | from benchmark.cifar10.train import MODELS
7 | from benchmark.utils import count_parameters
8 |
9 |
10 | MODEL_SIZES = {key: count_parameters(MODELS[key]()) for key in MODELS.keys()}
11 |
12 |
13 | def single_run_acc(df):
14 | df = df.copy()
15 | df['duration'] = (df['timestamp'] - df['prev_timestamp']).apply(lambda x: x.total_seconds())
16 | df['batch_duration'] = df['batch_duration'].apply(lambda x: x.total_seconds())
17 |
18 | tmp = df.loc[:, ['epoch', 'batch_size', 'ncorrect', 'duration', 'batch_duration']].groupby('epoch').sum()
19 | tmp['accuracy'] = tmp['ncorrect'] / tmp['batch_size']
20 | tmp['throughput'] = tmp['batch_size'] / tmp['duration']
21 | tmp['_throughput'] = tmp['batch_size'] / tmp['batch_duration']
22 | tmp['elapsed'] = df.groupby('epoch')['elapsed'].agg('max')
23 | tmp.reset_index(inplace=True)
24 |
25 | return tmp
26 |
27 |
28 | def load_file(file, start_timestamp=None):
29 | df = pd.read_csv(file)
30 | df['timestamp'] = pd.to_datetime(df['timestamp'])
31 | df['batch_duration'] = pd.to_timedelta(df['batch_duration'])
32 | df['ncorrect'] = df['top1_correct']
33 | start_timestamp = start_timestamp or df['timestamp'].iloc[0]
34 | df['elapsed'] = df['timestamp'] - start_timestamp
35 | df['batch_accuracy'] = df['ncorrect'] / df['batch_size']
36 | return df
37 |
38 |
39 | def load_data(directory, verbose=True):
40 | train_file = os.path.join(directory, 'train_results.csv')
41 | train = load_file(train_file)
42 | start_timestamp = train['timestamp'].iloc[0]
43 |
44 | if verbose:
45 | print(train_file)
46 | print("Training results shape: {}".format(train.shape))
47 |
48 | try:
49 | test_file = os.path.join(directory, 'test_results.csv')
50 | test = load_file(test_file, start_timestamp=start_timestamp)
51 | except FileNotFoundError:
52 | test_file = os.path.join(directory, 'valid_results.csv')
53 | test = load_file(test_file, start_timestamp=start_timestamp)
54 |
55 | if verbose:
56 | print(test_file)
57 | print('Test results shape: {}'.format(test.shape))
58 |
59 | train['mode'] = 'train'
60 | test['mode'] = 'test'
61 |
62 | combined = pd.concat([train, test], ignore_index=True).sort_values(by=['timestamp'])
63 | combined['prev_timestamp'] = combined['timestamp'].shift(1)
64 | combined.loc[0, 'prev_timestamp'] = combined.loc[0, 'timestamp'] - combined.loc[0, 'batch_duration']
65 | train = combined[combined['mode'] == 'train'].copy()
66 | test = combined[combined['mode'] == 'test'].copy()
67 |
68 | return single_run_acc(train), single_run_acc(test)
69 |
70 |
71 | def load_multiple(directory, timestamps=None, verbose=False):
72 | timestamps = timestamps or os.listdir(directory)
73 | train_sets = []
74 | test_sets = []
75 | for timestamp in sorted(timestamps):
76 | _dir = os.path.join(directory, timestamp)
77 | train, test = load_data(_dir, verbose=verbose)
78 | if verbose:
79 | print()
80 | train['run'] = _dir
81 | test['run'] = _dir
82 | train['job_start'] = timestamp
83 | test['job_start'] = timestamp
84 | train_sets.append(train)
85 | test_sets.append(test)
86 |
87 | return pd.concat(train_sets), pd.concat(test_sets)
88 |
89 |
90 | def load_multiple_models(directory, verbose=False):
91 | paths = os.listdir(directory)
92 | models = [path for path in paths if path in MODELS]
93 |
94 | train_sets = []
95 | test_sets = []
96 | for model in sorted(models):
97 | if verbose:
98 | print(f"Loading {model}")
99 | _dir = os.path.join(directory, model)
100 | train, test = load_multiple(_dir, verbose=verbose)
101 | train['model'] = model
102 | train['nparameters'] = MODEL_SIZES[model]
103 | test['model'] = model
104 | test['nparameters'] = MODEL_SIZES[model]
105 |
106 | train_sets.append(train)
107 | test_sets.append(test)
108 |
109 | return pd.concat(train_sets), pd.concat(test_sets)
110 |
111 |
112 | def concat_update(existing, other, repeat=False):
113 | for key in other.keys():
114 | if key in existing:
115 | if existing[key] != other[key] or repeat:
116 | current = existing[key]
117 | if isinstance(current, list):
118 | current.append(other[key])
119 | else:
120 | existing[key] = [current, other[key]]
121 | else:
122 | existing[key] = other[key]
123 |
124 |
125 | def run_config(run, repeat=False):
126 | full = {}
127 | configs = (os.path.join(run, entry.name) for entry in os.scandir(run) if 'config' in entry.name)
128 |
129 | for config in sorted(configs):
130 | with open(config) as file:
131 | tmp = json.load(file)
132 |
133 | tmp['path'] = config
134 | concat_update(full, tmp, repeat=repeat)
135 | return full
136 |
137 |
138 | def search_configs(criteria, configs):
139 | matches = []
140 | for run, config in configs.items():
141 | is_match = True
142 | for key, value in criteria.items():
143 | try:
144 | config_value = config[key]
145 | if config_value != value:
146 | is_match = False
147 | except KeyError:
148 | is_match = False
149 |
150 | if is_match:
151 | matches.append(run)
152 |
153 | return matches
154 |
--------------------------------------------------------------------------------
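
`search_configs` does a simple key/value match over the per-run config dictionaries that `run_config` assembles. A toy example with hand-built (hypothetical) run paths and configs, assuming the `benchmark` package is importable:

```python
from benchmark.cifar10.results import search_configs

configs = {
    'run/densenetbc100/2017-01-01': {'model': 'densenetbc100', 'batch_size': 64},
    'run/densenetbc100/2017-01-02': {'model': 'densenetbc100', 'batch_size': 128},
    'run/resnet20/2017-01-01': {'model': 'resnet20', 'batch_size': 64},
}

# Only runs whose config matches every criterion are returned; a missing key counts as a mismatch.
print(search_configs({'model': 'densenetbc100', 'batch_size': 64}, configs))
# ['run/densenetbc100/2017-01-01']
```
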
/tensorflow/SQuAD/basic_cnn/cli.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | import tensorflow as tf
4 |
5 | from basic_cnn.main import main as m
6 |
7 | flags = tf.app.flags
8 |
9 | flags.DEFINE_string("model_name", "basic_cnn", "Model name [basic]")
10 | flags.DEFINE_string("data_dir", "data/cnn", "Data dir [data/cnn]")
11 | flags.DEFINE_string("root_dir", "/Users/minjoons/data/cnn/questions", "root dir [~/data/cnn/questions]")
12 | flags.DEFINE_string("run_id", "0", "Run ID [0]")
13 | flags.DEFINE_string("out_base_dir", "out", "out base dir [out]")
14 |
15 | flags.DEFINE_integer("batch_size", 60, "Batch size [60]")
16 | flags.DEFINE_float("init_lr", 0.5, "Initial learning rate [0.5]")
17 | flags.DEFINE_integer("num_epochs", 50, "Total number of epochs for training [50]")
18 | flags.DEFINE_integer("num_steps", 20000, "Number of steps [20000]")
19 | flags.DEFINE_integer("eval_num_batches", 100, "eval num batches [100]")
20 | flags.DEFINE_integer("load_step", 0, "load step [0]")
21 | flags.DEFINE_integer("early_stop", 4, "early stop [4]")
22 |
23 | flags.DEFINE_string("mode", "test", "train | dev | test | forward [test]")
24 | flags.DEFINE_boolean("load", True, "load saved data? [True]")
25 | flags.DEFINE_boolean("progress", True, "Show progress? [True]")
26 | flags.DEFINE_integer("log_period", 100, "Log period [100]")
27 | flags.DEFINE_integer("eval_period", 1000, "Eval period [1000]")
28 | flags.DEFINE_integer("save_period", 1000, "Save Period [1000]")
29 | flags.DEFINE_float("decay", 0.9, "Exponential moving average decay [0.9]")
30 |
31 | flags.DEFINE_boolean("draft", False, "Draft for quick testing? [False]")
32 |
33 | flags.DEFINE_integer("hidden_size", 100, "Hidden size [100]")
34 | flags.DEFINE_integer("char_out_size", 100, "Char out size [100]")
35 | flags.DEFINE_float("input_keep_prob", 0.8, "Input keep prob [0.8]")
36 | flags.DEFINE_integer("char_emb_size", 8, "Char emb size [8]")
37 | flags.DEFINE_integer("char_filter_height", 5, "Char filter height [5]")
38 | flags.DEFINE_float("wd", 0.0, "Weight decay [0.0]")
39 | flags.DEFINE_bool("lower_word", True, "lower word [True]")
40 | flags.DEFINE_bool("dump_eval", False, "dump eval? [True]")
41 | flags.DEFINE_bool("dump_answer", True, "dump answer? [True]")
42 | flags.DEFINE_string("model", "2", "config 1 |2 [2]")
43 | flags.DEFINE_bool("squash", False, "squash the sentences into one? [False]")
44 | flags.DEFINE_bool("single", False, "supervise only the answer sentence? [False]")
45 |
46 | flags.DEFINE_integer("word_count_th", 10, "word count th [100]")
47 | flags.DEFINE_integer("char_count_th", 50, "char count th [500]")
48 | flags.DEFINE_integer("sent_size_th", 60, "sent size th [64]")
49 | flags.DEFINE_integer("num_sents_th", 200, "num sents th [8]")
50 | flags.DEFINE_integer("ques_size_th", 30, "ques size th [32]")
51 | flags.DEFINE_integer("word_size_th", 16, "word size th [16]")
52 | flags.DEFINE_integer("para_size_th", 256, "para size th [256]")
53 |
54 | flags.DEFINE_bool("swap_memory", True, "swap memory? [True]")
55 | flags.DEFINE_string("data_filter", "max", "max | valid | semi [max]")
56 | flags.DEFINE_bool("finetune", False, "finetune? [False]")
57 | flags.DEFINE_bool("feed_gt", False, "feed gt prev token during training [False]")
58 | flags.DEFINE_bool("feed_hard", False, "feed hard argmax prev token during testing [False]")
59 | flags.DEFINE_bool("use_glove_for_unk", True, "use glove for unk [False]")
60 | flags.DEFINE_bool("known_if_glove", True, "consider as known if present in glove [False]")
61 | flags.DEFINE_bool("eval", True, "eval? [True]")
62 | flags.DEFINE_integer("highway_num_layers", 2, "highway num layers [2]")
63 | flags.DEFINE_bool("use_word_emb", True, "use word embedding? [True]")
64 |
65 | flags.DEFINE_string("forward_name", "single", "Forward name [single]")
66 | flags.DEFINE_string("answer_path", "", "Answer path []")
67 | flags.DEFINE_string("load_path", "", "Load path []")
68 | flags.DEFINE_string("shared_path", "", "Shared path []")
69 | flags.DEFINE_string("device", "/cpu:0", "default device [/cpu:0]")
70 | flags.DEFINE_integer("num_gpus", 1, "num of gpus [1]")
71 |
72 | flags.DEFINE_string("out_channel_dims", "100", "Out channel dims, separated by commas [100]")
73 | flags.DEFINE_string("filter_heights", "5", "Filter heights, separated by commas [5]")
74 |
75 | flags.DEFINE_bool("share_cnn_weights", True, "Share CNN weights [False]")
76 | flags.DEFINE_bool("share_lstm_weights", True, "Share LSTM weights [True]")
77 | flags.DEFINE_bool("two_prepro_layers", False, "Use two layers for preprocessing? [False]")
78 | flags.DEFINE_bool("aug_att", False, "Augment attention layers with more features? [False]")
79 | flags.DEFINE_integer("max_to_keep", 20, "Max recent saves to keep [20]")
80 | flags.DEFINE_bool("vis", False, "output visualization numbers? [False]")
81 | flags.DEFINE_bool("dump_pickle", True, "Dump pickle instead of json? [True]")
82 | flags.DEFINE_float("keep_prob", 1.0, "keep prob [1.0]")
83 | flags.DEFINE_string("prev_mode", "a", "prev mode gy | y | a [a]")
84 | flags.DEFINE_string("logit_func", "tri_linear", "logit func [tri_linear]")
85 | flags.DEFINE_bool("sh", False, "use superhighway [False]")
86 | flags.DEFINE_string("answer_func", "linear", "answer logit func [linear]")
87 | flags.DEFINE_bool("cluster", False, "Cluster data for faster training [False]")
88 | flags.DEFINE_bool("len_opt", False, "Length optimization? [False]")
89 | flags.DEFINE_string("sh_logit_func", "tri_linear", "sh logit func [tri_linear]")
90 | flags.DEFINE_float("filter_ratio", 1.0, "filter ratio [1.0]")
91 | flags.DEFINE_bool("bi", False, "bi-directional attention? [False]")
92 | flags.DEFINE_integer("width", 5, "width around entity [5]")
93 |
94 |
95 | def main(_):
96 | config = flags.FLAGS
97 |
98 | config.out_dir = os.path.join(config.out_base_dir, config.model_name, str(config.run_id).zfill(2))
99 |
100 | m(config)
101 |
102 | if __name__ == "__main__":
103 | tf.app.run()
104 |
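105 | # Usage sketch (flag values are illustrative): once the CNN/DailyMail data has been
106 | # preprocessed into data/cnn (see cnn_dm/prepro.py), this module is run as a script, e.g.
107 | #   python -m basic_cnn.cli --mode train --noload --root_dir $HOME/data/cnn/questions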
--------------------------------------------------------------------------------
/tensorflow/SQuAD/squad/aug_squad.py:
--------------------------------------------------------------------------------
1 | import json
2 | import sys
3 |
4 | from tqdm import tqdm
5 |
6 | from my.corenlp_interface import CoreNLPInterface
7 |
8 | in_path = sys.argv[1]
9 | out_path = sys.argv[2]
10 | url = sys.argv[3]
11 | port = int(sys.argv[4])
12 | data = json.load(open(in_path, 'r'))
13 |
14 | h = CoreNLPInterface(url, port)
15 |
16 |
17 | def find_all(a_str, sub):
18 | start = 0
19 | while True:
20 | start = a_str.find(sub, start)
21 | if start == -1: return
22 | yield start
23 | start += len(sub) # use start += 1 to find overlapping matches
24 |
25 |
26 | def to_hex(s):
27 | return " ".join(map(hex, map(ord, s)))
28 |
29 |
30 | def handle_nobreak(cand, text):
31 | if cand == text:
32 | return cand
33 | if cand.replace(u'\u00A0', ' ') == text:
34 | return cand
35 | elif cand == text.replace(u'\u00A0', ' '):
36 | return text
37 | raise Exception("{} '{}' {} '{}'".format(cand, to_hex(cand), text, to_hex(text)))
38 |
39 |
40 | # resolving unicode complication
41 |
42 | wrong_loc_count = 0
43 | loc_diffs = []
44 |
45 | for article in data['data']:
46 | for para in article['paragraphs']:
47 | para['context'] = para['context'].replace(u'\u000A', '')
48 | para['context'] = para['context'].replace(u'\u00A0', ' ')
49 | context = para['context']
50 | for qa in para['qas']:
51 | for answer in qa['answers']:
52 | answer['text'] = answer['text'].replace(u'\u00A0', ' ')
53 | text = answer['text']
54 | answer_start = answer['answer_start']
55 | if context[answer_start:answer_start + len(text)] == text:
56 | if text.lstrip() == text:
57 | pass
58 | else:
59 | answer_start += len(text) - len(text.lstrip())
60 | answer['answer_start'] = answer_start
61 | text = text.lstrip()
62 | answer['text'] = text
63 | else:
64 | wrong_loc_count += 1
65 | text = text.lstrip()
66 | answer['text'] = text
67 | starts = list(find_all(context, text))
68 | if len(starts) == 1:
69 | answer_start = starts[0]
70 | elif len(starts) > 1:
71 | new_answer_start = min(starts, key=lambda s: abs(s - answer_start))
72 | loc_diffs.append(abs(new_answer_start - answer_start))
73 | answer_start = new_answer_start
74 | else:
75 | raise Exception()
76 | answer['answer_start'] = answer_start
77 |
78 | answer_stop = answer_start + len(text)
79 | answer['answer_stop'] = answer_stop
80 | assert para['context'][answer_start:answer_stop] == answer['text'], "{} {}".format(
81 | para['context'][answer_start:answer_stop], answer['text'])
82 |
83 | print(wrong_loc_count, loc_diffs)
84 |
85 | mismatch_count = 0
86 | dep_fail_count = 0
87 | no_answer_count = 0
88 |
89 | size = sum(len(article['paragraphs']) for article in data['data'])
90 | pbar = tqdm(range(size))
91 |
92 | for ai, article in enumerate(data['data']):
93 | for pi, para in enumerate(article['paragraphs']):
94 | context = para['context']
95 | sents = h.split_doc(context)
96 | words = h.split_sent(context)
97 | sent_starts = []
98 | ref_idx = 0
99 | for sent in sents:
100 | new_idx = context.find(sent, ref_idx)
101 | sent_starts.append(new_idx)
102 | ref_idx = new_idx + len(sent)
103 | para['sents'] = sents
104 | para['words'] = words
105 | para['sent_starts'] = sent_starts
106 |
107 | consts = list(map(h.get_const, sents))
108 | para['consts'] = consts
109 | deps = list(map(h.get_dep, sents))
110 | para['deps'] = deps
111 |
112 | for qa in para['qas']:
113 | question = qa['question']
114 | question_const = h.get_const(question)
115 | qa['const'] = question_const
116 | question_dep = h.get_dep(question)
117 | qa['dep'] = question_dep
118 | qa['words'] = h.split_sent(question)
119 |
120 | for answer in qa['answers']:
121 | answer_start = answer['answer_start']
122 | text = answer['text']
123 | answer_stop = answer_start + len(text)
124 | # answer_words = h.split_sent(text)
125 | word_idxs = []
126 | answer_words = []
127 | for sent_idx, (sent, sent_start, dep) in enumerate(zip(sents, sent_starts, deps)):
128 | if dep is None:
129 | print("dep parse failed at {} {} {}".format(ai, pi, sent_idx))
130 | dep_fail_count += 1
131 | continue
132 | nodes, edges = dep
133 | words = [node[0] for node in nodes]
134 |
135 | for word_idx, (word, _, _, start, _) in enumerate(nodes):
136 | global_start = sent_start + start
137 | global_stop = global_start + len(word)
138 | if answer_start <= global_start < answer_stop or answer_start < global_stop <= answer_stop:
139 | word_idxs.append((sent_idx, word_idx))
140 | answer_words.append(word)
141 | if len(word_idxs) > 0:
142 | answer['answer_word_start'] = word_idxs[0]
143 | answer['answer_word_stop'] = word_idxs[-1][0], word_idxs[-1][1] + 1
144 | if not text.startswith(answer_words[0]):
145 | print("'{}' '{}'".format(text, ' '.join(answer_words)))
146 | mismatch_count += 1
147 | else:
148 | answer['answer_word_start'] = None
149 | answer['answer_word_stop'] = None
150 | no_answer_count += 1
151 | pbar.update(1)
152 | pbar.close()
153 |
154 | print(mismatch_count, dep_fail_count, no_answer_count)
155 |
156 | print("saving...")
157 | json.dump(data, open(out_path, 'w'))
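158 | 
159 | # Usage sketch (host/port are assumptions -- point them at a running CoreNLP server):
160 | #   python -m squad.aug_squad $HOME/data/squad/train-v1.1.json \
161 | #       $HOME/data/squad/train-v1.0-aug.json localhost 9000
162 | # The resulting *-v1.0-aug.json files are what squad/prepro_aug.py expects as input.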
--------------------------------------------------------------------------------
/tensorflow/SQuAD/basic/cli.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | import tensorflow as tf
4 |
5 | from basic.main import main as m
6 |
7 | flags = tf.app.flags
8 |
9 | # Names and directories
10 | flags.DEFINE_string("model_name", "basic", "Model name [basic]")
11 | flags.DEFINE_string("data_dir", "data/squad", "Data dir [data/squad]")
12 | flags.DEFINE_string("run_id", "0", "Run ID [0]")
13 | flags.DEFINE_string("out_base_dir", "out", "out base dir [out]")
14 | flags.DEFINE_string("forward_name", "single", "Forward name [single]")
15 | flags.DEFINE_string("answer_path", "", "Answer path []")
16 | flags.DEFINE_string("eval_path", "", "Eval path []")
17 | flags.DEFINE_string("load_path", "", "Load path []")
18 | flags.DEFINE_string("shared_path", "", "Shared path []")
19 |
20 | # Device placement
21 | flags.DEFINE_string("device", "/cpu:0", "default device for summing gradients. [/cpu:0]")
22 | flags.DEFINE_string("device_type", "gpu", "device for computing gradients (parallelization). cpu | gpu [gpu]")
23 | flags.DEFINE_integer("num_gpus", 1, "num of gpus or cpus for computing gradients [1]")
24 |
25 | # Essential training and test options
26 | flags.DEFINE_string("mode", "test", "trains | test | forward [test]")
27 | flags.DEFINE_boolean("load", True, "load saved data? [True]")
28 | flags.DEFINE_bool("single", False, "supervise only the answer sentence? [False]")
29 | flags.DEFINE_boolean("debug", False, "Debugging mode? [False]")
30 | flags.DEFINE_bool('load_ema', True, "load exponential average of variables when testing? [True]")
31 | flags.DEFINE_bool("eval", True, "eval? [True]")
32 | flags.DEFINE_bool("wy", False, "Use wy for loss / eval? [False]")
33 | flags.DEFINE_bool("na", False, "Enable no answer strategy and learn bias? [False]")
34 | flags.DEFINE_float("th", 0.5, "Threshold [0.5]")
35 |
36 | # Training / test parameters
37 | flags.DEFINE_integer("batch_size", 60, "Batch size [60]")
38 | flags.DEFINE_integer("val_num_batches", 100, "validation num batches [100]")
39 | flags.DEFINE_integer("test_num_batches", 0, "test num batches [0]")
40 | flags.DEFINE_integer("num_epochs", 12, "Total number of epochs for training [12]")
41 | flags.DEFINE_integer("num_steps", 20000, "Number of steps [20000]")
42 | flags.DEFINE_integer("load_step", 0, "load step [0]")
43 | flags.DEFINE_float("init_lr", 0.001, "Initial learning rate [0.001]")
44 | flags.DEFINE_float("input_keep_prob", 0.8, "Input keep prob for the dropout of LSTM weights [0.8]")
45 | flags.DEFINE_float("keep_prob", 0.8, "Keep prob for the dropout of Char-CNN weights [0.8]")
46 | flags.DEFINE_float("wd", 0.0, "L2 weight decay for regularization [0.0]")
47 | flags.DEFINE_integer("hidden_size", 100, "Hidden size [100]")
48 | flags.DEFINE_integer("char_out_size", 100, "char-level word embedding size [100]")
49 | flags.DEFINE_integer("char_emb_size", 8, "Char emb size [8]")
50 | flags.DEFINE_string("out_channel_dims", "100", "Out channel dims of Char-CNN, separated by commas [100]")
51 | flags.DEFINE_string("filter_heights", "5", "Filter heights of Char-CNN, separated by commas [5]")
52 | flags.DEFINE_bool("finetune", False, "Finetune word embeddings? [False]")
53 | flags.DEFINE_bool("highway", True, "Use highway? [True]")
54 | flags.DEFINE_integer("highway_num_layers", 2, "highway num layers [2]")
55 | flags.DEFINE_bool("share_cnn_weights", True, "Share Char-CNN weights [True]")
56 | flags.DEFINE_bool("share_lstm_weights", True, "Share pre-processing (phrase-level) LSTM weights [True]")
57 | flags.DEFINE_float("var_decay", 0.999, "Exponential moving average decay for variables [0.999]")
58 |
59 | # Optimizations
60 | flags.DEFINE_bool("cluster", False, "Cluster data for faster training [False]")
61 | flags.DEFINE_bool("len_opt", False, "Length optimization? [False]")
62 | flags.DEFINE_bool("cpu_opt", False, "CPU optimization? GPU computation can be slower [False]")
63 |
64 | # Logging and saving options
65 | flags.DEFINE_boolean("progress", True, "Show progress? [True]")
66 | flags.DEFINE_integer("log_period", 100, "Log period [100]")
67 | flags.DEFINE_integer("eval_period", 1000, "Eval period [1000]")
68 | flags.DEFINE_integer("save_period", 1000, "Save Period [1000]")
69 | flags.DEFINE_integer("max_to_keep", 20, "Max recent saves to keep [20]")
70 | flags.DEFINE_bool("dump_eval", True, "dump eval? [True]")
71 | flags.DEFINE_bool("dump_answer", True, "dump answer? [True]")
72 | flags.DEFINE_bool("vis", False, "output visualization numbers? [False]")
73 | flags.DEFINE_bool("dump_pickle", True, "Dump pickle instead of json? [True]")
74 | flags.DEFINE_float("decay", 0.9, "Exponential moving average decay for logging values [0.9]")
75 |
76 | # Thresholds for speed and less memory usage
77 | flags.DEFINE_integer("word_count_th", 10, "word count th [100]")
78 | flags.DEFINE_integer("char_count_th", 50, "char count th [500]")
79 | flags.DEFINE_integer("sent_size_th", 400, "sent size th [64]")
80 | flags.DEFINE_integer("num_sents_th", 8, "num sents th [8]")
81 | flags.DEFINE_integer("ques_size_th", 30, "ques size th [32]")
82 | flags.DEFINE_integer("word_size_th", 16, "word size th [16]")
83 | flags.DEFINE_integer("para_size_th", 256, "para size th [256]")
84 |
85 | # Advanced training options
86 | flags.DEFINE_bool("lower_word", True, "lower word [True]")
87 | flags.DEFINE_bool("squash", False, "squash the sentences into one? [False]")
88 | flags.DEFINE_bool("swap_memory", True, "swap memory? [True]")
89 | flags.DEFINE_string("data_filter", "max", "max | valid | semi [max]")
90 | flags.DEFINE_bool("use_glove_for_unk", True, "use glove for unk [False]")
91 | flags.DEFINE_bool("known_if_glove", True, "consider as known if present in glove [False]")
92 | flags.DEFINE_string("logit_func", "tri_linear", "logit func [tri_linear]")
93 | flags.DEFINE_string("answer_func", "linear", "answer logit func [linear]")
94 | flags.DEFINE_string("sh_logit_func", "tri_linear", "sh logit func [tri_linear]")
95 |
96 | # Ablation options
97 | flags.DEFINE_bool("use_char_emb", True, "use char emb? [True]")
98 | flags.DEFINE_bool("use_word_emb", True, "use word embedding? [True]")
99 | flags.DEFINE_bool("q2c_att", True, "question-to-context attention? [True]")
100 | flags.DEFINE_bool("c2q_att", True, "context-to-question attention? [True]")
101 | flags.DEFINE_bool("dynamic_att", False, "Dynamic attention [False]")
102 |
103 |
104 | def main(_):
105 | config = flags.FLAGS
106 |
107 | config.out_dir = os.path.join(config.out_base_dir, config.model_name, str(config.run_id).zfill(2))
108 |
109 | m(config)
110 |
111 | if __name__ == "__main__":
112 | tf.app.run()
113 |
--------------------------------------------------------------------------------
/tensorflow/SQuAD/README.md:
--------------------------------------------------------------------------------
1 | # Bi-directional Attention Flow for Machine Comprehension
2 |
3 | - This is the original implementation of [Bi-directional Attention Flow for Machine Comprehension][paper] (Seo et al., 2016).
4 | - This is the TensorFlow v1.1.0 compatible version. It is not compatible with previously trained models,
5 | so if you want to use them, go to [v0.2.1][v0.2.1].
6 | - The CodaLab worksheet for the [SQuAD Leaderboard][squad] submission is available [here][worksheet].
7 | - Please contact [Minjoon Seo][minjoon] ([@seominjoon][minjoon-github]) for questions and suggestions.
8 |
9 | ## 0. Requirements
10 | #### General
11 | - Python (developed on 3.5.2. Issues have been reported with Python 2!)
12 | - unzip
13 |
14 | #### Python Packages
15 | - tensorflow (deep learning library, verified on 1.1.0)
16 | - nltk (NLP tools, verified on 3.2.1)
17 | - tqdm (progress bar, verified on 4.7.4)
18 | - jinja2 (for visualization; not needed if you only train and test)
19 |
20 | ## 1. Pre-processing
21 | First, prepare the data. Download the SQuAD data, GloVe, and the nltk corpus
22 | (~850 MB; this will download files to `$HOME/data`):
23 | ```
24 | chmod +x download.sh; ./download.sh
25 | ```
26 |
27 | Second, preprocess the Stanford QA dataset (along with the GloVe vectors) and save it in `$PWD/data/squad` (~5 minutes):
28 | ```
29 | python -m squad.prepro
30 | ```
31 |
32 | ## 2. Training
33 | The model was trained on an NVIDIA Titan X (Pascal architecture, 2016).
34 | The model requires at least 12GB of GPU RAM.
35 | If your GPU has less than 12GB of RAM, you can either decrease the batch size (performance might degrade)
36 | or use multiple GPUs (see below).
37 | Training converges at ~18k steps and took ~4s per step (i.e. ~20 hours).
38 |
39 | Before training, it is recommended to first run the following command to verify that everything is okay and memory is sufficient:
40 | ```
41 | python -m basic.cli --mode train --noload --debug
42 | ```
43 |
44 | Then to fully train, run:
45 | ```
46 | python -m basic.cli --mode train --noload
47 | ```
48 |
49 | You can speed up the training process with optimization flags:
50 | ```
51 | python -m basic.cli --mode train --noload --len_opt --cluster
52 | ```
53 | You can still omit them, but training will be much slower.
54 |
55 |
56 | ## 3. Test
57 | To test, run:
58 | ```
59 | python -m basic.cli
60 | ```
61 |
62 | Similarly to training, you can pass the optimization flags to speed up testing (5 minutes on dev data):
63 | ```
64 | python -m basic.cli --len_opt --cluster
65 | ```
66 |
67 | This command loads the most recently saved model during training and begins testing on the test data.
68 | After the process ends, it prints F1 and EM scores, and also outputs a json file (`$PWD/out/basic/00/answer/test-####.json`,
69 | where `####` is the step at which the model was saved).
70 | Note that the printed scores are not official (our scoring scheme is a bit harsher).
71 | To obtain the official number, use the official evaluator (copied in `squad` folder) and the output json file:
72 |
73 | ```
74 | python squad/evaluate-v1.1.py $HOME/data/squad/dev-v1.1.json out/basic/00/answer/test-####.json
75 | ```
76 |
77 | ### 3.1 Loading from pre-trained weights
78 | NOTE: this version is not compatible with the following trained models.
79 | For compatibility, use [v0.2.1][v0.2.1].
80 |
81 | Instead of training the model yourself, you can choose to use pre-trained weights that were used for [SQuAD Leaderboard][squad] submission.
82 | Refer to [this worksheet][worksheet] in CodaLab to reproduce the results.
83 | If you are unfamiliar with CodaLab, follow these simple steps (provided you have met all the prerequisites above):
84 |
85 | 1. Download `save.zip` from the [worksheet][worksheet] and unzip it in the current directory.
86 | 2. Copy `glove.6B.100d.txt` from your glove data folder (`$HOME/data/glove/`) to the current directory.
87 | 3. To reproduce single model:
88 |
89 | ```
90 | basic/run_single.sh $HOME/data/squad/dev-v1.1.json single.json
91 | ```
92 |
93 | This writes the answers to `single.json` in the current directory. You can then use the official evaluator to obtain EM and F1 scores. If you want to run on GPU (~5 mins), change the value of the batch_size flag in the shell file to a higher number (60 for 12GB of GPU RAM).
94 | 4. Similarly, to reproduce ensemble method:
95 |
96 | ```
97 | basic/run_ensemble.sh $HOME/data/squad/dev-v1.1.json ensemble.json
98 | ```
99 | If you want to run on GPU, you should either run the script sequentially by removing '&' in the for loop, or specify a different GPU for each run of the for loop.
100 |
101 | ## Results
102 |
103 | ### Dev Data
104 |
105 | | | EM (%) | F1 (%) |
106 | | -------- |:------:|:------:|
107 | | single | 67.8 | 77.4 |
108 |
109 | ### Dev Data (old)
110 | NOTE: These numbers are from [v0.2.1][v0.2.1].
111 |
112 | | | EM (%) | F1 (%) |
113 | | -------- |:------:|:------:|
114 | | single | 67.7 | 77.3 |
115 | | ensemble | 72.6 | 80.7 |
116 |
117 |
118 | ### Test Data (old)
119 | NOTE: These numbers are from [v0.2.1][v0.2.1].
120 |
121 | | | EM (%) | F1 (%) |
122 | | -------- |:------:|:------:|
123 | | single | 68.0 | 77.3 |
124 | | ensemble | 73.3 | 81.1 |
125 |
126 | Refer to [our paper][paper] for more details.
127 | See [SQuAD Leaderboard][squad] to compare with other models.
128 |
129 |
130 |
141 |
142 |
143 | ## Multi-GPU Training & Testing
144 | Our model supports multi-GPU training.
145 | We follow the parallelization paradigm described in [TensorFlow Tutorial][multi-gpu].
146 | In short, if you want to use a batch size of 60 (the default) but you have 3 GPUs with 4GB of RAM each,
147 | then you initialize each GPU with a batch size of 20 and combine the gradients on the CPU.
148 | This can be easily done by running:
149 | ```
150 | python -m basic.cli --mode train --noload --num_gpus 3 --batch_size 20
151 | ```
152 |
153 | Similarly, you can speed up your testing by:
154 | ```
155 | python -m basic.cli --num_gpus 3 --batch_size 20
156 | ```
157 |
158 |
159 | [multi-gpu]: https://www.tensorflow.org/versions/r0.11/tutorials/deep_cnn/index.html#training-a-model-using-multiple-gpu-cards
160 | [squad]: http://stanford-qa.com
161 | [paper]: https://arxiv.org/abs/1611.01603
162 | [worksheet]: https://worksheets.codalab.org/worksheets/0x37a9b8c44f6845c28866267ef941c89d/
163 | [minjoon]: https://seominjoon.github.io
164 | [minjoon-github]: https://github.com/seominjoon
165 | [v0.2.1]: https://github.com/allenai/bi-att-flow/tree/v0.2.1
166 |
--------------------------------------------------------------------------------
/tensorflow/SQuAD/my/tensorflow/general.py:
--------------------------------------------------------------------------------
1 | from itertools import zip_longest
2 |
3 | import itertools
4 | import tensorflow as tf
5 | from functools import reduce
6 | from operator import mul
7 | import numpy as np
8 |
9 | VERY_BIG_NUMBER = 1e30
10 | VERY_SMALL_NUMBER = 1e-30
11 | VERY_POSITIVE_NUMBER = VERY_BIG_NUMBER
12 | VERY_NEGATIVE_NUMBER = -VERY_BIG_NUMBER
13 |
14 |
15 | def get_initializer(matrix):
16 | def _initializer(shape, dtype=None, partition_info=None, **kwargs): return matrix
17 | return _initializer
18 |
19 |
20 | def variable_on_cpu(name, shape, initializer):
21 | """Helper to create a Variable stored on CPU memory.
22 |
23 | Args:
24 | name: name of the variable
25 | shape: list of ints
26 | initializer: initializer for Variable
27 |
28 | Returns:
29 | Variable Tensor
30 | """
31 | with tf.device('/cpu:0'):
32 | var = tf.get_variable(name, shape, initializer=initializer)
33 | return var
34 |
35 |
36 | def variable_with_weight_decay(name, shape, stddev, wd):
37 | """Helper to create an initialized Variable with weight decay.
38 |
39 | Note that the Variable is initialized with a truncated normal distribution.
40 | A weight decay is added only if one is specified.
41 |
42 | Args:
43 | name: name of the variable
44 | shape: list of ints
45 | stddev: standard deviation of a truncated Gaussian
46 | wd: add L2Loss weight decay multiplied by this float. If None, weight
47 | decay is not added for this Variable.
48 |
49 | Returns:
50 | Variable Tensor
51 | """
52 | var = variable_on_cpu(name, shape,
53 | tf.truncated_normal_initializer(stddev=stddev))
54 | if wd:
55 | weight_decay = tf.multiply(tf.nn.l2_loss(var), wd, name='weight_loss')
56 | tf.add_to_collection('losses', weight_decay)
57 | return var
58 |
59 |
60 | def average_gradients(tower_grads):
61 | """Calculate the average gradient for each shared variable across all towers.
62 |
63 | Note that this function provides a synchronization point across all towers.
64 |
65 | Args:
66 | tower_grads: List of lists of (gradient, variable) tuples. The outer list
67 | is over individual gradients. The inner list is over the gradient
68 | calculation for each tower.
69 | Returns:
70 | List of pairs of (gradient, variable) where the gradient has been averaged
71 | across all towers.
72 | """
73 | average_grads = []
74 | for grad_and_vars in zip(*tower_grads):
75 | # Note that each grad_and_vars looks like the following:
76 | # ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN))
77 | grads = []
78 | for g, var in grad_and_vars:
79 | # Add 0 dimension to the gradients to represent the tower.
80 | assert g is not None, var.name
81 | expanded_g = tf.expand_dims(g, 0)
82 |
83 | # Append on a 'tower' dimension which we will average over below.
84 | grads.append(expanded_g)
85 |
86 | # Average over the 'tower' dimension.
87 | grad = tf.concat(axis=0, values=grads)
88 | grad = tf.reduce_mean(grad, 0)
89 |
90 | # Keep in mind that the Variables are redundant because they are shared
91 | # across towers. So .. we will just return the first tower's pointer to
92 | # the Variable.
93 | v = grad_and_vars[0][1]
94 | grad_and_var = (grad, v)
95 | average_grads.append(grad_and_var)
96 | return average_grads
97 |
98 |
99 | def mask(val, mask, name=None):
100 | if name is None:
101 | name = 'mask'
102 | return tf.multiply(val, tf.cast(mask, 'float'), name=name)
103 |
104 |
105 | def exp_mask(val, mask, name=None):
106 |     """Add a very negative number to the masked-out (False) elements of val.
107 | For example, [-3, -2, 10], [True, True, False] -> [-3, -2, -1e9].
108 | Typically, this effectively masks in exponential space (e.g. softmax)
109 | Args:
110 | val: values to be masked
111 |         mask: boolean mask tensor, same shape as val
112 | name: name for output tensor
113 |
114 | Returns:
115 | Same shape as val, where some elements are very small (exponentially zero)
116 | """
117 | if name is None:
118 | name = "exp_mask"
119 | return tf.add(val, (1 - tf.cast(mask, 'float')) * VERY_NEGATIVE_NUMBER, name=name)
120 |
121 |
122 | def flatten(tensor, keep):
123 | fixed_shape = tensor.get_shape().as_list()
124 | start = len(fixed_shape) - keep
125 | left = reduce(mul, [fixed_shape[i] or tf.shape(tensor)[i] for i in range(start)])
126 | out_shape = [left] + [fixed_shape[i] or tf.shape(tensor)[i] for i in range(start, len(fixed_shape))]
127 | flat = tf.reshape(tensor, out_shape)
128 | return flat
129 |
130 |
131 | def reconstruct(tensor, ref, keep):
132 | ref_shape = ref.get_shape().as_list()
133 | tensor_shape = tensor.get_shape().as_list()
134 | ref_stop = len(ref_shape) - keep
135 | tensor_start = len(tensor_shape) - keep
136 | pre_shape = [ref_shape[i] or tf.shape(ref)[i] for i in range(ref_stop)]
137 | keep_shape = [tensor_shape[i] or tf.shape(tensor)[i] for i in range(tensor_start, len(tensor_shape))]
138 | # pre_shape = [tf.shape(ref)[i] for i in range(len(ref.get_shape().as_list()[:-keep]))]
139 | # keep_shape = tensor.get_shape().as_list()[-keep:]
140 | target_shape = pre_shape + keep_shape
141 | out = tf.reshape(tensor, target_shape)
142 | return out
143 |
144 |
145 | def add_wd(wd, scope=None):
146 | scope = scope or tf.get_variable_scope().name
147 | variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope)
148 | with tf.name_scope("weight_decay"):
149 | for var in variables:
150 | weight_decay = tf.multiply(tf.nn.l2_loss(var), wd, name="{}/wd".format(var.op.name))
151 | tf.add_to_collection('losses', weight_decay)
152 |
153 |
154 | def grouper(iterable, n, fillvalue=None, shorten=False, num_groups=None):
155 | args = [iter(iterable)] * n
156 | out = zip_longest(*args, fillvalue=fillvalue)
157 | out = list(out)
158 | if num_groups is not None:
159 | default = (fillvalue, ) * n
160 | assert isinstance(num_groups, int)
161 | out = list(each for each, _ in zip_longest(out, range(num_groups), fillvalue=default))
162 | if shorten:
163 | assert fillvalue is None
164 | out = (tuple(e for e in each if e is not None) for each in out)
165 | return out
166 |
167 | def padded_reshape(tensor, shape, mode='CONSTANT', name=None):
168 | paddings = [[0, shape[i] - tf.shape(tensor)[i]] for i in range(len(shape))]
169 | return tf.pad(tensor, paddings, mode=mode, name=name)
170 |
171 |
172 | def get_num_params():
173 | num_params = 0
174 | for variable in tf.trainable_variables():
175 | shape = variable.get_shape()
176 | num_params += reduce(mul, [dim.value for dim in shape], 1)
177 | return num_params
178 |
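179 | 
180 | def _example_flatten_reconstruct():
181 |     """Minimal usage sketch (shapes are illustrative, not from the original code):
182 |     apply a rank-2 op to a rank-4 tensor by collapsing the leading dims with
183 |     `flatten` and restoring them with `reconstruct`."""
184 |     x = tf.placeholder('float', [None, 8, 16, 100])  # [batch, sents, words, dim]
185 |     w = tf.get_variable("w_example", shape=[100, 50])
186 |     flat = flatten(x, 1)            # -> [batch * 8 * 16, 100]
187 |     out = tf.matmul(flat, w)        # plain 2-D matmul
188 |     return reconstruct(out, x, 1)   # -> [batch, 8, 16, 50]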
--------------------------------------------------------------------------------
/tensorflow/SQuAD/tree/read_data.py:
--------------------------------------------------------------------------------
1 | import json
2 | import os
3 | import random
4 | import itertools
5 | import math
6 |
7 | import nltk
8 |
9 | from my.nltk_utils import load_compressed_tree
10 | from my.utils import index
11 |
12 |
13 | class DataSet(object):
14 | def __init__(self, data, data_type, shared=None, valid_idxs=None):
15 | total_num_examples = len(next(iter(data.values())))
16 | self.data = data # e.g. {'X': [0, 1, 2], 'Y': [2, 3, 4]}
17 | self.data_type = data_type
18 | self.shared = shared
19 | self.valid_idxs = range(total_num_examples) if valid_idxs is None else valid_idxs
20 | self.num_examples = len(self.valid_idxs)
21 |
22 | def get_batches(self, batch_size, num_batches=None, shuffle=False):
23 | num_batches_per_epoch = int(math.ceil(self.num_examples / batch_size))
24 | if num_batches is None:
25 | num_batches = num_batches_per_epoch
26 | num_epochs = int(math.ceil(num_batches / num_batches_per_epoch))
27 |
28 | idxs = itertools.chain.from_iterable(random.sample(self.valid_idxs, len(self.valid_idxs))
29 | if shuffle else self.valid_idxs
30 | for _ in range(num_epochs))
31 | for _ in range(num_batches):
32 | batch_idxs = tuple(itertools.islice(idxs, batch_size))
33 | batch_data = {}
34 | for key, val in self.data.items():
35 | if key.startswith('*'):
36 | assert self.shared is not None
37 | shared_key = key[1:]
38 | batch_data[shared_key] = [index(self.shared[shared_key], val[idx]) for idx in batch_idxs]
39 | else:
40 | batch_data[key] = list(map(val.__getitem__, batch_idxs))
41 |
42 | batch_ds = DataSet(batch_data, self.data_type, shared=self.shared)
43 | yield batch_idxs, batch_ds
44 |
45 |
46 | class SquadDataSet(DataSet):
47 | def __init__(self, data, data_type, shared=None, valid_idxs=None):
48 | super(SquadDataSet, self).__init__(data, data_type, shared=shared, valid_idxs=valid_idxs)
49 |
50 |
51 | def load_metadata(config, data_type):
52 | metadata_path = os.path.join(config.data_dir, "metadata_{}.json".format(data_type))
53 | with open(metadata_path, 'r') as fh:
54 | metadata = json.load(fh)
55 | for key, val in metadata.items():
56 | config.__setattr__(key, val)
57 | return metadata
58 |
59 |
60 | def read_data(config, data_type, ref, data_filter=None):
61 | data_path = os.path.join(config.data_dir, "data_{}.json".format(data_type))
62 | shared_path = os.path.join(config.data_dir, "shared_{}.json".format(data_type))
63 | with open(data_path, 'r') as fh:
64 | data = json.load(fh)
65 | with open(shared_path, 'r') as fh:
66 | shared = json.load(fh)
67 |
68 | num_examples = len(next(iter(data.values())))
69 | if data_filter is None:
70 | valid_idxs = range(num_examples)
71 | else:
72 | mask = []
73 | keys = data.keys()
74 | values = data.values()
75 | for vals in zip(*values):
76 | each = {key: val for key, val in zip(keys, vals)}
77 | mask.append(data_filter(each, shared))
78 | valid_idxs = [idx for idx in range(len(mask)) if mask[idx]]
79 |
80 | print("Loaded {}/{} examples from {}".format(len(valid_idxs), num_examples, data_type))
81 |
82 | shared_path = os.path.join(config.out_dir, "shared.json")
83 | if not ref:
84 | word_counter = shared['lower_word_counter'] if config.lower_word else shared['word_counter']
85 | char_counter = shared['char_counter']
86 | pos_counter = shared['pos_counter']
87 | shared['word2idx'] = {word: idx + 2 for idx, word in
88 | enumerate(word for word, count in word_counter.items()
89 | if count > config.word_count_th)}
90 | shared['char2idx'] = {char: idx + 2 for idx, char in
91 | enumerate(char for char, count in char_counter.items()
92 | if count > config.char_count_th)}
93 | shared['pos2idx'] = {pos: idx + 2 for idx, pos in enumerate(pos_counter.keys())}
94 | NULL = "-NULL-"
95 | UNK = "-UNK-"
96 | shared['word2idx'][NULL] = 0
97 | shared['word2idx'][UNK] = 1
98 | shared['char2idx'][NULL] = 0
99 | shared['char2idx'][UNK] = 1
100 | shared['pos2idx'][NULL] = 0
101 | shared['pos2idx'][UNK] = 1
102 | json.dump({'word2idx': shared['word2idx'], 'char2idx': shared['char2idx'],
103 | 'pos2idx': shared['pos2idx']}, open(shared_path, 'w'))
104 | else:
105 | new_shared = json.load(open(shared_path, 'r'))
106 | for key, val in new_shared.items():
107 | shared[key] = val
108 |
109 | data_set = DataSet(data, data_type, shared=shared, valid_idxs=valid_idxs)
110 | return data_set
111 |
112 |
113 | def get_squad_data_filter(config):
114 | def data_filter(data_point, shared):
115 | assert shared is not None
116 | rx, rcx, q, cq, y = (data_point[key] for key in ('*x', '*cx', 'q', 'cq', 'y'))
117 | x, cx, stx = shared['x'], shared['cx'], shared['stx']
118 | if len(q) > config.ques_size_th:
119 | return False
120 | xi = x[rx[0]][rx[1]]
121 | if len(xi) > config.num_sents_th:
122 | return False
123 | if any(len(xij) > config.sent_size_th for xij in xi):
124 | return False
125 | stxi = stx[rx[0]][rx[1]]
126 | if any(nltk.tree.Tree.fromstring(s).height() > config.tree_height_th for s in stxi):
127 | return False
128 | return True
129 | return data_filter
130 |
131 |
132 | def update_config(config, data_sets):
133 | config.max_num_sents = 0
134 | config.max_sent_size = 0
135 | config.max_ques_size = 0
136 | config.max_word_size = 0
137 | config.max_tree_height = 0
138 | for data_set in data_sets:
139 | data = data_set.data
140 | shared = data_set.shared
141 | for idx in data_set.valid_idxs:
142 | rx = data['*x'][idx]
143 | q = data['q'][idx]
144 | sents = shared['x'][rx[0]][rx[1]]
145 | trees = map(nltk.tree.Tree.fromstring, shared['stx'][rx[0]][rx[1]])
146 | config.max_tree_height = max(config.max_tree_height, max(tree.height() for tree in trees))
147 | config.max_num_sents = max(config.max_num_sents, len(sents))
148 | config.max_sent_size = max(config.max_sent_size, max(map(len, sents)))
149 | config.max_word_size = max(config.max_word_size, max(len(word) for sent in sents for word in sent))
150 | if len(q) > 0:
151 | config.max_ques_size = max(config.max_ques_size, len(q))
152 | config.max_word_size = max(config.max_word_size, max(len(word) for word in q))
153 |
154 | config.max_word_size = min(config.max_word_size, config.word_size_th)
155 |
156 | config.char_vocab_size = len(data_sets[0].shared['char2idx'])
157 | config.word_emb_size = len(next(iter(data_sets[0].shared['word2vec'].values())))
158 | config.word_vocab_size = len(data_sets[0].shared['word2idx'])
159 | config.pos_vocab_size = len(data_sets[0].shared['pos2idx'])
160 |
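161 | 
162 | def _example_epoch(config):
163 |     """Minimal usage sketch (assumes the preprocessed data_*.json / shared_*.json
164 |     files already exist in config.data_dir and config carries the usual flags):
165 |     iterate one pass of shuffled training batches. Keys prefixed with '*' (e.g.
166 |     '*x') hold [article, paragraph] references that get_batches resolves against
167 |     the shared dict, so large paragraphs are stored only once."""
168 |     train_data = read_data(config, 'train', False, data_filter=get_squad_data_filter(config))
169 |     for batch_idxs, batch in train_data.get_batches(config.batch_size, shuffle=True):
170 |         print(len(batch_idxs), batch.num_examples)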
--------------------------------------------------------------------------------
/tensorflow/SQuAD/tree/main.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import json
3 | import math
4 | import os
5 | import shutil
6 | from pprint import pprint
7 |
8 | import tensorflow as tf
9 | from tqdm import tqdm
10 | import numpy as np
11 |
12 | from tree.evaluator import AccuracyEvaluator2, Evaluator
13 | from tree.graph_handler import GraphHandler
14 | from tree.model import Model
15 | from tree.trainer import Trainer
16 |
17 | from tree.read_data import load_metadata, read_data, get_squad_data_filter, update_config
18 |
19 |
20 | def main(config):
21 | set_dirs(config)
22 | if config.mode == 'train':
23 | _train(config)
24 | elif config.mode == 'test':
25 | _test(config)
26 | elif config.mode == 'forward':
27 | _forward(config)
28 | else:
29 | raise ValueError("invalid value for 'mode': {}".format(config.mode))
30 |
31 |
32 | def _config_draft(config):
33 | if config.draft:
34 | config.num_steps = 10
35 | config.eval_period = 10
36 | config.log_period = 1
37 | config.save_period = 10
38 | config.eval_num_batches = 1
39 |
40 |
41 | def _train(config):
42 | # load_metadata(config, 'train') # this updates the config file according to metadata file
43 |
44 | data_filter = get_squad_data_filter(config)
45 | train_data = read_data(config, 'train', config.load, data_filter=data_filter)
46 | dev_data = read_data(config, 'dev', True, data_filter=data_filter)
47 | update_config(config, [train_data, dev_data])
48 |
49 | _config_draft(config)
50 |
51 | word2vec_dict = train_data.shared['lower_word2vec'] if config.lower_word else train_data.shared['word2vec']
52 | word2idx_dict = train_data.shared['word2idx']
53 | idx2vec_dict = {word2idx_dict[word]: vec for word, vec in word2vec_dict.items() if word in word2idx_dict}
54 | print("{}/{} unique words have corresponding glove vectors.".format(len(idx2vec_dict), len(word2idx_dict)))
55 | emb_mat = np.array([idx2vec_dict[idx] if idx in idx2vec_dict
56 | else np.random.multivariate_normal(np.zeros(config.word_emb_size), np.eye(config.word_emb_size))
57 | for idx in range(config.word_vocab_size)])
58 | config.emb_mat = emb_mat
59 |
60 | # construct model graph and variables (using default graph)
61 | pprint(config.__flags, indent=2)
62 | model = Model(config)
63 | trainer = Trainer(config, model)
64 | evaluator = AccuracyEvaluator2(config, model)
65 | graph_handler = GraphHandler(config) # controls all tensors and variables in the graph, including loading /saving
66 |
67 | # Variables
68 | sess = tf.Session()
69 | graph_handler.initialize(sess)
70 |
71 | # begin training
72 | num_steps = config.num_steps or int(config.num_epochs * train_data.num_examples / config.batch_size)
73 | max_acc = 0
74 | noupdate_count = 0
75 | global_step = 0
76 | for _, batch in tqdm(train_data.get_batches(config.batch_size, num_batches=num_steps, shuffle=True), total=num_steps):
77 | global_step = sess.run(model.global_step) + 1 # +1 because all calculations are done after step
78 | get_summary = global_step % config.log_period == 0
79 | loss, summary, train_op = trainer.step(sess, batch, get_summary=get_summary)
80 | if get_summary:
81 | graph_handler.add_summary(summary, global_step)
82 |
83 | # Occasional evaluation and saving
84 | if global_step % config.save_period == 0:
85 | graph_handler.save(sess, global_step=global_step)
86 | if global_step % config.eval_period == 0:
87 | num_batches = math.ceil(dev_data.num_examples / config.batch_size)
88 | if 0 < config.eval_num_batches < num_batches:
89 | num_batches = config.eval_num_batches
90 | e = evaluator.get_evaluation_from_batches(
91 | sess, tqdm(dev_data.get_batches(config.batch_size, num_batches=num_batches), total=num_batches))
92 | graph_handler.add_summaries(e.summaries, global_step)
93 | if e.acc > max_acc:
94 | max_acc = e.acc
95 | noupdate_count = 0
96 | else:
97 | noupdate_count += 1
98 | if noupdate_count == config.early_stop:
99 | break
100 | if config.dump_eval:
101 | graph_handler.dump_eval(e)
102 | if global_step % config.save_period != 0:
103 | graph_handler.save(sess, global_step=global_step)
104 |
105 |
106 | def _test(config):
107 | test_data = read_data(config, 'test', True)
108 | update_config(config, [test_data])
109 |
110 | _config_draft(config)
111 |
112 | pprint(config.__flags, indent=2)
113 | model = Model(config)
114 | evaluator = AccuracyEvaluator2(config, model)
115 | graph_handler = GraphHandler(config) # controls all tensors and variables in the graph, including loading /saving
116 |
117 | sess = tf.Session()
118 | graph_handler.initialize(sess)
119 |
120 | num_batches = math.ceil(test_data.num_examples / config.batch_size)
121 | if 0 < config.eval_num_batches < num_batches:
122 | num_batches = config.eval_num_batches
123 | e = evaluator.get_evaluation_from_batches(sess, tqdm(test_data.get_batches(config.batch_size, num_batches=num_batches), total=num_batches))
124 | print(e)
125 | if config.dump_eval:
126 | graph_handler.dump_eval(e)
127 |
128 |
129 | def _forward(config):
130 |
131 | forward_data = read_data(config, 'forward', True)
132 |
133 | _config_draft(config)
134 |
135 |     pprint(config.__flags, indent=2)
136 | model = Model(config)
137 | evaluator = Evaluator(config, model)
138 | graph_handler = GraphHandler(config) # controls all tensors and variables in the graph, including loading /saving
139 |
140 | sess = tf.Session()
141 | graph_handler.initialize(sess)
142 |
143 | num_batches = math.ceil(forward_data.num_examples / config.batch_size)
144 | if 0 < config.eval_num_batches < num_batches:
145 | num_batches = config.eval_num_batches
146 | e = evaluator.get_evaluation_from_batches(sess, tqdm(forward_data.get_batches(config.batch_size, num_batches=num_batches), total=num_batches))
147 | print(e)
148 | if config.dump_eval:
149 | graph_handler.dump_eval(e)
150 |
151 |
152 | def set_dirs(config):
153 | # create directories
154 | if not config.load and os.path.exists(config.out_dir):
155 | shutil.rmtree(config.out_dir)
156 |
157 | config.save_dir = os.path.join(config.out_dir, "save")
158 | config.log_dir = os.path.join(config.out_dir, "log")
159 | config.eval_dir = os.path.join(config.out_dir, "eval")
160 |     if not os.path.exists(config.out_dir):
161 |         os.makedirs(config.out_dir)
162 |     # create the save/log/eval sub-directories under out_dir if they do not exist yet
163 |     for d in (config.save_dir, config.log_dir, config.eval_dir):
164 |         if not os.path.exists(d):
165 |             os.mkdir(d)
166 |
167 |
168 | def _get_args():
169 | parser = argparse.ArgumentParser()
170 | parser.add_argument("config_path")
171 | return parser.parse_args()
172 |
173 |
174 | class Config(object):
175 | def __init__(self, **entries):
176 | self.__dict__.update(entries)
177 |
178 |
179 | def _run():
180 | args = _get_args()
181 | with open(args.config_path, 'r') as fh:
182 | config = Config(**json.load(fh))
183 | main(config)
184 |
185 |
186 | if __name__ == "__main__":
187 | _run()
188 |
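189 | # Usage sketch: this module can be run directly with a JSON config file
190 | # (`python -m tree.main path/to/config.json`, where the JSON supplies every attribute
191 | # the code reads, e.g. mode, out_dir, data_dir); the flag-based entry point is assumed
192 | # to be tree/cli.py, mirroring basic/cli.py.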
--------------------------------------------------------------------------------
/tensorflow/SQuAD/cnn_dm/prepro.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import json
3 | import os
4 | # data: q, cq, (dq), (pq), y, *x, *cx
5 | # shared: x, cx, (dx), (px), word_counter, char_counter, word2vec
6 | # no metadata
7 | from collections import Counter
8 |
9 | from tqdm import tqdm
10 |
11 | from my.utils import process_tokens
12 | from squad.utils import get_word_span, process_tokens
13 |
14 |
15 | def bool_(arg):
16 | if arg == 'True':
17 | return True
18 | elif arg == 'False':
19 | return False
20 | raise Exception(arg)
21 |
22 |
23 | def main():
24 | args = get_args()
25 | prepro(args)
26 |
27 |
28 | def get_args():
29 | parser = argparse.ArgumentParser()
30 | home = os.path.expanduser("~")
31 | source_dir = os.path.join(home, "data", "cnn", 'questions')
32 | target_dir = "data/cnn"
33 | glove_dir = os.path.join(home, "data", "glove")
34 | parser.add_argument("--source_dir", default=source_dir)
35 | parser.add_argument("--target_dir", default=target_dir)
36 | parser.add_argument("--glove_dir", default=glove_dir)
37 | parser.add_argument("--glove_corpus", default='6B')
38 | parser.add_argument("--glove_vec_size", default=100, type=int)
39 | parser.add_argument("--debug", default=False, type=bool_)
40 | parser.add_argument("--num_sents_th", default=200, type=int)
41 | parser.add_argument("--ques_size_th", default=30, type=int)
42 | parser.add_argument("--width", default=5, type=int)
43 | # TODO : put more args here
44 | return parser.parse_args()
45 |
46 |
47 | def prepro(args):
48 | prepro_each(args, 'train')
49 | prepro_each(args, 'dev')
50 | prepro_each(args, 'test')
51 |
52 |
53 | def para2sents(para, width):
54 |     """
55 |     Turn para into a double array of words (wordss), where each "sentence" is the
56 |     window of up to `width` word neighbors around each @entity token.
57 |     :param para: paragraph string with entities marked by a leading "@"
58 |     :return: list of word windows, one per entity occurrence
59 |     """
60 | words = para.split(" ")
61 | sents = []
62 | for i, word in enumerate(words):
63 | if word.startswith("@"):
64 | start = max(i - width, 0)
65 | stop = min(i + width + 1, len(words))
66 | sent = words[start:stop]
67 | sents.append(sent)
68 | return sents
69 |
70 |
71 | def get_word2vec(args, word_counter):
72 | glove_path = os.path.join(args.glove_dir, "glove.{}.{}d.txt".format(args.glove_corpus, args.glove_vec_size))
73 | sizes = {'6B': int(4e5), '42B': int(1.9e6), '840B': int(2.2e6), '2B': int(1.2e6)}
74 | total = sizes[args.glove_corpus]
75 | word2vec_dict = {}
76 | with open(glove_path, 'r', encoding='utf-8') as fh:
77 | for line in tqdm(fh, total=total):
78 | array = line.lstrip().rstrip().split(" ")
79 | word = array[0]
80 | vector = list(map(float, array[1:]))
81 | if word in word_counter:
82 | word2vec_dict[word] = vector
83 | elif word.capitalize() in word_counter:
84 | word2vec_dict[word.capitalize()] = vector
85 | elif word.lower() in word_counter:
86 | word2vec_dict[word.lower()] = vector
87 | elif word.upper() in word_counter:
88 | word2vec_dict[word.upper()] = vector
89 |
90 | print("{}/{} of word vocab have corresponding vectors in {}".format(len(word2vec_dict), len(word_counter), glove_path))
91 | return word2vec_dict
92 |
93 |
94 | def prepro_each(args, mode):
95 | source_dir = os.path.join(args.source_dir, mode)
96 | word_counter = Counter()
97 | lower_word_counter = Counter()
98 | ent_counter = Counter()
99 | char_counter = Counter()
100 | max_sent_size = 0
101 | max_word_size = 0
102 | max_ques_size = 0
103 | max_num_sents = 0
104 |
105 | file_names = list(os.listdir(source_dir))
106 | if args.debug:
107 | file_names = file_names[:1000]
108 | lens = []
109 |
110 | out_file_names = []
111 | for file_name in tqdm(file_names, total=len(file_names)):
112 | if file_name.endswith(".question"):
113 | with open(os.path.join(source_dir, file_name), 'r') as fh:
114 | url = fh.readline().strip()
115 | _ = fh.readline()
116 | para = fh.readline().strip()
117 | _ = fh.readline()
118 | ques = fh.readline().strip()
119 | _ = fh.readline()
120 | answer = fh.readline().strip()
121 | _ = fh.readline()
122 | cands = list(line.strip() for line in fh)
123 | cand_ents = list(cand.split(":")[0] for cand in cands)
124 | sents = para2sents(para, args.width)
125 | ques_words = ques.split(" ")
126 |
127 | # Filtering
128 | if len(sents) > args.num_sents_th or len(ques_words) > args.ques_size_th:
129 | continue
130 |
131 | max_sent_size = max(max(map(len, sents)), max_sent_size)
132 | max_ques_size = max(len(ques_words), max_ques_size)
133 | max_word_size = max(max(len(word) for sent in sents for word in sent), max_word_size)
134 | max_num_sents = max(len(sents), max_num_sents)
135 |
136 | for word in ques_words:
137 | if word.startswith("@"):
138 | ent_counter[word] += 1
139 | word_counter[word] += 1
140 | else:
141 | word_counter[word] += 1
142 | lower_word_counter[word.lower()] += 1
143 | for c in word:
144 | char_counter[c] += 1
145 | for sent in sents:
146 | for word in sent:
147 | if word.startswith("@"):
148 | ent_counter[word] += 1
149 | word_counter[word] += 1
150 | else:
151 | word_counter[word] += 1
152 | lower_word_counter[word.lower()] += 1
153 | for c in word:
154 | char_counter[c] += 1
155 |
156 | out_file_names.append(file_name)
157 | lens.append(len(sents))
158 | num_examples = len(out_file_names)
159 |
160 | assert len(out_file_names) == len(lens)
161 | sorted_file_names, lens = zip(*sorted(zip(out_file_names, lens), key=lambda each: each[1]))
162 | assert lens[-1] == max_num_sents
163 |
164 | word2vec_dict = get_word2vec(args, word_counter)
165 |     lower_word2vec_dict = get_word2vec(args, lower_word_counter)
166 |
167 | shared = {'word_counter': word_counter, 'ent_counter': ent_counter, 'char_counter': char_counter,
168 | 'lower_word_counter': lower_word_counter,
169 | 'max_num_sents': max_num_sents, 'max_sent_size': max_sent_size, 'max_word_size': max_word_size,
170 | 'max_ques_size': max_ques_size,
171 |               'word2vec': word2vec_dict, 'lower_word2vec': lower_word2vec_dict, 'sorted': sorted_file_names,
172 | 'num_examples': num_examples}
173 |
174 | print("max num sents: {}".format(max_num_sents))
175 | print("max ques size: {}".format(max_ques_size))
176 |
177 | if not os.path.exists(args.target_dir):
178 | os.makedirs(args.target_dir)
179 | shared_path = os.path.join(args.target_dir, "shared_{}.json".format(mode))
180 | with open(shared_path, 'w') as fh:
181 | json.dump(shared, fh)
182 |
183 |
184 | if __name__ == "__main__":
185 | main()
186 |
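187 | # Worked example of para2sents (width=2; the sentence is illustrative):
188 | #   para2sents("the suspect @entity1 was seen near @entity2 yesterday", 2)
189 | #   -> [['the', 'suspect', '@entity1', 'was', 'seen'],
190 | #       ['seen', 'near', '@entity2', 'yesterday']]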
--------------------------------------------------------------------------------
/tensorflow/SQuAD/squad/prepro_aug.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import json
3 | import os
4 | # data: q, cq, (dq), (pq), y, *x, *cx
5 | # shared: x, cx, (dx), (px), word_counter, char_counter, word2vec
6 | # no metadata
7 | from collections import Counter
8 |
9 | import nltk
10 | from tqdm import tqdm
11 |
12 | from my.nltk_utils import load_compressed_tree
13 |
14 |
15 | def bool_(arg):
16 | if arg == 'True':
17 | return True
18 | elif arg == 'False':
19 | return False
20 |     raise Exception(arg)
21 |
22 |
23 | def main():
24 | args = get_args()
25 | prepro(args)
26 |
27 |
28 | def get_args():
29 | parser = argparse.ArgumentParser()
30 | home = os.path.expanduser("~")
31 | source_dir = os.path.join(home, "data", "squad")
32 | target_dir = "data/squad"
33 | glove_dir = os.path.join(home, "data", "glove")
34 | parser.add_argument("--source_dir", default=source_dir)
35 | parser.add_argument("--target_dir", default=target_dir)
36 | parser.add_argument("--debug", default=False, type=bool_)
37 |     parser.add_argument("--train_ratio", default=0.9, type=float)
38 | parser.add_argument("--glove_corpus", default="6B")
39 | parser.add_argument("--glove_dir", default=glove_dir)
40 | parser.add_argument("--glove_vec_size", default=100, type=int)
41 | parser.add_argument("--full_train", default=False, type=bool_)
42 | # TODO : put more args here
43 | return parser.parse_args()
44 |
45 |
46 | def prepro(args):
47 | if not os.path.exists(args.target_dir):
48 | os.makedirs(args.target_dir)
49 |
50 | if args.full_train:
51 | data_train, shared_train = prepro_each(args, 'train')
52 | data_dev, shared_dev = prepro_each(args, 'dev')
53 | else:
54 | data_train, shared_train = prepro_each(args, 'train', 0.0, args.train_ratio)
55 | data_dev, shared_dev = prepro_each(args, 'train', args.train_ratio, 1.0)
56 | data_test, shared_test = prepro_each(args, 'dev')
57 |
58 | print("saving ...")
59 | save(args, data_train, shared_train, 'train')
60 | save(args, data_dev, shared_dev, 'dev')
61 | save(args, data_test, shared_test, 'test')
62 |
63 |
64 | def save(args, data, shared, data_type):
65 | data_path = os.path.join(args.target_dir, "data_{}.json".format(data_type))
66 | shared_path = os.path.join(args.target_dir, "shared_{}.json".format(data_type))
67 | json.dump(data, open(data_path, 'w'))
68 | json.dump(shared, open(shared_path, 'w'))
69 |
70 |
71 | def get_word2vec(args, word_counter):
72 | glove_path = os.path.join(args.glove_dir, "glove.{}.{}d.txt".format(args.glove_corpus, args.glove_vec_size))
73 | sizes = {'6B': int(4e5), '42B': int(1.9e6), '840B': int(2.2e6), '2B': int(1.2e6)}
74 | total = sizes[args.glove_corpus]
75 | word2vec_dict = {}
76 | with open(glove_path, 'r') as fh:
77 | for line in tqdm(fh, total=total):
78 | array = line.lstrip().rstrip().split(" ")
79 | word = array[0]
80 | vector = list(map(float, array[1:]))
81 | if word in word_counter:
82 | word2vec_dict[word] = vector
83 | elif word.capitalize() in word_counter:
84 | word2vec_dict[word.capitalize()] = vector
85 | elif word.lower() in word_counter:
86 | word2vec_dict[word.lower()] = vector
87 | elif word.upper() in word_counter:
88 | word2vec_dict[word.upper()] = vector
89 |
90 | print("{}/{} of word vocab have corresponding vectors in {}".format(len(word2vec_dict), len(word_counter), glove_path))
91 | return word2vec_dict
92 |
93 |
94 | def prepro_each(args, data_type, start_ratio=0.0, stop_ratio=1.0):
95 | source_path = os.path.join(args.source_dir, "{}-v1.0-aug.json".format(data_type))
96 | source_data = json.load(open(source_path, 'r'))
97 |
98 | q, cq, y, rx, rcx, ids, idxs = [], [], [], [], [], [], []
99 | x, cx, tx, stx = [], [], [], []
100 | answerss = []
101 | word_counter, char_counter, lower_word_counter = Counter(), Counter(), Counter()
102 | pos_counter = Counter()
103 | start_ai = int(round(len(source_data['data']) * start_ratio))
104 | stop_ai = int(round(len(source_data['data']) * stop_ratio))
105 | for ai, article in enumerate(tqdm(source_data['data'][start_ai:stop_ai])):
106 | xp, cxp, txp, stxp = [], [], [], []
107 | x.append(xp)
108 | cx.append(cxp)
109 | tx.append(txp)
110 | stx.append(stxp)
111 | for pi, para in enumerate(article['paragraphs']):
112 | xi = []
113 | for dep in para['deps']:
114 | if dep is None:
115 | xi.append([])
116 | else:
117 | xi.append([node[0] for node in dep[0]])
118 | cxi = [[list(xijk) for xijk in xij] for xij in xi]
119 | xp.append(xi)
120 | cxp.append(cxi)
121 | txp.append(para['consts'])
122 | stxp.append([str(load_compressed_tree(s)) for s in para['consts']])
123 | trees = map(nltk.tree.Tree.fromstring, para['consts'])
124 | for tree in trees:
125 | for subtree in tree.subtrees():
126 | pos_counter[subtree.label()] += 1
127 |
128 | for xij in xi:
129 | for xijk in xij:
130 | word_counter[xijk] += len(para['qas'])
131 | lower_word_counter[xijk.lower()] += len(para['qas'])
132 | for xijkl in xijk:
133 | char_counter[xijkl] += len(para['qas'])
134 |
135 | rxi = [ai, pi]
136 | assert len(x) - 1 == ai
137 | assert len(x[ai]) - 1 == pi
138 | for qa in para['qas']:
139 | dep = qa['dep']
140 | qi = [] if dep is None else [node[0] for node in dep[0]]
141 | cqi = [list(qij) for qij in qi]
142 | yi = []
143 | answers = []
144 | for answer in qa['answers']:
145 | answers.append(answer['text'])
146 | yi0 = answer['answer_word_start'] or [0, 0]
147 | yi1 = answer['answer_word_stop'] or [0, 1]
148 | assert len(xi[yi0[0]]) > yi0[1]
149 | assert len(xi[yi1[0]]) >= yi1[1]
150 | yi.append([yi0, yi1])
151 |
152 | for qij in qi:
153 | word_counter[qij] += 1
154 | lower_word_counter[qij.lower()] += 1
155 | for qijk in qij:
156 | char_counter[qijk] += 1
157 |
158 | q.append(qi)
159 | cq.append(cqi)
160 | y.append(yi)
161 | rx.append(rxi)
162 | rcx.append(rxi)
163 | ids.append(qa['id'])
164 | idxs.append(len(idxs))
165 | answerss.append(answers)
166 |
167 | if args.debug:
168 | break
169 |
170 | word2vec_dict = get_word2vec(args, word_counter)
171 | lower_word2vec_dict = get_word2vec(args, lower_word_counter)
172 |
173 | data = {'q': q, 'cq': cq, 'y': y, '*x': rx, '*cx': rcx, '*tx': rx, '*stx': rx,
174 | 'idxs': idxs, 'ids': ids, 'answerss': answerss}
175 | shared = {'x': x, 'cx': cx, 'tx': tx, 'stx': stx,
176 | 'word_counter': word_counter, 'char_counter': char_counter, 'lower_word_counter': lower_word_counter,
177 | 'word2vec': word2vec_dict, 'lower_word2vec': lower_word2vec_dict, 'pos_counter': pos_counter}
178 |
179 | return data, shared
180 |
181 |
182 | if __name__ == "__main__":
183 | main()
--------------------------------------------------------------------------------
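A note on the output format of the preprocessing above: save() writes a data_*.json / shared_*.json pair per split, and the starred keys in data (`*x`, `*cx`, `*tx`, `*stx`) hold [article_index, paragraph_index] pairs that dereference into the corresponding lists in shared. A minimal sketch of how a consumer could resolve one example, assuming the file names written by save(); the get_example helper below is hypothetical, not part of the repo:

```python
import json

def get_example(data_path="data_dev.json", shared_path="shared_dev.json", k=0):
    """Resolve the k-th question against the shared article/paragraph store."""
    data = json.load(open(data_path, 'r'))
    shared = json.load(open(shared_path, 'r'))
    ai, pi = data['*x'][k]               # article / paragraph indices
    context_sents = shared['x'][ai][pi]  # tokenized sentences of the paragraph
    question = data['q'][k]              # tokenized question
    answers = data['answerss'][k]        # raw answer strings
    spans = data['y'][k]                 # [[sent, start_word], [sent, stop_word]] per answer
    return context_sents, question, answers, spans
```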
/tensorflow/SQuAD/tree/test.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import nltk\n",
12 | "import matplotlib.pyplot as plt\n",
13 | "%matplotlib inline"
14 | ]
15 | },
16 | {
17 | "cell_type": "code",
18 | "execution_count": 10,
19 | "metadata": {
20 | "collapsed": false
21 | },
22 | "outputs": [
23 | {
24 | "name": "stdout",
25 | "output_type": "stream",
26 | "text": [
27 | "(S (PRP I) (VP (VBP am) (NNP Sam)) (. .))\n",
28 | "(PRP I)\n",
29 | "(VP (VBP am) (NNP Sam))\n",
30 | "(VBP am)\n",
31 | "(NNP Sam)\n",
32 | "(. .)\n",
33 | "(S (PRP I) (VP (VBP am) (NNP Sam)) (. .))\n"
34 | ]
35 | }
36 | ],
37 | "source": [
38 | "string = \"(ROOT(S(NP (PRP I))(VP (VBP am)(NP (NNP Sam)))(. .)))\"\n",
39 | "tree = nltk.tree.Tree.fromstring(string)\n",
40 | "\n",
41 | "def load_compressed_tree(s):\n",
42 | "\n",
43 | " def compress_tree(tree):\n",
44 | " if len(tree) == 1:\n",
45 | " if isinstance(tree[0], nltk.tree.Tree):\n",
46 | " return compress_tree(tree[0])\n",
47 | " else:\n",
48 | " return tree\n",
49 | " else:\n",
50 | " for i, t in enumerate(tree):\n",
51 | " tree[i] = compress_tree(t)\n",
52 | " return tree\n",
53 | "\n",
54 | " return compress_tree(nltk.tree.Tree.fromstring(s))\n",
55 | "tree = load_compressed_tree(string)\n",
56 | "for t in tree.subtrees():\n",
57 | " print(t)\n",
58 | " \n",
59 | "print(str(tree))"
60 | ]
61 | },
62 | {
63 | "cell_type": "code",
64 | "execution_count": 3,
65 | "metadata": {
66 | "collapsed": false
67 | },
68 | "outputs": [
69 | {
70 | "name": "stdout",
71 | "output_type": "stream",
72 | "text": [
73 | "(ROOT I am Sam .)\n"
74 | ]
75 | }
76 | ],
77 | "source": [
78 | "print(tree.flatten())"
79 | ]
80 | },
81 | {
82 | "cell_type": "code",
83 | "execution_count": 10,
84 | "metadata": {
85 | "collapsed": false
86 | },
87 | "outputs": [
88 | {
89 | "name": "stdout",
90 | "output_type": "stream",
91 | "text": [
92 | "['ROOT', 'S', 'NP', 'PRP', 'VP', 'VBP', 'NP', 'NNP', '.']\n"
93 | ]
94 | }
95 | ],
96 | "source": [
97 | "print(list(t.label() for t in tree.subtrees()))"
98 | ]
99 | },
100 | {
101 | "cell_type": "code",
102 | "execution_count": 11,
103 | "metadata": {
104 | "collapsed": true
105 | },
106 | "outputs": [],
107 | "source": [
108 | "import json\n",
109 | "d = json.load(open(\"data/squad/shared_dev.json\", 'r'))"
110 | ]
111 | },
112 | {
113 | "cell_type": "code",
114 | "execution_count": 12,
115 | "metadata": {
116 | "collapsed": false
117 | },
118 | "outputs": [
119 | {
120 | "data": {
121 | "text/plain": [
122 | "73"
123 | ]
124 | },
125 | "execution_count": 12,
126 | "metadata": {},
127 | "output_type": "execute_result"
128 | }
129 | ],
130 | "source": [
131 | "len(d['pos_counter'])"
132 | ]
133 | },
134 | {
135 | "cell_type": "code",
136 | "execution_count": 13,
137 | "metadata": {
138 | "collapsed": false
139 | },
140 | "outputs": [
141 | {
142 | "data": {
143 | "text/plain": [
144 | "{'#': 6,\n",
145 | " '$': 80,\n",
146 | " \"''\": 1291,\n",
147 | " ',': 14136,\n",
148 | " '-LRB-': 1926,\n",
149 | " '-RRB-': 1925,\n",
150 | " '.': 9505,\n",
151 | " ':': 1455,\n",
152 | " 'ADJP': 3426,\n",
153 | " 'ADVP': 4936,\n",
154 | " 'CC': 9300,\n",
155 | " 'CD': 6216,\n",
156 | " 'CONJP': 191,\n",
157 | " 'DT': 26286,\n",
158 | " 'EX': 288,\n",
159 | " 'FRAG': 107,\n",
160 | " 'FW': 96,\n",
161 | " 'IN': 32564,\n",
162 | " 'INTJ': 12,\n",
163 | " 'JJ': 21452,\n",
164 | " 'JJR': 563,\n",
165 | " 'JJS': 569,\n",
166 | " 'LS': 7,\n",
167 | " 'LST': 1,\n",
168 | " 'MD': 1051,\n",
169 | " 'NAC': 19,\n",
170 | " 'NN': 34750,\n",
171 | " 'NNP': 28392,\n",
172 | " 'NNPS': 1400,\n",
173 | " 'NNS': 16716,\n",
174 | " 'NP': 91636,\n",
175 | " 'NP-TMP': 236,\n",
176 | " 'NX': 108,\n",
177 | " 'PDT': 89,\n",
178 | " 'POS': 1451,\n",
179 | " 'PP': 33278,\n",
180 | " 'PRN': 2085,\n",
181 | " 'PRP': 2320,\n",
182 | " 'PRP$': 1959,\n",
183 | " 'PRT': 450,\n",
184 | " 'QP': 838,\n",
185 | " 'RB': 7611,\n",
186 | " 'RBR': 301,\n",
187 | " 'RBS': 252,\n",
188 | " 'ROOT': 9587,\n",
189 | " 'RP': 454,\n",
190 | " 'RRC': 19,\n",
191 | " 'S': 21557,\n",
192 | " 'SBAR': 5009,\n",
193 | " 'SBARQ': 6,\n",
194 | " 'SINV': 135,\n",
195 | " 'SQ': 5,\n",
196 | " 'SYM': 17,\n",
197 | " 'TO': 5167,\n",
198 | " 'UCP': 143,\n",
199 | " 'UH': 15,\n",
200 | " 'VB': 4197,\n",
201 | " 'VBD': 8377,\n",
202 | " 'VBG': 3570,\n",
203 | " 'VBN': 7218,\n",
204 | " 'VBP': 2897,\n",
205 | " 'VBZ': 4146,\n",
206 | " 'VP': 33696,\n",
207 | " 'WDT': 1368,\n",
208 | " 'WHADJP': 5,\n",
209 | " 'WHADVP': 439,\n",
210 | " 'WHNP': 1927,\n",
211 | " 'WHPP': 153,\n",
212 | " 'WP': 482,\n",
213 | " 'WP$': 50,\n",
214 | " 'WRB': 442,\n",
215 | " 'X': 23,\n",
216 | " '``': 1269}"
217 | ]
218 | },
219 | "execution_count": 13,
220 | "metadata": {},
221 | "output_type": "execute_result"
222 | }
223 | ],
224 | "source": [
225 | "d['pos_counter']"
226 | ]
227 | },
228 | {
229 | "cell_type": "code",
230 | "execution_count": 3,
231 | "metadata": {
232 | "collapsed": false
233 | },
234 | "outputs": [
235 | {
236 | "name": "stdout",
237 | "output_type": "stream",
238 | "text": [
239 | "[[False False False False]\n",
240 | " [False True False False]\n",
241 | " [False False False False]]\n",
242 | "[[0 2 2 0]\n",
243 | " [2 2 0 2]\n",
244 | " [2 0 0 0]]\n"
245 | ]
246 | }
247 | ],
248 | "source": [
249 | "from my.nltk_utils import tree2matrix, load_compressed_tree, find_max_f1_subtree, set_span\n",
250 | "string = \"(ROOT(S(NP (PRP I))(VP (VBP am)(NP (NNP Sam)))(. .)))\"\n",
251 | "tree = load_compressed_tree(string)\n",
252 | "span = (1, 3)\n",
253 | "set_span(tree)\n",
254 | "subtree = find_max_f1_subtree(tree, span)\n",
255 | "f = lambda t: t == subtree\n",
256 | "g = lambda t: 1 if isinstance(t, str) else 2\n",
257 | "a, b = tree2matrix(tree, f, dtype='bool')\n",
258 | "c, d = tree2matrix(tree, g, dtype='int32')\n",
259 | "print(a)\n",
260 | "print(c)"
261 | ]
262 | },
263 | {
264 | "cell_type": "code",
265 | "execution_count": null,
266 | "metadata": {
267 | "collapsed": true
268 | },
269 | "outputs": [],
270 | "source": []
271 | }
272 | ],
273 | "metadata": {
274 | "kernelspec": {
275 | "display_name": "Python 3",
276 | "language": "python",
277 | "name": "python3"
278 | },
279 | "language_info": {
280 | "codemirror_mode": {
281 | "name": "ipython",
282 | "version": 3
283 | },
284 | "file_extension": ".py",
285 | "mimetype": "text/x-python",
286 | "name": "python",
287 | "nbconvert_exporter": "python",
288 | "pygments_lexer": "ipython3",
289 | "version": "3.5.1"
290 | }
291 | },
292 | "nbformat": 4,
293 | "nbformat_minor": 0
294 | }
295 |
--------------------------------------------------------------------------------
/tensorflow/SQuAD/squad/eda_aug_dev.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import json\n",
12 | "\n",
13 | "aug_data_path = \"/Users/minjoons/data/squad/dev-v1.0-aug.json\"\n",
14 | "aug_data = json.load(open(aug_data_path, 'r'))"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": 17,
20 | "metadata": {
21 | "collapsed": false
22 | },
23 | "outputs": [
24 | {
25 | "name": "stdout",
26 | "output_type": "stream",
27 | "text": [
28 | "(['Denver', 'Broncos'], 'Denver Broncos')\n",
29 | "(['Denver', 'Broncos'], 'Denver Broncos')\n",
30 | "(['Denver', 'Broncos'], 'Denver Broncos ')\n",
31 | "(['Carolina', 'Panthers'], 'Carolina Panthers')\n"
32 | ]
33 | }
34 | ],
35 | "source": [
36 | "def compare_answers():\n",
37 | " for article in aug_data['data']:\n",
38 | " for para in article['paragraphs']:\n",
39 | " deps = para['deps']\n",
40 | " nodess = []\n",
41 | " for dep in deps:\n",
42 | " nodes, edges = dep\n",
43 | " if dep is not None:\n",
44 | " nodess.append(nodes)\n",
45 | " else:\n",
46 | " nodess.append([])\n",
47 | " wordss = [[node[0] for node in nodes] for nodes in nodess]\n",
48 | " for qa in para['qas']:\n",
49 | " for answer in qa['answers']:\n",
50 | " text = answer['text']\n",
51 | " word_start = answer['answer_word_start']\n",
52 | " word_stop = answer['answer_word_stop']\n",
53 | " answer_words = wordss[word_start[0]][word_start[1]:word_stop[1]]\n",
54 | " yield answer_words, text\n",
55 | "\n",
56 | "ca = compare_answers()\n",
57 | "print(next(ca))\n",
58 | "print(next(ca))\n",
59 | "print(next(ca))\n",
60 | "print(next(ca))"
61 | ]
62 | },
63 | {
64 | "cell_type": "code",
65 | "execution_count": 18,
66 | "metadata": {
67 | "collapsed": false
68 | },
69 | "outputs": [
70 | {
71 | "name": "stdout",
72 | "output_type": "stream",
73 | "text": [
74 | "8\n"
75 | ]
76 | }
77 | ],
78 | "source": [
79 | "def counter():\n",
80 | " count = 0\n",
81 | " for article in aug_data['data']:\n",
82 | " for para in article['paragraphs']:\n",
83 | " deps = para['deps']\n",
84 | " nodess = []\n",
85 | " for dep in deps:\n",
86 | " if dep is None:\n",
87 | " count += 1\n",
88 | " print(count)\n",
89 | "counter()\n"
90 | ]
91 | },
92 | {
93 | "cell_type": "code",
94 | "execution_count": 19,
95 | "metadata": {
96 | "collapsed": false
97 | },
98 | "outputs": [
99 | {
100 | "name": "stdout",
101 | "output_type": "stream",
102 | "text": [
103 | "0\n"
104 | ]
105 | }
106 | ],
107 | "source": [
108 | "def bad_node_counter():\n",
109 | " count = 0\n",
110 | " for article in aug_data['data']:\n",
111 | " for para in article['paragraphs']:\n",
112 | " sents = para['sents']\n",
113 | " deps = para['deps']\n",
114 | " nodess = []\n",
115 | " for dep in deps:\n",
116 | " if dep is not None:\n",
117 | " nodes, edges = dep\n",
118 | " for node in nodes:\n",
119 | " if len(node) != 5:\n",
120 | " count += 1\n",
121 | " print(count)\n",
122 | "bad_node_counter() "
123 | ]
124 | },
125 | {
126 | "cell_type": "code",
127 | "execution_count": 20,
128 | "metadata": {
129 | "collapsed": false
130 | },
131 | "outputs": [
132 | {
133 | "name": "stdout",
134 | "output_type": "stream",
135 | "text": [
136 | "7\n"
137 | ]
138 | }
139 | ],
140 | "source": [
141 | "def noanswer_counter():\n",
142 | " count = 0\n",
143 | " for article in aug_data['data']:\n",
144 | " for para in article['paragraphs']:\n",
145 | " deps = para['deps']\n",
146 | " nodess = []\n",
147 | " for dep in deps:\n",
148 | " if dep is not None:\n",
149 | " nodes, edges = dep\n",
150 | " nodess.append(nodes)\n",
151 | " else:\n",
152 | " nodess.append([])\n",
153 | " wordss = [[node[0] for node in nodes] for nodes in nodess]\n",
154 | " for qa in para['qas']:\n",
155 | " for answer in qa['answers']:\n",
156 | " text = answer['text']\n",
157 | " word_start = answer['answer_word_start']\n",
158 | " word_stop = answer['answer_word_stop']\n",
159 | " if word_start is None:\n",
160 | " count += 1\n",
161 | " print(count)\n",
162 | "noanswer_counter()"
163 | ]
164 | },
165 | {
166 | "cell_type": "code",
167 | "execution_count": 22,
168 | "metadata": {
169 | "collapsed": false
170 | },
171 | "outputs": [
172 | {
173 | "name": "stdout",
174 | "output_type": "stream",
175 | "text": [
176 | "10600\n"
177 | ]
178 | }
179 | ],
180 | "source": [
181 | "print(sum(len(para['qas']) for a in aug_data['data'] for para in a['paragraphs']))"
182 | ]
183 | },
184 | {
185 | "cell_type": "code",
186 | "execution_count": 5,
187 | "metadata": {
188 | "collapsed": false
189 | },
190 | "outputs": [
191 | {
192 | "name": "stdout",
193 | "output_type": "stream",
194 | "text": [
195 | "10348\n"
196 | ]
197 | }
198 | ],
199 | "source": [
200 | "import nltk\n",
201 | "\n",
202 | "def _set_span(t, i):\n",
203 | " if isinstance(t[0], str):\n",
204 | " t.span = (i, i+len(t))\n",
205 | " else:\n",
206 | " first = True\n",
207 | " for c in t:\n",
208 | " cur_span = _set_span(c, i)\n",
209 | " i = cur_span[1]\n",
210 | " if first:\n",
211 | " min_ = cur_span[0]\n",
212 | " first = False\n",
213 | " max_ = cur_span[1]\n",
214 | " t.span = (min_, max_)\n",
215 | " return t.span\n",
216 | "\n",
217 | "\n",
218 | "def set_span(t):\n",
219 | " assert isinstance(t, nltk.tree.Tree)\n",
220 | " try:\n",
221 | " return _set_span(t, 0)\n",
222 | " except:\n",
223 | " print(t)\n",
224 | " exit()\n",
225 | "\n",
226 | "def same_span_counter():\n",
227 | " count = 0\n",
228 | " for article in aug_data['data']:\n",
229 | " for para in article['paragraphs']:\n",
230 | " consts = para['consts']\n",
231 | " for const in consts:\n",
232 | " tree = nltk.tree.Tree.fromstring(const)\n",
233 | " set_span(tree)\n",
234 | " if len(list(tree.subtrees())) > len(set(t.span for t in tree.subtrees())):\n",
235 | " count += 1\n",
236 | " print(count)\n",
237 | "same_span_counter()"
238 | ]
239 | },
240 | {
241 | "cell_type": "code",
242 | "execution_count": null,
243 | "metadata": {
244 | "collapsed": true
245 | },
246 | "outputs": [],
247 | "source": []
248 | }
249 | ],
250 | "metadata": {
251 | "kernelspec": {
252 | "display_name": "Python 3",
253 | "language": "python",
254 | "name": "python3"
255 | },
256 | "language_info": {
257 | "codemirror_mode": {
258 | "name": "ipython",
259 | "version": 3
260 | },
261 | "file_extension": ".py",
262 | "mimetype": "text/x-python",
263 | "name": "python",
264 | "nbconvert_exporter": "python",
265 | "pygments_lexer": "ipython3",
266 | "version": "3.5.1"
267 | }
268 | },
269 | "nbformat": 4,
270 | "nbformat_minor": 0
271 | }
272 |
--------------------------------------------------------------------------------
/tensorflow/SQuAD/my/tensorflow/nn.py:
--------------------------------------------------------------------------------
1 | from tensorflow.python.ops.rnn_cell_impl import _linear
2 | from tensorflow.python.util import nest
3 | import tensorflow as tf
4 |
5 | from my.tensorflow import flatten, reconstruct, add_wd, exp_mask
6 |
7 |
8 | def linear(args, output_size, bias, bias_start=0.0, scope=None, squeeze=False, wd=0.0, input_keep_prob=1.0,
9 | is_train=None):
10 | if args is None or (nest.is_sequence(args) and not args):
11 | raise ValueError("`args` must be specified")
12 | if not nest.is_sequence(args):
13 | args = [args]
14 |
15 | flat_args = [flatten(arg, 1) for arg in args]
16 | if input_keep_prob < 1.0:
17 | assert is_train is not None
18 | flat_args = [tf.cond(is_train, lambda: tf.nn.dropout(arg, input_keep_prob), lambda: arg)
19 | for arg in flat_args]
20 | with tf.variable_scope(scope or 'Linear'):
21 | flat_out = _linear(flat_args, output_size, bias, bias_initializer=tf.constant_initializer(bias_start))
22 | out = reconstruct(flat_out, args[0], 1)
23 | if squeeze:
24 | out = tf.squeeze(out, [len(args[0].get_shape().as_list())-1])
25 | if wd:
26 | add_wd(wd)
27 |
28 | return out
29 |
30 |
31 | def dropout(x, keep_prob, is_train, noise_shape=None, seed=None, name=None):
32 | with tf.name_scope(name or "dropout"):
33 | if keep_prob < 1.0:
34 | d = tf.nn.dropout(x, keep_prob, noise_shape=noise_shape, seed=seed)
35 | out = tf.cond(is_train, lambda: d, lambda: x)
36 | return out
37 | return x
38 |
39 |
40 | def softmax(logits, mask=None, scope=None):
41 | with tf.name_scope(scope or "Softmax"):
42 | if mask is not None:
43 | logits = exp_mask(logits, mask)
44 | flat_logits = flatten(logits, 1)
45 | flat_out = tf.nn.softmax(flat_logits)
46 | out = reconstruct(flat_out, logits, 1)
47 |
48 | return out
49 |
50 |
51 | def softsel(target, logits, mask=None, scope=None):
52 | """
53 |
54 | :param target: [ ..., J, d] dtype=float
55 | :param logits: [ ..., J], dtype=float
56 | :param mask: [ ..., J], dtype=bool
57 | :param scope:
58 | :return: [..., d], dtype=float
59 | """
60 | with tf.name_scope(scope or "Softsel"):
61 | a = softmax(logits, mask=mask)
62 | target_rank = len(target.get_shape().as_list())
63 | out = tf.reduce_sum(tf.expand_dims(a, -1) * target, target_rank - 2)
64 | return out
65 |
66 |
67 | def double_linear_logits(args, size, bias, bias_start=0.0, scope=None, mask=None, wd=0.0, input_keep_prob=1.0, is_train=None):
68 | with tf.variable_scope(scope or "Double_Linear_Logits"):
69 | first = tf.tanh(linear(args, size, bias, bias_start=bias_start, scope='first',
70 | wd=wd, input_keep_prob=input_keep_prob, is_train=is_train))
71 | second = linear(first, 1, bias, bias_start=bias_start, squeeze=True, scope='second',
72 | wd=wd, input_keep_prob=input_keep_prob, is_train=is_train)
73 | if mask is not None:
74 | second = exp_mask(second, mask)
75 | return second
76 |
77 |
78 | def linear_logits(args, bias, bias_start=0.0, scope=None, mask=None, wd=0.0, input_keep_prob=1.0, is_train=None):
79 | with tf.variable_scope(scope or "Linear_Logits"):
80 | logits = linear(args, 1, bias, bias_start=bias_start, squeeze=True, scope='first',
81 | wd=wd, input_keep_prob=input_keep_prob, is_train=is_train)
82 | if mask is not None:
83 | logits = exp_mask(logits, mask)
84 | return logits
85 |
86 |
87 | def sum_logits(args, mask=None, name=None):
88 | with tf.name_scope(name or "sum_logits"):
89 | if args is None or (nest.is_sequence(args) and not args):
90 | raise ValueError("`args` must be specified")
91 | if not nest.is_sequence(args):
92 | args = [args]
93 | rank = len(args[0].get_shape())
94 | logits = sum(tf.reduce_sum(arg, rank-1) for arg in args)
95 | if mask is not None:
96 | logits = exp_mask(logits, mask)
97 | return logits
98 |
99 |
100 | def get_logits(args, size, bias, bias_start=0.0, scope=None, mask=None, wd=0.0, input_keep_prob=1.0, is_train=None, func=None):
101 | if func is None:
102 | func = "sum"
103 | if func == 'sum':
104 | return sum_logits(args, mask=mask, name=scope)
105 | elif func == 'linear':
106 | return linear_logits(args, bias, bias_start=bias_start, scope=scope, mask=mask, wd=wd, input_keep_prob=input_keep_prob,
107 | is_train=is_train)
108 | elif func == 'double':
109 | return double_linear_logits(args, size, bias, bias_start=bias_start, scope=scope, mask=mask, wd=wd, input_keep_prob=input_keep_prob,
110 | is_train=is_train)
111 | elif func == 'dot':
112 | assert len(args) == 2
113 | arg = args[0] * args[1]
114 | return sum_logits([arg], mask=mask, name=scope)
115 | elif func == 'mul_linear':
116 | assert len(args) == 2
117 | arg = args[0] * args[1]
118 | return linear_logits([arg], bias, bias_start=bias_start, scope=scope, mask=mask, wd=wd, input_keep_prob=input_keep_prob,
119 | is_train=is_train)
120 | elif func == 'proj':
121 | assert len(args) == 2
122 | d = args[1].get_shape()[-1]
123 | proj = linear([args[0]], d, False, bias_start=bias_start, scope=scope, wd=wd, input_keep_prob=input_keep_prob,
124 | is_train=is_train)
125 | return sum_logits([proj * args[1]], mask=mask)
126 | elif func == 'tri_linear':
127 | assert len(args) == 2
128 | new_arg = args[0] * args[1]
129 | return linear_logits([args[0], args[1], new_arg], bias, bias_start=bias_start, scope=scope, mask=mask, wd=wd, input_keep_prob=input_keep_prob,
130 | is_train=is_train)
131 | else:
132 |         raise ValueError("Unknown logits function: {}".format(func))
133 |
134 |
135 | def highway_layer(arg, bias, bias_start=0.0, scope=None, wd=0.0, input_keep_prob=1.0, is_train=None):
136 | with tf.variable_scope(scope or "highway_layer"):
137 | d = arg.get_shape()[-1]
138 | trans = linear([arg], d, bias, bias_start=bias_start, scope='trans', wd=wd, input_keep_prob=input_keep_prob, is_train=is_train)
139 | trans = tf.nn.relu(trans)
140 | gate = linear([arg], d, bias, bias_start=bias_start, scope='gate', wd=wd, input_keep_prob=input_keep_prob, is_train=is_train)
141 | gate = tf.nn.sigmoid(gate)
142 | out = gate * trans + (1 - gate) * arg
143 | return out
144 |
145 |
146 | def highway_network(arg, num_layers, bias, bias_start=0.0, scope=None, wd=0.0, input_keep_prob=1.0, is_train=None):
147 | with tf.variable_scope(scope or "highway_network"):
148 | prev = arg
149 | cur = None
150 | for layer_idx in range(num_layers):
151 | cur = highway_layer(prev, bias, bias_start=bias_start, scope="layer_{}".format(layer_idx), wd=wd,
152 | input_keep_prob=input_keep_prob, is_train=is_train)
153 | prev = cur
154 | return cur
155 |
156 |
157 | def conv1d(in_, filter_size, height, padding, is_train=None, keep_prob=1.0, scope=None):
158 | with tf.variable_scope(scope or "conv1d"):
159 | num_channels = in_.get_shape()[-1]
160 | filter_ = tf.get_variable("filter", shape=[1, height, num_channels, filter_size], dtype='float')
161 | bias = tf.get_variable("bias", shape=[filter_size], dtype='float')
162 | strides = [1, 1, 1, 1]
163 | if is_train is not None and keep_prob < 1.0:
164 | in_ = dropout(in_, keep_prob, is_train)
165 | xxc = tf.nn.conv2d(in_, filter_, strides, padding) + bias # [N*M, JX, W/filter_stride, d]
166 | out = tf.reduce_max(tf.nn.relu(xxc), 2) # [-1, JX, d]
167 | return out
168 |
169 |
170 | def multi_conv1d(in_, filter_sizes, heights, padding, is_train=None, keep_prob=1.0, scope=None):
171 | with tf.variable_scope(scope or "multi_conv1d"):
172 | assert len(filter_sizes) == len(heights)
173 | outs = []
174 | for filter_size, height in zip(filter_sizes, heights):
175 | if filter_size == 0:
176 | continue
177 | out = conv1d(in_, filter_size, height, padding, is_train=is_train, keep_prob=keep_prob, scope="conv1d_{}".format(height))
178 | outs.append(out)
179 | concat_out = tf.concat(axis=2, values=outs)
180 | return concat_out
181 |
--------------------------------------------------------------------------------
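The helpers in nn.py above target the TensorFlow 1.x graph API (note the private _linear import and the use of tf.variable_scope/tf.cond). A minimal usage sketch of the character-CNN-plus-highway pattern these functions support, assuming TF 1.x and hypothetical shapes and hyperparameters:

```python
import tensorflow as tf
from my.tensorflow.nn import multi_conv1d, highway_network

# Hypothetical shapes: variable batch size, 30 words per sentence,
# 16 characters per word, 8-dimensional character embeddings.
is_train = tf.placeholder(tf.bool, [], name='is_train')
char_emb = tf.placeholder(tf.float32, [None, 30, 16, 8], name='char_emb')

# One bank of 100 filters with window height 5 over the character axis;
# multi_conv1d skips any entry whose filter_size is 0.
char_repr = multi_conv1d(char_emb, [100], [5], "VALID",
                         is_train=is_train, keep_prob=0.8)      # [batch, 30, 100]

# Two highway layers over the resulting word-level representations.
out = highway_network(char_repr, 2, True, wd=0.0,
                      input_keep_prob=0.8, is_train=is_train)   # [batch, 30, 100]
```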
/tensorflow/SQuAD/tree/evaluator.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import tensorflow as tf
3 |
4 | from tree.read_data import DataSet
5 | from my.nltk_utils import span_f1
6 |
7 |
8 | class Evaluation(object):
9 | def __init__(self, data_type, global_step, idxs, yp):
10 | self.data_type = data_type
11 | self.global_step = global_step
12 | self.idxs = idxs
13 | self.yp = yp
14 | self.num_examples = len(yp)
15 | self.dict = {'data_type': data_type,
16 | 'global_step': global_step,
17 | 'yp': yp,
18 | 'idxs': idxs,
19 | 'num_examples': self.num_examples}
20 | self.summaries = None
21 |
22 | def __repr__(self):
23 | return "{} step {}".format(self.data_type, self.global_step)
24 |
25 | def __add__(self, other):
26 | if other == 0:
27 | return self
28 | assert self.data_type == other.data_type
29 | assert self.global_step == other.global_step
30 | new_yp = self.yp + other.yp
31 | new_idxs = self.idxs + other.idxs
32 | return Evaluation(self.data_type, self.global_step, new_idxs, new_yp)
33 |
34 | def __radd__(self, other):
35 | return self.__add__(other)
36 |
37 |
38 | class LabeledEvaluation(Evaluation):
39 | def __init__(self, data_type, global_step, idxs, yp, y):
40 | super(LabeledEvaluation, self).__init__(data_type, global_step, idxs, yp)
41 | self.y = y
42 | self.dict['y'] = y
43 |
44 | def __add__(self, other):
45 | if other == 0:
46 | return self
47 | assert self.data_type == other.data_type
48 | assert self.global_step == other.global_step
49 | new_yp = self.yp + other.yp
50 | new_y = self.y + other.y
51 | new_idxs = self.idxs + other.idxs
52 | return LabeledEvaluation(self.data_type, self.global_step, new_idxs, new_yp, new_y)
53 |
54 |
55 | class AccuracyEvaluation(LabeledEvaluation):
56 | def __init__(self, data_type, global_step, idxs, yp, y, correct, loss):
57 | super(AccuracyEvaluation, self).__init__(data_type, global_step, idxs, yp, y)
58 | self.loss = loss
59 | self.correct = correct
60 | self.acc = sum(correct) / len(correct)
61 | self.dict['loss'] = loss
62 | self.dict['correct'] = correct
63 | self.dict['acc'] = self.acc
64 | loss_summary = tf.Summary(value=[tf.Summary.Value(tag='dev/loss', simple_value=self.loss)])
65 | acc_summary = tf.Summary(value=[tf.Summary.Value(tag='dev/acc', simple_value=self.acc)])
66 | self.summaries = [loss_summary, acc_summary]
67 |
68 | def __repr__(self):
69 | return "{} step {}: accuracy={}, loss={}".format(self.data_type, self.global_step, self.acc, self.loss)
70 |
71 | def __add__(self, other):
72 | if other == 0:
73 | return self
74 | assert self.data_type == other.data_type
75 | assert self.global_step == other.global_step
76 | new_idxs = self.idxs + other.idxs
77 | new_yp = self.yp + other.yp
78 | new_y = self.y + other.y
79 | new_correct = self.correct + other.correct
80 | new_loss = (self.loss * self.num_examples + other.loss * other.num_examples) / len(new_correct)
81 | return AccuracyEvaluation(self.data_type, self.global_step, new_idxs, new_yp, new_y, new_correct, new_loss)
82 |
83 |
84 | class Evaluator(object):
85 | def __init__(self, config, model):
86 | self.config = config
87 | self.model = model
88 |
89 | def get_evaluation(self, sess, batch):
90 | idxs, data_set = batch
91 | feed_dict = self.model.get_feed_dict(data_set, False, supervised=False)
92 | global_step, yp = sess.run([self.model.global_step, self.model.yp], feed_dict=feed_dict)
93 | yp = yp[:data_set.num_examples]
94 | e = Evaluation(data_set.data_type, int(global_step), idxs, yp.tolist())
95 | return e
96 |
97 | def get_evaluation_from_batches(self, sess, batches):
98 | e = sum(self.get_evaluation(sess, batch) for batch in batches)
99 | return e
100 |
101 |
102 | class LabeledEvaluator(Evaluator):
103 | def get_evaluation(self, sess, batch):
104 | idxs, data_set = batch
105 | feed_dict = self.model.get_feed_dict(data_set, False, supervised=False)
106 | global_step, yp = sess.run([self.model.global_step, self.model.yp], feed_dict=feed_dict)
107 | yp = yp[:data_set.num_examples]
108 | y = feed_dict[self.model.y]
109 | e = LabeledEvaluation(data_set.data_type, int(global_step), idxs, yp.tolist(), y.tolist())
110 | return e
111 |
112 |
113 | class AccuracyEvaluator(LabeledEvaluator):
114 | def get_evaluation(self, sess, batch):
115 | idxs, data_set = batch
116 | assert isinstance(data_set, DataSet)
117 | feed_dict = self.model.get_feed_dict(data_set, False)
118 | global_step, yp, loss = sess.run([self.model.global_step, self.model.yp, self.model.loss], feed_dict=feed_dict)
119 | y = feed_dict[self.model.y]
120 | yp = yp[:data_set.num_examples]
121 | correct = [self.__class__.compare(yi, ypi) for yi, ypi in zip(y, yp)]
122 | e = AccuracyEvaluation(data_set.data_type, int(global_step), idxs, yp.tolist(), y.tolist(), correct, float(loss))
123 | return e
124 |
125 | @staticmethod
126 | def compare(yi, ypi):
127 | return int(np.argmax(yi)) == int(np.argmax(ypi))
128 |
129 |
130 | class AccuracyEvaluator2(AccuracyEvaluator):
131 | @staticmethod
132 | def compare(yi, ypi):
133 | i = int(np.argmax(yi.flatten()))
134 | j = int(np.argmax(ypi.flatten()))
135 | # print(i, j, i == j)
136 | return i == j
137 |
138 |
139 | class TempEvaluation(AccuracyEvaluation):
140 | def __init__(self, data_type, global_step, idxs, yp, yp2, y, y2, correct, loss, f1s):
141 | super(TempEvaluation, self).__init__(data_type, global_step, idxs, yp, y, correct, loss)
142 | self.y2 = y2
143 | self.yp2 = yp2
144 | self.f1s = f1s
145 | self.f1 = float(np.mean(f1s))
146 | self.dict['y2'] = y2
147 | self.dict['yp2'] = yp2
148 | self.dict['f1s'] = f1s
149 | self.dict['f1'] = self.f1
150 | f1_summary = tf.Summary(value=[tf.Summary.Value(tag='dev/f1', simple_value=self.f1)])
151 | self.summaries.append(f1_summary)
152 |
153 | def __add__(self, other):
154 | if other == 0:
155 | return self
156 | assert self.data_type == other.data_type
157 | assert self.global_step == other.global_step
158 | new_idxs = self.idxs + other.idxs
159 | new_yp = self.yp + other.yp
160 | new_yp2 = self.yp2 + other.yp2
161 | new_y = self.y + other.y
162 | new_y2 = self.y2 + other.y2
163 | new_correct = self.correct + other.correct
164 | new_f1s = self.f1s + other.f1s
165 | new_loss = (self.loss * self.num_examples + other.loss * other.num_examples) / len(new_correct)
166 | return TempEvaluation(self.data_type, self.global_step, new_idxs, new_yp, new_yp2, new_y, new_y2, new_correct, new_loss, new_f1s)
167 |
168 |
169 | class TempEvaluator(LabeledEvaluator):
170 | def get_evaluation(self, sess, batch):
171 | idxs, data_set = batch
172 | assert isinstance(data_set, DataSet)
173 | feed_dict = self.model.get_feed_dict(data_set, False)
174 | global_step, yp, yp2, loss = sess.run([self.model.global_step, self.model.yp, self.model.yp2, self.model.loss], feed_dict=feed_dict)
175 | y, y2 = feed_dict[self.model.y], feed_dict[self.model.y2]
176 | yp, yp2 = yp[:data_set.num_examples], yp2[:data_set.num_examples]
177 | correct = [self.__class__.compare(yi, y2i, ypi, yp2i) for yi, y2i, ypi, yp2i in zip(y, y2, yp, yp2)]
178 | f1s = [self.__class__.span_f1(yi, y2i, ypi, yp2i) for yi, y2i, ypi, yp2i in zip(y, y2, yp, yp2)]
179 | e = TempEvaluation(data_set.data_type, int(global_step), idxs, yp.tolist(), yp2.tolist(), y.tolist(), y2.tolist(), correct, float(loss), f1s)
180 | return e
181 |
182 | @staticmethod
183 | def compare(yi, y2i, ypi, yp2i):
184 | i = int(np.argmax(yi.flatten()))
185 | j = int(np.argmax(ypi.flatten()))
186 | k = int(np.argmax(y2i.flatten()))
187 | l = int(np.argmax(yp2i.flatten()))
188 | # print(i, j, i == j)
189 | return i == j and k == l
190 |
191 | @staticmethod
192 | def span_f1(yi, y2i, ypi, yp2i):
193 | true_span = (np.argmax(yi.flatten()), np.argmax(y2i.flatten())+1)
194 | pred_span = (np.argmax(ypi.flatten()), np.argmax(yp2i.flatten())+1)
195 | f1 = span_f1(true_span, pred_span)
196 | return f1
197 |
198 |
--------------------------------------------------------------------------------
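The evaluation classes above are written so that per-batch results can be folded with the built-in sum(): __add__ treats 0 as the identity and __radd__ delegates to it, which is what get_evaluation_from_batches relies on. A minimal illustration with hand-made values (all numbers hypothetical):

```python
from tree.evaluator import Evaluation

# Two fake per-batch evaluations taken at the same global step.
e1 = Evaluation("dev", 100, [0, 1], [[0.1], [0.9]])
e2 = Evaluation("dev", 100, [2], [[0.5]])

# sum() starts from 0, so Evaluation.__radd__(0) fires first and simply
# returns e1; subsequent additions concatenate idxs and yp.
total = sum([e1, e2])
print(total.num_examples)  # 3
print(total)               # "dev step 100"
```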
/tensorflow/SQuAD/squad/eda_aug_train.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import json\n",
12 | "\n",
13 | "aug_data_path = \"/Users/minjoons/data/squad/train-v1.0-aug.json\"\n",
14 | "aug_data = json.load(open(aug_data_path, 'r'))"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": 2,
20 | "metadata": {
21 | "collapsed": false
22 | },
23 | "outputs": [
24 | {
25 | "name": "stdout",
26 | "output_type": "stream",
27 | "text": [
28 | "(['Saint', 'Bernadette', 'Soubirous'], 'Saint Bernadette Soubirous')\n",
29 | "(['a', 'copper', 'statue', 'of', 'Christ'], 'a copper statue of Christ')\n",
30 | "(['the', 'Main', 'Building'], 'the Main Building')\n",
31 | "(['a', 'Marian', 'place', 'of', 'prayer', 'and', 'reflection'], 'a Marian place of prayer and reflection')\n"
32 | ]
33 | }
34 | ],
35 | "source": [
36 | "def compare_answers():\n",
37 | " for article in aug_data['data']:\n",
38 | " for para in article['paragraphs']:\n",
39 | " deps = para['deps']\n",
40 | " nodess = []\n",
41 | " for dep in deps:\n",
42 | " nodes, edges = dep\n",
43 | " if dep is not None:\n",
44 | " nodess.append(nodes)\n",
45 | " else:\n",
46 | " nodess.append([])\n",
47 | " wordss = [[node[0] for node in nodes] for nodes in nodess]\n",
48 | " for qa in para['qas']:\n",
49 | " for answer in qa['answers']:\n",
50 | " text = answer['text']\n",
51 | " word_start = answer['answer_word_start']\n",
52 | " word_stop = answer['answer_word_stop']\n",
53 | " answer_words = wordss[word_start[0]][word_start[1]:word_stop[1]]\n",
54 | " yield answer_words, text\n",
55 | "\n",
56 | "ca = compare_answers()\n",
57 | "print(next(ca))\n",
58 | "print(next(ca))\n",
59 | "print(next(ca))\n",
60 | "print(next(ca))"
61 | ]
62 | },
63 | {
64 | "cell_type": "code",
65 | "execution_count": 11,
66 | "metadata": {
67 | "collapsed": false
68 | },
69 | "outputs": [
70 | {
71 | "name": "stdout",
72 | "output_type": "stream",
73 | "text": [
74 | "x: .\n",
75 | "x: .\n",
76 | "x: .\n",
77 | "x: .\n",
78 | "x: .\n",
79 | "x: .\n",
80 | "x: .\n",
81 | "x: .\n",
82 | "q: k\n",
83 | "q: j\n",
84 | "q: n\n",
85 | "q: b\n",
86 | "q: v\n",
87 | "x: .\n",
88 | "x: :208\n",
89 | "x: .\n",
90 | "x: .\n",
91 | "x: .\n",
92 | "x: .\n",
93 | "x: .\n",
94 | "x: .\n",
95 | "x: .\n",
96 | "x: .\n",
97 | "x: .\n",
98 | "x: .\n",
99 | "x: .\n",
100 | "q: dd\n",
101 | "q: dd\n",
102 | "q: dd\n",
103 | "q: dd\n",
104 | "q: d\n",
105 | "x: .\n",
106 | "x: .\n",
107 | "x: .\n",
108 | "x: .\n",
109 | "x: .\n",
110 | "x: .\n",
111 | "x: .\n",
112 | "x: .\n",
113 | "x: :411\n",
114 | "x: .\n",
115 | "x: .\n",
116 | "x: .\n",
117 | "x: .\n",
118 | "x: .\n",
119 | "x: .\n",
120 | "x: :40\n",
121 | "x: .\n",
122 | "x: *\n",
123 | "x: :14\n",
124 | "x: .\n",
125 | "x: .\n",
126 | "x: .\n",
127 | "x: :131\n",
128 | "x: .\n",
129 | "x: .\n",
130 | "x: .\n",
131 | "x: .\n",
132 | "x: .\n",
133 | "x: .\n",
134 | "x: .\n",
135 | "x: .\n",
136 | "x: .\n",
137 | "53 10\n"
138 | ]
139 | }
140 | ],
141 | "source": [
142 | "def nodep_counter():\n",
143 | " x_count = 0\n",
144 | " q_count = 0\n",
145 | " for article in aug_data['data']:\n",
146 | " for para in article['paragraphs']:\n",
147 | " deps = para['deps']\n",
148 | " nodess = []\n",
149 | " for sent, dep in zip(para['sents'], deps):\n",
150 | " if dep is None:\n",
151 | " print(\"x:\", sent)\n",
152 | " x_count += 1\n",
153 | " for qa in para['qas']:\n",
154 | " if qa['dep'] is None:\n",
155 | " print(\"q:\", qa['question'])\n",
156 | " q_count += 1\n",
157 | " print(x_count, q_count)\n",
158 | "nodep_counter()\n"
159 | ]
160 | },
161 | {
162 | "cell_type": "code",
163 | "execution_count": 4,
164 | "metadata": {
165 | "collapsed": false
166 | },
167 | "outputs": [
168 | {
169 | "name": "stdout",
170 | "output_type": "stream",
171 | "text": [
172 | "0\n"
173 | ]
174 | }
175 | ],
176 | "source": [
177 | "def bad_node_counter():\n",
178 | " count = 0\n",
179 | " for article in aug_data['data']:\n",
180 | " for para in article['paragraphs']:\n",
181 | " sents = para['sents']\n",
182 | " deps = para['deps']\n",
183 | " nodess = []\n",
184 | " for dep in deps:\n",
185 | " if dep is not None:\n",
186 | " nodes, edges = dep\n",
187 | " for node in nodes:\n",
188 | " if len(node) != 5:\n",
189 | " count += 1\n",
190 | " print(count)\n",
191 | "bad_node_counter() "
192 | ]
193 | },
194 | {
195 | "cell_type": "code",
196 | "execution_count": 5,
197 | "metadata": {
198 | "collapsed": false
199 | },
200 | "outputs": [
201 | {
202 | "name": "stdout",
203 | "output_type": "stream",
204 | "text": [
205 | "36\n"
206 | ]
207 | }
208 | ],
209 | "source": [
210 | "def noanswer_counter():\n",
211 | " count = 0\n",
212 | " for article in aug_data['data']:\n",
213 | " for para in article['paragraphs']:\n",
214 | " deps = para['deps']\n",
215 | " nodess = []\n",
216 | " for dep in deps:\n",
217 | " if dep is not None:\n",
218 | " nodes, edges = dep\n",
219 | " nodess.append(nodes)\n",
220 | " else:\n",
221 | " nodess.append([])\n",
222 | " wordss = [[node[0] for node in nodes] for nodes in nodess]\n",
223 | " for qa in para['qas']:\n",
224 | " for answer in qa['answers']:\n",
225 | " text = answer['text']\n",
226 | " word_start = answer['answer_word_start']\n",
227 | " word_stop = answer['answer_word_stop']\n",
228 | " if word_start is None:\n",
229 | " count += 1\n",
230 | " print(count)\n",
231 | "noanswer_counter()"
232 | ]
233 | },
234 | {
235 | "cell_type": "code",
236 | "execution_count": 14,
237 | "metadata": {
238 | "collapsed": false
239 | },
240 | "outputs": [
241 | {
242 | "name": "stdout",
243 | "output_type": "stream",
244 | "text": [
245 | "106\n"
246 | ]
247 | }
248 | ],
249 | "source": [
250 | "def mult_sent_answer_counter():\n",
251 | " count = 0\n",
252 | " for article in aug_data['data']:\n",
253 | " for para in article['paragraphs']:\n",
254 | " for qa in para['qas']:\n",
255 | " for answer in qa['answers']:\n",
256 | " text = answer['text']\n",
257 | " word_start = answer['answer_word_start']\n",
258 | " word_stop = answer['answer_word_stop']\n",
259 | " if word_start is not None and word_start[0] != word_stop[0]:\n",
260 | " count += 1\n",
261 | " print(count)\n",
262 | "mult_sent_answer_counter()"
263 | ]
264 | },
265 | {
266 | "cell_type": "code",
267 | "execution_count": null,
268 | "metadata": {
269 | "collapsed": true
270 | },
271 | "outputs": [],
272 | "source": []
273 | },
274 | {
275 | "cell_type": "code",
276 | "execution_count": null,
277 | "metadata": {
278 | "collapsed": true
279 | },
280 | "outputs": [],
281 | "source": []
282 | },
283 | {
284 | "cell_type": "code",
285 | "execution_count": null,
286 | "metadata": {
287 | "collapsed": true
288 | },
289 | "outputs": [],
290 | "source": []
291 | }
292 | ],
293 | "metadata": {
294 | "kernelspec": {
295 | "display_name": "Python 3",
296 | "language": "python",
297 | "name": "python3"
298 | },
299 | "language_info": {
300 | "codemirror_mode": {
301 | "name": "ipython",
302 | "version": 3
303 | },
304 | "file_extension": ".py",
305 | "mimetype": "text/x-python",
306 | "name": "python",
307 | "nbconvert_exporter": "python",
308 | "pygments_lexer": "ipython3",
309 | "version": "3.5.1"
310 | }
311 | },
312 | "nbformat": 4,
313 | "nbformat_minor": 0
314 | }
315 |
--------------------------------------------------------------------------------