├── model ├── __init__.py ├── VSLNet_t7.py ├── VSLNet.py ├── ops.py ├── layers.py └── layers_t7.py ├── util ├── __init__.py ├── runner_utils.py ├── data_loader_t7.py ├── runner_utils_t7.py ├── data_loader.py ├── data_util.py └── data_gen.py ├── prepare ├── __init__.py ├── extract_activitynet_org.py ├── extract_tacos_org.py ├── download_activitynet_video.py ├── videotransforms.py ├── extract_charades.py ├── README.md ├── extract_tacos.py ├── extract_activitynet.py └── feature_extractor.py ├── figures └── overview.jpg ├── LICENSE ├── .gitignore ├── README.md ├── main.py └── main_t7.py /model/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /util/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /prepare/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /figures/overview.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/26hzhang/VSLNet/HEAD/figures/overview.jpg -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 ZHANG HAO 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /prepare/extract_activitynet_org.py: -------------------------------------------------------------------------------- 1 | import os 2 | import h5py 3 | import json 4 | import numpy as np 5 | from tqdm import tqdm 6 | from argparse import ArgumentParser 7 | 8 | parser = ArgumentParser() 9 | parser.add_argument("--dataset_dir", type=str, required=True, help="dataset path") 10 | parser.add_argument("--hdf5_file", type=str, required=True, help="downloaded activitynet features") 11 | parser.add_argument("--save_dir", type=str, required=True, help="save dir") 12 | args = parser.parse_args() 13 | 14 | with open(os.path.join(args.dataset_dir, "train.json"), mode="r", encoding="utf-8") as f: 15 | train_data = json.load(f) 16 | with open(os.path.join(args.dataset_dir, "val_1.json"), mode="r", encoding="utf-8") as f: 17 | val_data = json.load(f) 18 | with open(os.path.join(args.dataset_dir, "val_2.json"), mode="r", encoding="utf-8") as f: 19 | test_data = json.load(f) 20 | 21 | video_ids = list(set(list(train_data.keys()) + list(val_data.keys()) + list(test_data.keys()))) 22 | print(video_ids) 23 | print(len(video_ids)) 24 | 25 | if not os.path.exists(args.save_dir): 26 | os.makedirs(args.save_dir) 27 | 28 | feature_shapes = dict() 29 | with h5py.File(args.hdf5_file, mode="r") as f: 30 | group_key = list(f.keys()) 31 | for key in tqdm(group_key, total=len(group_key), desc="extract features"): 32 | video_id = key 33 | if video_id not in video_ids: 34 | continue 35 | data = f[key]["c3d_features"][()] 36 | feature_shapes[video_id] = data.shape[0] 37 | np.save(os.path.join(args.save_dir, video_id), arr=data) 38 | 39 | with open(os.path.join(args.save_dir, "feature_shapes.json"), mode="w", encoding="utf-8") as f: 40 | json.dump(feature_shapes, f) 41 | -------------------------------------------------------------------------------- /prepare/extract_tacos_org.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | import numpy as np 5 | from tqdm import tqdm 6 | 7 | # 1. step download pre-trained C3D features from https://github.com/jiyanggao/TALL 8 | # 2. 
convert the features 9 | 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument("--data_path", type=str, required=True, help="tacos dataset path") 12 | parser.add_argument("--feature_path", type=str, required=True, help="pre-trained C3D features") 13 | parser.add_argument("--save_dir", type=str, required=True, help="extracted feature save path") 14 | parser.add_argument("--sample_rate", type=int, default=64, help="sample rate [64 | 128 | 256 | 512]") 15 | args = parser.parse_args() 16 | 17 | stride = args.sample_rate // 5 # due to 0.8 overlap of the pre-trained C3D features 18 | 19 | if not os.path.exists(args.save_dir): 20 | os.makedirs(args.save_dir) 21 | 22 | with open(os.path.join(args.data_path, "train.json"), mode="r", encoding="utf-8") as f: 23 | dataset = json.load(f) 24 | with open(os.path.join(args.data_path, "val.json"), mode="r", encoding="utf-8") as f: 25 | dataset.update(json.load(f)) 26 | with open(os.path.join(args.data_path, "test.json"), mode="r", encoding="utf-8") as f: 27 | dataset.update(json.load(f)) 28 | 29 | feature_shapes = dict() 30 | for video_id, annotations in tqdm(dataset.items(), total=len(dataset), desc=""): 31 | video_features = [] 32 | num_frames = annotations["num_frames"] - 16 # trick from 2D-TAN 33 | for idx in range(0, (num_frames - args.sample_rate) // stride + 1): 34 | s_idx = idx * stride + 1 35 | e_idx = s_idx + args.sample_rate 36 | feature_path = os.path.join(args.feature_path, "{}.avi_{}_{}.npy".format(video_id, s_idx, e_idx)) 37 | feature = np.load(feature_path) 38 | video_features.append(feature) 39 | video_features = np.stack(video_features, axis=0) 40 | np.save(os.path.join(args.save_dir, video_id), arr=video_features) 41 | feature_shapes[video_id] = video_features.shape[0] 42 | 43 | with open(os.path.join(args.save_dir, "feature_shapes.json"), mode="w", encoding="utf-8") as f: 44 | json.dump(feature_shapes, f) 45 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | # add 132 | .idea/ 133 | .vscode/ 134 | ckpt/ 135 | ckpt_t7/ 136 | ckpt*/ 137 | data/features/ 138 | datasets/ 139 | .DS_Store 140 | *.DS_Store 141 | -------------------------------------------------------------------------------- /prepare/download_activitynet_video.py: -------------------------------------------------------------------------------- 1 | """ 2 | Codes are modified from https://github.com/waybarrios/Anet_tools2.0 3 | """ 4 | import os 5 | import glob 6 | import json 7 | from argparse import ArgumentParser 8 | 9 | 10 | def crosscheck_videos(video_path, all_video_ids): 11 | # Get existing videos 12 | existing_videos = glob.glob("%s/*.mp4" % video_path) 13 | for idx, vid in enumerate(existing_videos): 14 | basename = os.path.basename(vid).split(".mp4")[0] 15 | if len(basename) == 13: 16 | existing_videos[idx] = basename[2:] 17 | elif len(basename) == 11: 18 | existing_videos[idx] = basename 19 | else: 20 | raise RuntimeError("Unknown filename format: %s", vid) 21 | 22 | non_existing_videos = [] 23 | for vid in all_video_ids: 24 | if vid in existing_videos: 25 | continue 26 | else: 27 | non_existing_videos.append(vid) 28 | 29 | return non_existing_videos 30 | 31 | 32 | def main(video_dir, dataset_dir, bash_file): 33 | with open(os.path.join(dataset_dir, "train.json"), mode="r", encoding="utf-8") as f: 34 | train_ids = list(json.load(f).keys()) 35 | train_ids = [vid[2:] if len(vid) == 13 else vid for vid in train_ids] 36 | 37 | with open(os.path.join(dataset_dir, "val_1.json"), mode="r", encoding="utf-8") as f: 38 | val_ids = list(json.load(f).keys()) 39 | val_ids = [vid[2:] if len(vid) == 13 else vid for vid in val_ids] 40 | 41 | with open(os.path.join(dataset_dir, "val_2.json"), 
mode="r", encoding="utf-8") as f: 42 | test_ids = list(json.load(f).keys()) 43 | test_ids = [vid[2:] if len(vid) == 13 else vid for vid in test_ids] 44 | 45 | all_video_ids = list(set(train_ids + val_ids + test_ids)) 46 | print("train_video_ids", len(train_ids)) 47 | print("val_1_video_ids", len(val_ids)) 48 | print("val_2_video_ids", len(test_ids)) 49 | print("all_video_ids", len(all_video_ids)) 50 | 51 | non_existing_videos = crosscheck_videos(video_dir, all_video_ids) 52 | 53 | # save command to bash file 54 | with open(bash_file + '.sh', mode="w", encoding="utf-8") as f: 55 | f.write("#!/usr/bin/env bash\n\n") # write bash file header 56 | filename = os.path.join(video_dir, "v_%s.mp4") 57 | cmd_base = "youtube-dl -f best -f mp4 " 58 | cmd_base += '"https://www.youtube.com/watch?v=%s" ' 59 | cmd_base += '-o "%s"' % filename 60 | 61 | for vid in non_existing_videos: 62 | cmd = cmd_base % (vid, vid) 63 | f.write("%s\n" % cmd) 64 | 65 | 66 | if __name__ == "__main__": 67 | parser = ArgumentParser(description="Script to double check video content.") 68 | parser.add_argument("--video_dir", type=str, required=True, help="where to save the downloaded videos") 69 | parser.add_argument("--dataset_dir", type=str, required=True, help="where are the annotation files") 70 | parser.add_argument("--bash_file", type=str, required=True, help="where to save command list script") 71 | 72 | args = vars(parser.parse_args()) 73 | main(**args) 74 | """ 75 | After running this python file, it will generate an script file. Using the terminal to run this script, it will 76 | automatically download all the required videos from YouTube. 77 | """ 78 | -------------------------------------------------------------------------------- /prepare/videotransforms.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import numbers 3 | import random 4 | 5 | 6 | class RandomCrop(object): 7 | """Crop the given video sequences (t x h x w) at a random location. 8 | Args: 9 | size (sequence or int): Desired output size of the crop. If size is an 10 | int instead of sequence like (h, w), a square crop (size, size) is 11 | made. 12 | """ 13 | 14 | def __init__(self, size): 15 | if isinstance(size, numbers.Number): 16 | self.size = (size, size) 17 | else: 18 | self.size = size 19 | 20 | @staticmethod 21 | def get_params(img, output_size): 22 | """Get parameters for ``crop`` for a random crop. 23 | Args: 24 | img (PIL Image): Image to be cropped. 25 | output_size (tuple): Expected output size of the crop. 26 | Returns: 27 | tuple: params (i, j, h, w) to be passed to ``crop`` for random crop. 28 | """ 29 | t, h, w, c = img.shape 30 | th, tw = output_size 31 | if w == tw and h == th: 32 | return 0, 0, h, w 33 | 34 | i = random.randint(0, h - th) if h != th else 0 35 | j = random.randint(0, w - tw) if w != tw else 0 36 | return i, j, th, tw 37 | 38 | def __call__(self, imgs): 39 | 40 | i, j, h, w = self.get_params(imgs, self.size) 41 | 42 | imgs = imgs[:, i:i + h, j:j + w, :] 43 | return imgs 44 | 45 | def __repr__(self): 46 | return self.__class__.__name__ + '(size={0})'.format(self.size) 47 | 48 | 49 | class CenterCrop(object): 50 | """Crops the given seq Images at the center. 51 | Args: 52 | size (sequence or int): Desired output size of the crop. If size is an 53 | int instead of sequence like (h, w), a square crop (size, size) is 54 | made. 
55 | """ 56 | 57 | def __init__(self, size): 58 | if isinstance(size, numbers.Number): 59 | self.size = (size, size) 60 | else: 61 | self.size = size 62 | 63 | def __call__(self, imgs): 64 | """ 65 | Args: 66 | imgs (PIL Image): Image to be cropped. 67 | Returns: 68 | PIL Image: Cropped image. 69 | """ 70 | t, h, w, c = imgs.shape 71 | th, tw = self.size 72 | i = int(np.round((h - th) / 2.)) 73 | j = int(np.round((w - tw) / 2.)) 74 | 75 | return imgs[:, i:i + th, j:j + tw, :] 76 | 77 | def __repr__(self): 78 | return self.__class__.__name__ + '(size={0})'.format(self.size) 79 | 80 | 81 | class RandomHorizontalFlip(object): 82 | """Horizontally flip the given seq Images randomly with a given probability. 83 | Args: 84 | p (float): probability of the image being flipped. Default value is 0.5 85 | """ 86 | 87 | def __init__(self, p=0.5): 88 | self.p = p 89 | 90 | def __call__(self, imgs): 91 | """ 92 | Args: 93 | imgs (seq Images): seq Images to be flipped. 94 | Returns: 95 | seq Images: Randomly flipped seq images. 96 | """ 97 | if random.random() < self.p: 98 | # t x h x w 99 | return np.flip(imgs, axis=2).copy() 100 | return imgs 101 | 102 | def __repr__(self): 103 | return self.__class__.__name__ + '(p={})'.format(self.p) 104 | -------------------------------------------------------------------------------- /util/runner_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import tensorflow as tf 4 | from tqdm import tqdm 5 | from util.data_util import index_to_time 6 | 7 | if tf.__version__.startswith('2'): 8 | tf = tf.compat.v1 9 | tf.disable_v2_behavior() 10 | tf.disable_eager_execution() 11 | 12 | 13 | def set_tf_config(seed, gpu_idx): 14 | # os environment 15 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = "3" 16 | os.environ["CUDA_VISIBLE_DEVICES"] = gpu_idx 17 | # random seed 18 | np.random.seed(seed) 19 | tf.set_random_seed(seed) 20 | tf.random.set_random_seed(seed) 21 | 22 | 23 | def write_tf_summary(writer, value_pairs, global_step): 24 | for tag, value in value_pairs: 25 | summ = tf.Summary(value=[tf.Summary.Value(tag=tag, simple_value=value)]) 26 | writer.add_summary(summ, global_step=global_step) 27 | writer.flush() 28 | 29 | 30 | def calculate_iou_accuracy(ious, threshold): 31 | total_size = float(len(ious)) 32 | count = 0 33 | for iou in ious: 34 | if iou >= threshold: 35 | count += 1 36 | return float(count) / total_size * 100.0 37 | 38 | 39 | def calculate_iou(i0, i1): 40 | union = (min(i0[0], i1[0]), max(i0[1], i1[1])) 41 | inter = (max(i0[0], i1[0]), min(i0[1], i1[1])) 42 | iou = 1.0 * (inter[1] - inter[0]) / (union[1] - union[0]) 43 | return max(0.0, iou) 44 | 45 | 46 | def get_feed_dict(batch_data, model, drop_rate=None, mode='train'): 47 | if mode == 'train': # training 48 | (_, vfeats, vfeat_lens, word_ids, char_ids, s_labels, e_labels, h_labels) = batch_data 49 | feed_dict = {model.video_inputs: vfeats, model.video_seq_length: vfeat_lens, model.word_ids: word_ids, 50 | model.char_ids: char_ids, model.y1: s_labels, model.y2: e_labels, model.drop_rate: drop_rate, 51 | model.highlight_labels: h_labels} 52 | return feed_dict 53 | else: # eval 54 | raw_data, vfeats, vfeat_lens, word_ids, char_ids = batch_data 55 | feed_dict = {model.video_inputs: vfeats, model.video_seq_length: vfeat_lens, model.word_ids: word_ids, 56 | model.char_ids: char_ids} 57 | return raw_data, feed_dict 58 | 59 | 60 | def eval_test(sess, model, data_loader, epoch=None, global_step=None, mode="test"): 61 | ious = list() 62 | for 
data in tqdm(data_loader.test_iter(mode), total=data_loader.num_batches(mode), desc="evaluate {}".format(mode)): 63 | raw_data, feed_dict = get_feed_dict(data, model, mode=mode) 64 | start_indexes, end_indexes = sess.run([model.start_index, model.end_index], feed_dict=feed_dict) 65 | for record, start_index, end_index in zip(raw_data, start_indexes, end_indexes): 66 | start_time, end_time = index_to_time(start_index, end_index, record["v_len"], record["duration"]) 67 | iou = calculate_iou(i0=[start_time, end_time], i1=[record["s_time"], record["e_time"]]) 68 | ious.append(iou) 69 | r1i3 = calculate_iou_accuracy(ious, threshold=0.3) 70 | r1i5 = calculate_iou_accuracy(ious, threshold=0.5) 71 | r1i7 = calculate_iou_accuracy(ious, threshold=0.7) 72 | mi = np.mean(ious) * 100.0 73 | value_pairs = [("{}/Rank@1, IoU=0.3".format(mode), r1i3), ("{}/Rank@1, IoU=0.5".format(mode), r1i5), 74 | ("{}/Rank@1, IoU=0.7".format(mode), r1i7), ("{}/mean IoU".format(mode), mi)] 75 | # write the scores 76 | score_str = "Epoch {}, Step {}:\n".format(epoch, global_step) 77 | score_str += "Rank@1, IoU=0.3: {:.2f}\t".format(r1i3) 78 | score_str += "Rank@1, IoU=0.5: {:.2f}\t".format(r1i5) 79 | score_str += "Rank@1, IoU=0.7: {:.2f}\t".format(r1i7) 80 | score_str += "mean IoU: {:.2f}\n".format(mi) 81 | return r1i3, r1i5, r1i7, mi, value_pairs, score_str 82 | -------------------------------------------------------------------------------- /model/VSLNet_t7.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from model.layers_t7 import Embedding, VisualProjection, FeatureEncoder, CQAttention, CQConcatenate, \ 4 | ConditionedPredictor, HighLightLayer 5 | from transformers import AdamW, get_linear_schedule_with_warmup 6 | 7 | 8 | def build_optimizer_and_scheduler(model, configs): 9 | no_decay = ['bias', 'layer_norm', 'LayerNorm'] # no decay for parameters of layer norm and bias 10 | optimizer_grouped_parameters = [ 11 | {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 12 | 'weight_decay': 0.01}, 13 | {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}] 14 | optimizer = AdamW(optimizer_grouped_parameters, lr=configs.init_lr) 15 | scheduler = get_linear_schedule_with_warmup(optimizer, configs.num_train_steps * configs.warmup_proportion, 16 | configs.num_train_steps) 17 | return optimizer, scheduler 18 | 19 | 20 | class VSLNet(nn.Module): 21 | def __init__(self, configs, word_vectors): 22 | super(VSLNet, self).__init__() 23 | self.configs = configs 24 | self.embedding_net = Embedding(num_words=configs.word_size, num_chars=configs.char_size, out_dim=configs.dim, 25 | word_dim=configs.word_dim, char_dim=configs.char_dim, word_vectors=word_vectors, 26 | drop_rate=configs.drop_rate) 27 | self.video_affine = VisualProjection(visual_dim=configs.video_feature_dim, dim=configs.dim, 28 | drop_rate=configs.drop_rate) 29 | self.feature_encoder = FeatureEncoder(dim=configs.dim, num_heads=configs.num_heads, kernel_size=7, num_layers=4, 30 | max_pos_len=configs.max_pos_len, drop_rate=configs.drop_rate) 31 | # video and query fusion 32 | self.cq_attention = CQAttention(dim=configs.dim, drop_rate=configs.drop_rate) 33 | self.cq_concat = CQConcatenate(dim=configs.dim) 34 | # query-guided highlighting 35 | self.highlight_layer = HighLightLayer(dim=configs.dim) 36 | # conditioned predictor 37 | self.predictor = ConditionedPredictor(dim=configs.dim, 
num_heads=configs.num_heads, drop_rate=configs.drop_rate, 38 | max_pos_len=configs.max_pos_len, predictor=configs.predictor) 39 | # init parameters 40 | self.init_parameters() 41 | 42 | def init_parameters(self): 43 | def init_weights(m): 44 | if isinstance(m, nn.Conv2d) or isinstance(m, nn.Conv1d) or isinstance(m, nn.Linear): 45 | torch.nn.init.xavier_uniform_(m.weight) 46 | if m.bias is not None: 47 | torch.nn.init.zeros_(m.bias) 48 | elif isinstance(m, nn.LSTM): 49 | m.reset_parameters() 50 | self.apply(init_weights) 51 | 52 | def forward(self, word_ids, char_ids, video_features, v_mask, q_mask): 53 | video_features = self.video_affine(video_features) 54 | query_features = self.embedding_net(word_ids, char_ids) 55 | video_features = self.feature_encoder(video_features, mask=v_mask) 56 | query_features = self.feature_encoder(query_features, mask=q_mask) 57 | features = self.cq_attention(video_features, query_features, v_mask, q_mask) 58 | features = self.cq_concat(features, query_features, q_mask) 59 | h_score = self.highlight_layer(features, v_mask) 60 | features = features * h_score.unsqueeze(2) 61 | start_logits, end_logits = self.predictor(features, mask=v_mask) 62 | return h_score, start_logits, end_logits 63 | 64 | def extract_index(self, start_logits, end_logits): 65 | return self.predictor.extract_index(start_logits=start_logits, end_logits=end_logits) 66 | 67 | def compute_highlight_loss(self, scores, labels, mask): 68 | return self.highlight_layer.compute_loss(scores=scores, labels=labels, mask=mask) 69 | 70 | def compute_loss(self, start_logits, end_logits, start_labels, end_labels): 71 | return self.predictor.compute_cross_entropy_loss(start_logits=start_logits, end_logits=end_logits, 72 | start_labels=start_labels, end_labels=end_labels) 73 | -------------------------------------------------------------------------------- /util/data_loader_t7.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.utils.data 4 | from util.data_util import pad_seq, pad_char_seq, pad_video_seq 5 | 6 | 7 | class Dataset(torch.utils.data.Dataset): 8 | def __init__(self, dataset, video_features): 9 | super(Dataset, self).__init__() 10 | self.dataset = dataset 11 | self.video_features = video_features 12 | 13 | def __getitem__(self, index): 14 | record = self.dataset[index] 15 | video_feature = self.video_features[record['vid']] 16 | s_ind, e_ind = int(record['s_ind']), int(record['e_ind']) 17 | word_ids, char_ids = record['w_ids'], record['c_ids'] 18 | return record, video_feature, word_ids, char_ids, s_ind, e_ind 19 | 20 | def __len__(self): 21 | return len(self.dataset) 22 | 23 | 24 | def train_collate_fn(data): 25 | records, video_features, word_ids, char_ids, s_inds, e_inds = zip(*data) 26 | # process word ids 27 | word_ids, _ = pad_seq(word_ids) 28 | word_ids = np.asarray(word_ids, dtype=np.int32) # (batch_size, w_seq_len) 29 | # process char ids 30 | char_ids, _ = pad_char_seq(char_ids) 31 | char_ids = np.asarray(char_ids, dtype=np.int32) # (batch_size, w_seq_len, c_seq_len) 32 | # process video features 33 | vfeats, vfeat_lens = pad_video_seq(video_features) 34 | vfeats = np.asarray(vfeats, dtype=np.float32) # (batch_size, v_seq_len, v_dim) 35 | vfeat_lens = np.asarray(vfeat_lens, dtype=np.int32) # (batch_size, ) 36 | # process labels 37 | max_len = np.max(vfeat_lens) 38 | batch_size = vfeat_lens.shape[0] 39 | s_labels = np.asarray(s_inds, dtype=np.int64) 40 | e_labels = np.asarray(e_inds, dtype=np.int64) 
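    # Build the query-guided highlighting labels: frames inside the ground-truth [start, end]
    # span, extended by 10% of the span length on each side (clipped to the valid feature
    # length), are labeled 1; all remaining frames are labeled 0.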
41 | h_labels = np.zeros(shape=[batch_size, max_len], dtype=np.int32) 42 | extend = 0.1 43 | for idx in range(batch_size): 44 | st, et = s_inds[idx], e_inds[idx] 45 | cur_max_len = vfeat_lens[idx] 46 | extend_len = round(extend * float(et - st + 1)) 47 | if extend_len > 0: 48 | st_ = max(0, st - extend_len) 49 | et_ = min(et + extend_len, cur_max_len - 1) 50 | h_labels[idx][st_:(et_ + 1)] = 1 51 | else: 52 | h_labels[idx][st:(et + 1)] = 1 53 | # convert to torch tensor 54 | vfeats = torch.tensor(vfeats, dtype=torch.float32) 55 | vfeat_lens = torch.tensor(vfeat_lens, dtype=torch.int64) 56 | word_ids = torch.tensor(word_ids, dtype=torch.int64) 57 | char_ids = torch.tensor(char_ids, dtype=torch.int64) 58 | s_labels = torch.tensor(s_labels, dtype=torch.int64) 59 | e_labels = torch.tensor(e_labels, dtype=torch.int64) 60 | h_labels = torch.tensor(h_labels, dtype=torch.int64) 61 | return records, vfeats, vfeat_lens, word_ids, char_ids, s_labels, e_labels, h_labels 62 | 63 | 64 | def test_collate_fn(data): 65 | records, video_features, word_ids, char_ids, *_ = zip(*data) 66 | # process word ids 67 | word_ids, _ = pad_seq(word_ids) 68 | word_ids = np.asarray(word_ids, dtype=np.int32) # (batch_size, w_seq_len) 69 | # process char ids 70 | char_ids, _ = pad_char_seq(char_ids) 71 | char_ids = np.asarray(char_ids, dtype=np.int32) # (batch_size, w_seq_len, c_seq_len) 72 | # process video features 73 | vfeats, vfeat_lens = pad_video_seq(video_features) 74 | vfeats = np.asarray(vfeats, dtype=np.float32) # (batch_size, v_seq_len, v_dim) 75 | vfeat_lens = np.asarray(vfeat_lens, dtype=np.int32) # (batch_size, ) 76 | # convert to torch tensor 77 | vfeats = torch.tensor(vfeats, dtype=torch.float32) 78 | vfeat_lens = torch.tensor(vfeat_lens, dtype=torch.int64) 79 | word_ids = torch.tensor(word_ids, dtype=torch.int64) 80 | char_ids = torch.tensor(char_ids, dtype=torch.int64) 81 | return records, vfeats, vfeat_lens, word_ids, char_ids 82 | 83 | 84 | def get_train_loader(dataset, video_features, configs): 85 | train_set = Dataset(dataset=dataset, video_features=video_features) 86 | train_loader = torch.utils.data.DataLoader(dataset=train_set, batch_size=configs.batch_size, shuffle=True, 87 | collate_fn=train_collate_fn) 88 | return train_loader 89 | 90 | 91 | def get_test_loader(dataset, video_features, configs): 92 | test_set = Dataset(dataset=dataset, video_features=video_features) 93 | test_loader = torch.utils.data.DataLoader(dataset=test_set, batch_size=configs.batch_size, shuffle=False, 94 | collate_fn=test_collate_fn) 95 | return test_loader 96 | -------------------------------------------------------------------------------- /util/runner_utils_t7.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | import random 4 | import numpy as np 5 | import torch 6 | import torch.utils.data 7 | import torch.backends.cudnn 8 | from tqdm import tqdm 9 | from util.data_util import index_to_time 10 | 11 | 12 | def set_th_config(seed): 13 | random.seed(seed) 14 | np.random.seed(seed) 15 | torch.manual_seed(seed) 16 | torch.cuda.manual_seed(seed) 17 | torch.cuda.manual_seed_all(seed) 18 | torch.backends.cudnn.benchmark = False 19 | torch.backends.cudnn.deterministic = True 20 | 21 | 22 | def filter_checkpoints(model_dir, suffix='t7', max_to_keep=5): 23 | model_paths = glob.glob(os.path.join(model_dir, '*.{}'.format(suffix))) 24 | if len(model_paths) > max_to_keep: 25 | model_file_dict = dict() 26 | suffix_len = len(suffix) + 1 27 | for model_path in 
model_paths: 28 | step = int(os.path.basename(model_path).split('_')[1][0:-suffix_len]) 29 | model_file_dict[step] = model_path 30 | sorted_tuples = sorted(model_file_dict.items()) 31 | unused_tuples = sorted_tuples[0:-max_to_keep] 32 | for _, model_path in unused_tuples: 33 | os.remove(model_path) 34 | 35 | 36 | def get_last_checkpoint(model_dir, suffix='t7'): 37 | model_filenames = glob.glob(os.path.join(model_dir, '*.{}'.format(suffix))) 38 | model_file_dict = dict() 39 | suffix_len = len(suffix) + 1 40 | for model_filename in model_filenames: 41 | step = int(os.path.basename(model_filename).split('_')[1][0:-suffix_len]) 42 | model_file_dict[step] = model_filename 43 | sorted_tuples = sorted(model_file_dict.items()) 44 | last_checkpoint = sorted_tuples[-1] 45 | return last_checkpoint[1] 46 | 47 | 48 | def convert_length_to_mask(lengths): 49 | max_len = lengths.max().item() 50 | mask = torch.arange(max_len, device=lengths.device).expand(lengths.size()[0], max_len) < lengths.unsqueeze(1) 51 | mask = mask.float() 52 | return mask 53 | 54 | 55 | def calculate_iou_accuracy(ious, threshold): 56 | total_size = float(len(ious)) 57 | count = 0 58 | for iou in ious: 59 | if iou >= threshold: 60 | count += 1 61 | return float(count) / total_size * 100.0 62 | 63 | 64 | def calculate_iou(i0, i1): 65 | union = (min(i0[0], i1[0]), max(i0[1], i1[1])) 66 | inter = (max(i0[0], i1[0]), min(i0[1], i1[1])) 67 | iou = 1.0 * (inter[1] - inter[0]) / (union[1] - union[0]) 68 | return max(0.0, iou) 69 | 70 | 71 | def eval_test(model, data_loader, device, mode='test', epoch=None, global_step=None): 72 | ious = [] 73 | with torch.no_grad(): 74 | for idx, (records, vfeats, vfeat_lens, word_ids, char_ids) in tqdm( 75 | enumerate(data_loader), total=len(data_loader), desc='evaluate {}'.format(mode)): 76 | # prepare features 77 | vfeats, vfeat_lens = vfeats.to(device), vfeat_lens.to(device) 78 | word_ids, char_ids = word_ids.to(device), char_ids.to(device) 79 | # generate mask 80 | query_mask = (torch.zeros_like(word_ids) != word_ids).float().to(device) 81 | video_mask = convert_length_to_mask(vfeat_lens).to(device) 82 | # compute predicted results 83 | _, start_logits, end_logits = model(word_ids, char_ids, vfeats, video_mask, query_mask) 84 | start_indices, end_indices = model.extract_index(start_logits, end_logits) 85 | start_indices = start_indices.cpu().numpy() 86 | end_indices = end_indices.cpu().numpy() 87 | for record, start_index, end_index in zip(records, start_indices, end_indices): 88 | start_time, end_time = index_to_time(start_index, end_index, record["v_len"], record["duration"]) 89 | iou = calculate_iou(i0=[start_time, end_time], i1=[record["s_time"], record["e_time"]]) 90 | ious.append(iou) 91 | r1i3 = calculate_iou_accuracy(ious, threshold=0.3) 92 | r1i5 = calculate_iou_accuracy(ious, threshold=0.5) 93 | r1i7 = calculate_iou_accuracy(ious, threshold=0.7) 94 | mi = np.mean(ious) * 100.0 95 | # write the scores 96 | score_str = "Epoch {}, Step {}:\n".format(epoch, global_step) 97 | score_str += "Rank@1, IoU=0.3: {:.2f}\t".format(r1i3) 98 | score_str += "Rank@1, IoU=0.5: {:.2f}\t".format(r1i5) 99 | score_str += "Rank@1, IoU=0.7: {:.2f}\t".format(r1i7) 100 | score_str += "mean IoU: {:.2f}\n".format(mi) 101 | return r1i3, r1i5, r1i7, mi, score_str 102 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Span-based Localizing Network for Natural Language Video Localization 2 | 3 
| This is the implementation of the paper "Span-based Localizing Network for Natural Language Video 4 | Localization" (**ACL 2020**, long paper): [ACL version](https://www.aclweb.org/anthology/2020.acl-main.585.pdf), 5 | [ArXiv version](https://arxiv.org/abs/2004.13931). 6 | 7 | ![overview](/figures/overview.jpg) 8 | 9 | ## Updates 10 | - 2021/06/06: rewrote and optimized the code, and uploaded the complete visual features to the Box drive. Added the stacked 11 | transformer predictor head (in general, VSLNet with the transformer head performs better than with the RNN head). 12 | - 2021/07/21: added support for TensorFlow 2.x (tested on TensorFlow `2.5.0` with CUDA `11.2` and cuDNN `8.2`). 13 | ```shell 14 | # preparing environment for TensorFlow 2.5.0 15 | conda create --name vslnet_tf2 python=3.9 16 | conda activate vslnet_tf2 17 | conda install -c conda-forge cudnn # will install cuda 11.2 automatically 18 | pip install tensorflow-gpu==2.5.0 19 | pip install nltk 20 | pip install torch torchvision torchaudio 21 | python3.9 -m nltk.downloader punkt 22 | ``` 23 | 24 | ## Prerequisites 25 | - Python 3.x with TensorFlow (`1.13.1`), PyTorch (`1.1.0`), torchvision, opencv-python, moviepy, tqdm, nltk, 26 | transformers 27 | - youtube-dl 28 | - CUDA 10, cuDNN 29 | 30 | If you have [Anaconda](https://www.anaconda.com/distribution/) installed, the conda environment of VSLNet can be built 31 | as follows (taking Python 3.7 as an example): 32 | ```shell script 33 | # preparing environment 34 | conda create --name vslnet python=3.7 35 | conda activate vslnet 36 | conda install -c anaconda cudatoolkit=10.0 cudnn 37 | conda install tensorflow-gpu==1.13.1 38 | conda install -c anaconda nltk pillow=6.2.1 39 | conda install pytorch==1.1.0 torchvision==0.3.0 cudatoolkit=10.0 -c pytorch 40 | conda install -c conda-forge transformers opencv moviepy tqdm youtube-dl 41 | # download punkt for the word tokenizer 42 | python3.7 -m nltk.downloader punkt 43 | ``` 44 | 45 | ## Preparation 46 | The details of how to prepare the `Charades-STA`, `ActivityNet Captions` and `TACoS` features are summarized 47 | here: [[data preparation]](/prepare). Alternatively, you can download the prepared visual features from 48 | [Box Drive](https://app.box.com/s/h0sxa5klco6qve5ahnz50ly2nksmuedw) and place them in the `./data/` directory. 49 | Download the word embeddings from [here](http://nlp.stanford.edu/data/glove.840B.300d.zip) and place them in the 50 | `./data/features/` directory. 51 | 52 | ## Quick Start 53 | ### TensorFlow version 54 | **Train** and **Test** 55 | ```shell script 56 | # the processed dataset will be generated automatically, or loaded if it already exists 57 | # set `--mode test` for evaluation 58 | # set `--predictor transformer` to change the answer predictor from stacked LSTMs to stacked transformers 59 | # train VSLNet on the Charades-STA dataset 60 | python main.py --task charades --predictor rnn --mode train 61 | # train VSLNet on the ActivityNet Captions dataset 62 | python main.py --task activitynet --predictor rnn --mode train 63 | # train VSLNet on the TACoS dataset 64 | python main.py --task tacos --predictor rnn --mode train 65 | ``` 66 | Please refer to each Python file for more parameter settings. You can also download the checkpoints for each task 67 | from [here](https://app.box.com/s/f20aeutwp2wg8c5laaqtbfdg864g8mj0) and the corresponding processed dataset from 68 | [here](https://app.box.com/s/065efky2sjjgc2xxzyelast15y7tsehs), and save them to the `./ckpt/` and `./datasets/` 69 | directories, respectively.
More hyper-parameter settings are in `main.py`. 70 | 71 | ### PyTorch version 72 | **Train** and **Test** 73 | ```shell script 74 | # same usage as the TensorFlow version 75 | # train VSLNet on the Charades-STA dataset 76 | python main.py --task charades --predictor rnn --mode train 77 | # train VSLNet on the ActivityNet Captions dataset 78 | python main.py --task activitynet --predictor rnn --mode train 79 | # train VSLNet on the TACoS dataset 80 | python main.py --task tacos --predictor rnn --mode train 81 | ``` 82 | > For unknown reasons, the performance of the PyTorch code is inferior to that of the TensorFlow code on some datasets. 83 | 84 | ## Citation 85 | If you find this project helpful to your research, please cite our work. 86 | ``` 87 | @inproceedings{zhang2020span, 88 | title = "Span-based Localizing Network for Natural Language Video Localization", 89 | author = "Zhang, Hao and Sun, Aixin and Jing, Wei and Zhou, Joey Tianyi", 90 | booktitle = "Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics", 91 | month = jul, 92 | year = "2020", 93 | address = "Online", 94 | publisher = "Association for Computational Linguistics", 95 | url = "https://www.aclweb.org/anthology/2020.acl-main.585", 96 | pages = "6543--6554" 97 | } 98 | ``` 99 | and 100 | ``` 101 | @article{zhang2021natural, 102 | author={H. {Zhang} and A. {Sun} and W. {Jing} and L. {Zhen} and J. T. {Zhou} and R. S. M. {Goh}}, 103 | journal={IEEE Transactions on Pattern Analysis and Machine Intelligence}, 104 | title={Natural Language Video Localization: A Revisit in Span-based Question Answering Framework}, 105 | year={2021}, 106 | doi={10.1109/TPAMI.2021.3060449} 107 | } 108 | ``` 109 | -------------------------------------------------------------------------------- /prepare/extract_charades.py: -------------------------------------------------------------------------------- 1 | import os 2 | import cv2 3 | import json 4 | import torch 5 | import argparse 6 | import subprocess 7 | import numpy as np 8 | from . 
import videotransforms 9 | from .feature_extractor import InceptionI3d 10 | from torchvision import transforms 11 | from torch.autograd import Variable 12 | 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument("--gpu_idx", type=str, default="0", help="gpu index") 15 | parser.add_argument("--use_finetuned", action="store_true", help="whether to use the fine-tuned feature extractor") 16 | parser.add_argument("--load_model", type=str, required=True, help="pre-trained model") 17 | parser.add_argument("--video_dir", type=str, required=True, help="where the videos are located") 18 | parser.add_argument("--dataset_dir", type=str, required=True, help="where the dataset files are located") 19 | parser.add_argument("--images_dir", type=str, required=True, help="where to save extracted images") 20 | parser.add_argument("--save_dir", type=str, required=True, help="where to save extracted features") 21 | parser.add_argument("--fps", type=int, default=24, help="frames per second") 22 | parser.add_argument("--video_format", type=str, default="mp4", help="video format") 23 | parser.add_argument("--strides", type=int, default=24, help="window size") 24 | parser.add_argument("--remove_images", action="store_true", help="whether to remove extracted images to release space") 25 | args = parser.parse_args() 26 | 27 | os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" 28 | os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu_idx 29 | 30 | 31 | if not os.path.exists(args.video_dir): 32 | raise ValueError("The video directory '{}' does not exist!!!".format(args.video_dir)) 33 | 34 | if not os.path.exists(args.images_dir): 35 | os.makedirs(args.images_dir) 36 | 37 | if not os.path.exists(args.save_dir): 38 | os.makedirs(args.save_dir) 39 | 40 | # create I3D model and load pre-trained model 41 | i3d_model = InceptionI3d(400, in_channels=3) 42 | if args.use_finetuned: 43 | i3d_model.replace_logits(157) # charades has 157 activity types 44 | i3d_model.load_state_dict(torch.load(args.load_model)) 45 | i3d_model.cuda() 46 | i3d_model.train(False) 47 | video_transforms = transforms.Compose([videotransforms.CenterCrop(224)]) 48 | 49 | # load video ids 50 | video_ids = [] 51 | for filename in ["charades_sta_train.txt", "charades_sta_test.txt"]: 52 | with open(os.path.join(args.dataset_dir, filename), mode="r", encoding="utf-8") as f: 53 | for line in f: 54 | line = line.lstrip().rstrip() 55 | if len(line) == 0: 56 | continue 57 | vid = line.split("##")[0].split(" ")[0] 58 | video_ids.append(vid) 59 | video_ids = list(set(video_ids)) 60 | 61 | # extract images and features 62 | feature_shapes = dict() 63 | for idx, video_id in enumerate(video_ids): 64 | video_path = os.path.join(args.video_dir, "{}.mp4".format(video_id)) 65 | image_dir = os.path.join(args.images_dir, video_id) 66 | 67 | print("{} / {}: extract features for video {}".format(idx + 1, len(video_ids), video_id), flush=True) 68 | 69 | if os.path.exists(os.path.join(args.save_dir, "{}.npy".format(video_id))): 70 | print("the visual features for video {} already exist in {}...\n".format(video_id, args.save_dir), flush=True) 71 | continue 72 | 73 | # extract images 74 | if os.path.exists(image_dir): 75 | print("the images for video {} already exist in {}...".format(video_id, args.images_dir)) 76 | else: 77 | os.makedirs(image_dir) 78 | print("extract images with fps={}...".format(args.fps), flush=True) 79 | if args.fps is None or args.fps <= 0: 80 | subprocess.call("ffmpeg -hide_banner -loglevel panic -i {} {}/{}-%6d.jpg".format( 81 | video_path, image_dir, video_id), 
shell=True) 82 | else: 83 | subprocess.call("ffmpeg -hide_banner -loglevel panic -i {} -filter:v fps=fps={} {}/{}-%6d.jpg".format( 84 | video_path, args.fps, image_dir, video_id), shell=True) 85 | 86 | # process extracted images 87 | print("load RGB frames...", flush=True) 88 | num_frames = len(os.listdir(image_dir)) 89 | frames, raw_w, raw_h = [], None, None 90 | for i in range(1, num_frames + 1): 91 | # cv2.imread() read image with BGR format by default, so we convert it to RGB format 92 | img = cv2.imread(os.path.join(image_dir, "{}-{}.jpg".format(video_id, str(i).zfill(6))))[:, :, [2, 1, 0]] 93 | w, h, c = img.shape 94 | raw_w, raw_h = w, h 95 | if w < 226 or h < 226: 96 | d = 226. - min(w, h) 97 | sc = 1 + d / min(w, h) 98 | img = cv2.resize(img, dsize=(0, 0), fx=sc, fy=sc) 99 | img = (img / 255.) * 2 - 1 100 | frames.append(img) 101 | frames = np.asarray(frames, dtype=np.float32) 102 | imgs = video_transforms(frames) 103 | img_tensor = torch.from_numpy(np.expand_dims(imgs.transpose([3, 0, 1, 2]), axis=0)) 104 | print("process images:", (frames.shape[0], raw_w, raw_h, frames.shape[-1]), "-->", frames.shape, "-->", 105 | imgs.shape, "-->", tuple(img_tensor.size()), flush=True) 106 | 107 | if args.remove_images: 108 | # remove extract images to release memory space 109 | subprocess.call("rm -rf {}".format(image_dir), shell=True) 110 | 111 | print("extract visual visual features...", flush=True) 112 | b, c, t, h, w = img_tensor.shape 113 | features = [] 114 | for start in range(0, t, args.strides): 115 | end = min(t - 1, start + args.strides) 116 | if end - start < args.strides: 117 | start = max(0, end - args.strides) 118 | ip = Variable(torch.from_numpy(img_tensor.numpy()[:, :, start:end]).cuda(), volatile=True) 119 | feature = i3d_model.extract_features(ip).data.cpu().numpy() 120 | features.append(feature) 121 | features = np.concatenate(features, axis=0) 122 | np.save(os.path.join(args.save_dir, video_id), arr=features) 123 | print("extracted feature shape: {}\n".format(features.shape), flush=True) 124 | feature_shapes[video_id] = features.shape[0] 125 | 126 | with open(os.path.join(args.save_dir, "feature_shapes.json"), mode="w", encoding="utf-8") as f: 127 | json.dump(feature_shapes, f) 128 | -------------------------------------------------------------------------------- /util/data_loader.py: -------------------------------------------------------------------------------- 1 | import math 2 | import random 3 | import numpy as np 4 | from util.data_util import pad_seq, pad_char_seq, pad_video_seq 5 | 6 | 7 | class TrainLoader: 8 | def __init__(self, dataset, visual_features, configs): 9 | super(TrainLoader, self).__init__() 10 | self.dataset = dataset 11 | self.visual_feats = visual_features 12 | self.extend = configs.extend 13 | self.batch_size = configs.batch_size 14 | 15 | def set_extend(self, extend): 16 | self.extend = extend 17 | 18 | def set_batch_size(self, batch_size): 19 | self.batch_size = batch_size 20 | 21 | def num_samples(self): 22 | return len(self.dataset) 23 | 24 | def num_batches(self): 25 | return math.ceil(len(self.dataset) / self.batch_size) 26 | 27 | def batch_iter(self): 28 | random.shuffle(self.dataset) # shuffle the train set first 29 | for index in range(0, len(self.dataset), self.batch_size): 30 | batch_data = self.dataset[index:(index + self.batch_size)] 31 | vfeats, vfeat_lens, word_ids, char_ids, s_labels, e_labels, h_labels = self.process_batch(batch_data) 32 | yield batch_data, vfeats, vfeat_lens, word_ids, char_ids, s_labels, e_labels, h_labels 33 | 
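    # process_batch pads the word/char id sequences and the video features to common lengths and
    # builds three per-frame label arrays: one-hot start (s_labels) and end (e_labels) indicators,
    # plus highlight labels (h_labels) covering the ground-truth span extended by a fraction
    # `self.extend` of its length on each side (the same scheme as the PyTorch collate function above).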
34 | def process_batch(self, batch_data): 35 | vfeats, word_ids, char_ids, s_inds, e_inds = [], [], [], [], [] 36 | for data in batch_data: 37 | vfeat = self.visual_feats[data['vid']] 38 | vfeats.append(vfeat) 39 | word_ids.append(data['w_ids']) 40 | char_ids.append(data['c_ids']) 41 | s_inds.append(data['s_ind']) 42 | e_inds.append(data['e_ind']) 43 | batch_size = len(batch_data) 44 | # process word ids 45 | word_ids, _ = pad_seq(word_ids) 46 | word_ids = np.asarray(word_ids, dtype=np.int32) # (batch_size, w_seq_len) 47 | # process char ids 48 | char_ids, _ = pad_char_seq(char_ids) 49 | char_ids = np.asarray(char_ids, dtype=np.int32) # (batch_size, w_seq_len, c_seq_len) 50 | # process video features 51 | vfeats, vfeat_lens = pad_video_seq(vfeats) 52 | vfeats = np.asarray(vfeats, dtype=np.float32) # (batch_size, v_seq_len, v_dim) 53 | vfeat_lens = np.asarray(vfeat_lens, dtype=np.int32) # (batch_size, ) 54 | # process labels 55 | max_len = np.max(vfeat_lens) 56 | s_labels = np.zeros(shape=[batch_size, max_len], dtype=np.int32) 57 | e_labels = np.zeros(shape=[batch_size, max_len], dtype=np.int32) 58 | h_labels = np.zeros(shape=[batch_size, max_len], dtype=np.int32) 59 | for idx in range(batch_size): 60 | st, et = s_inds[idx], e_inds[idx] 61 | s_labels[idx][st] = 1 62 | e_labels[idx][et] = 1 63 | cur_max_len = vfeat_lens[idx] 64 | extend_len = round(self.extend * float(et - st + 1)) 65 | if extend_len > 0: 66 | st_ = max(0, st - extend_len) 67 | et_ = min(et + extend_len, cur_max_len - 1) 68 | h_labels[idx][st_:(et_ + 1)] = 1 69 | else: 70 | h_labels[idx][st:(et + 1)] = 1 71 | return vfeats, vfeat_lens, word_ids, char_ids, s_labels, e_labels, h_labels 72 | 73 | 74 | class TestLoader: 75 | def __init__(self, datasets, visual_features, configs): 76 | self.visual_feats = visual_features 77 | self.val_set = None if datasets['val_set'] is None else datasets['val_set'] 78 | self.test_set = datasets['test_set'] 79 | self.batch_size = configs.batch_size 80 | 81 | def set_batch_size(self, batch_size): 82 | self.batch_size = batch_size 83 | 84 | def num_samples(self, mode='test'): 85 | if mode == 'val': 86 | if self.val_set is None: 87 | return 0 88 | return len(self.val_set) 89 | elif mode == 'test': 90 | return len(self.test_set) 91 | else: 92 | raise ValueError('Unknown mode!!! Only support [val | test | test_iid | test_ood].') 93 | 94 | def num_batches(self, mode='test'): 95 | if mode == 'val': 96 | if self.val_set is None: 97 | return 0 98 | return math.ceil(len(self.val_set) / self.batch_size) 99 | elif mode == 'test': 100 | return math.ceil(len(self.test_set) / self.batch_size) 101 | else: 102 | raise ValueError('Unknown mode!!! Only support [val | test].') 103 | 104 | def test_iter(self, mode='test'): 105 | if mode not in ['val', 'test']: 106 | raise ValueError('Unknown mode!!! 
Only support [val | test].') 107 | test_sets = {'val': self.val_set, 'test': self.test_set} 108 | dataset = test_sets[mode] 109 | if mode == 'val' and dataset is None: 110 | raise ValueError('val set is not available!!!') 111 | for index in range(0, len(dataset), self.batch_size): 112 | batch_data = dataset[index:(index + self.batch_size)] 113 | vfeats, vfeat_lens, word_ids, char_ids = self.process_batch(batch_data) 114 | yield batch_data, vfeats, vfeat_lens, word_ids, char_ids 115 | 116 | def process_batch(self, batch_data): 117 | vfeats, word_ids, char_ids, s_inds, e_inds = [], [], [], [], [] 118 | for data in batch_data: 119 | vfeats.append(self.visual_feats[data['vid']]) 120 | word_ids.append(data['w_ids']) 121 | char_ids.append(data['c_ids']) 122 | s_inds.append(data['s_ind']) 123 | e_inds.append(data['e_ind']) 124 | # process word ids 125 | word_ids, _ = pad_seq(word_ids) 126 | word_ids = np.asarray(word_ids, dtype=np.int32) # (batch_size, w_seq_len) 127 | # process char ids 128 | char_ids, _ = pad_char_seq(char_ids) 129 | char_ids = np.asarray(char_ids, dtype=np.int32) # (batch_size, w_seq_len, c_seq_len) 130 | # process video features 131 | vfeats, vfeat_lens = pad_video_seq(vfeats) 132 | vfeats = np.asarray(vfeats, dtype=np.float32) # (batch_size, v_seq_len, v_dim) 133 | vfeat_lens = np.asarray(vfeat_lens, dtype=np.int32) # (batch_size, ) 134 | return vfeats, vfeat_lens, word_ids, char_ids 135 | -------------------------------------------------------------------------------- /model/VSLNet.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from model.ops import create_optimizer, count_params 3 | from model.layers import word_embedding_lookup, char_embedding_lookup, conv1d, video_query_attention, highlight_layer 4 | from model.layers import context_query_concat, feature_encoder, conditioned_predictor, localization_loss 5 | 6 | if tf.__version__.startswith('2'): 7 | tf = tf.compat.v1 8 | tf.disable_v2_behavior() 9 | tf.disable_eager_execution() 10 | 11 | 12 | class VSLNet: 13 | def __init__(self, configs, graph, vectors): 14 | self.configs = configs 15 | graph = graph if graph is not None else tf.Graph() 16 | with graph.as_default(): 17 | self.global_step = tf.train.create_global_step() 18 | self._add_placeholders() 19 | self._build_model(vectors) 20 | if configs.mode == 'train': 21 | print('\x1b[1;33m' + 'Total trainable parameters: {}'.format(count_params()) + '\x1b[0m', flush=True) 22 | else: 23 | print('\x1b[1;33m' + 'Total parameters: {}'.format(count_params()) + '\x1b[0m', flush=True) 24 | 25 | def _add_placeholders(self): 26 | self.video_inputs = tf.placeholder(dtype=tf.float32, shape=[None, None, self.configs.video_feature_dim], 27 | name='video_inputs') 28 | self.video_seq_length = tf.placeholder(dtype=tf.int32, shape=[None], name='video_sequence_length') 29 | self.word_ids = tf.placeholder(dtype=tf.int32, shape=[None, None], name='word_ids') 30 | self.char_ids = tf.placeholder(dtype=tf.int32, shape=[None, None, None], name='char_ids') 31 | self.highlight_labels = tf.placeholder(dtype=tf.int32, shape=[None, None], name='highlight_labels') 32 | self.y1 = tf.placeholder(dtype=tf.int32, shape=[None, None], name='start_indexes') 33 | self.y2 = tf.placeholder(dtype=tf.int32, shape=[None, None], name='end_indexes') 34 | # hyper-parameters 35 | self.drop_rate = tf.placeholder_with_default(input=0.0, shape=[], name='dropout_rate') 36 | # create mask 37 | self.v_mask = tf.sequence_mask(lengths=self.video_seq_length, 
maxlen=tf.reduce_max(self.video_seq_length), 38 | dtype=tf.int32) 39 | self.q_mask = tf.cast(tf.cast(self.word_ids, dtype=tf.bool), dtype=tf.int32) 40 | 41 | def _build_model(self, vectors): 42 | # word embedding & visual features 43 | word_emb = word_embedding_lookup(self.word_ids, dim=self.configs.word_dim, drop_rate=self.drop_rate, 44 | vectors=vectors, finetune=False, reuse=False, name='word_embeddings') 45 | char_emb = char_embedding_lookup(self.char_ids, char_size=self.configs.char_size, dim=self.configs.char_dim, 46 | kernels=[1, 2, 3, 4], filters=[10, 20, 30, 40], drop_rate=self.drop_rate, 47 | activation=tf.nn.relu, reuse=False, name='char_embeddings') 48 | word_emb = tf.concat([word_emb, char_emb], axis=-1) 49 | video_features = tf.nn.dropout(self.video_inputs, rate=self.drop_rate) 50 | # feature projection (map both word and video feature to the same dimension) 51 | vfeats = conv1d(video_features, dim=self.configs.hidden_size, use_bias=True, reuse=False, name='video_conv1d') 52 | qfeats = conv1d(word_emb, dim=self.configs.hidden_size, use_bias=True, reuse=False, name='query_conv1d') 53 | # feature encoder 54 | vfeats = feature_encoder(vfeats, hidden_size=self.configs.hidden_size, num_heads=self.configs.num_heads, 55 | max_position_length=self.configs.max_pos_len, drop_rate=self.drop_rate, 56 | mask=self.v_mask, reuse=False, name='feature_encoder') 57 | qfeats = feature_encoder(qfeats, hidden_size=self.configs.hidden_size, num_heads=self.configs.num_heads, 58 | max_position_length=self.configs.max_pos_len, drop_rate=self.drop_rate, 59 | mask=self.q_mask, reuse=True, name='feature_encoder') 60 | # video query attention 61 | outputs, self.vq_score = video_query_attention(vfeats, qfeats, self.v_mask, self.q_mask, reuse=False, 62 | drop_rate=self.drop_rate, name='video_query_attention') 63 | # weighted pooling and concatenation 64 | outputs = context_query_concat(outputs, qfeats, q_mask=self.q_mask, reuse=False, name='context_query_concat') 65 | # highlighting layer 66 | self.highlight_loss, self.highlight_scores = highlight_layer(outputs, self.highlight_labels, mask=self.v_mask, 67 | reuse=False, name='highlighting_layer') 68 | outputs = tf.multiply(outputs, tf.expand_dims(self.highlight_scores, axis=-1)) 69 | # prediction layer 70 | start_logits, end_logits = conditioned_predictor(outputs, hidden_size=self.configs.hidden_size, 71 | seq_len=self.video_seq_length, mask=self.v_mask, 72 | num_heads=self.configs.num_heads, drop_rate=self.drop_rate, 73 | max_position_length=self.configs.max_pos_len, reuse=False, 74 | mode=self.configs.predictor, name='conditioned_predictor') 75 | # compute localization loss 76 | self.start_prob, self.end_prob, self.start_index, self.end_index, self.loss = localization_loss( 77 | start_logits, end_logits, self.y1, self.y2) 78 | # add l2 regularizer loss (uncomment if required) 79 | l2_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES) 80 | self.loss += tf.reduce_sum(l2_losses) 81 | # collect regularization losses 82 | self.total_loss = self.loss + self.configs.highlight_lambda * self.highlight_loss 83 | # create optimizer 84 | if self.configs.warmup_proportion > 1.0: 85 | num_warmup_steps = int(self.configs.warmup_proportion) 86 | else: 87 | num_warmup_steps = int(self.configs.num_train_steps * self.configs.warmup_proportion) 88 | self.train_op = create_optimizer(self.total_loss, self.configs.init_lr, self.configs.num_train_steps, 89 | num_warmup_steps, clip_norm=self.configs.clip_norm) 90 | 
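To make the training entry point easier to follow, below is a minimal sketch of how this TensorFlow graph is typically driven with the `TrainLoader`, `get_feed_dict`, and `set_tf_config` helpers from `util/`. It is only a sketch, not the repository's `main.py`: the config fields used here (`seed`, `gpu_idx`, `epochs`, `drop_rate`) and the dataset/feature/embedding inputs are assumptions.
```python
# A minimal training-loop sketch (not the repository's main.py). The config fields
# (seed, gpu_idx, epochs, drop_rate) and the inputs are assumptions for illustration.
import tensorflow as tf
from model.VSLNet import VSLNet
from util.data_loader import TrainLoader
from util.runner_utils import get_feed_dict, set_tf_config

if tf.__version__.startswith('2'):  # same TF1-compat shim used in model/ and util/
    tf = tf.compat.v1
    tf.disable_v2_behavior()
    tf.disable_eager_execution()


def train(configs, train_set, visual_features, vectors):
    # fix the random seeds and select the GPU, then wrap the training split into a batch iterator
    set_tf_config(seed=configs.seed, gpu_idx=configs.gpu_idx)
    train_loader = TrainLoader(dataset=train_set, visual_features=visual_features, configs=configs)
    # build the VSLNet graph defined above into its own tf.Graph
    graph = tf.Graph()
    model = VSLNet(configs, graph=graph, vectors=vectors)
    with graph.as_default(), tf.Session(graph=graph) as sess:
        sess.run(tf.global_variables_initializer())
        for epoch in range(configs.epochs):
            for batch_data in train_loader.batch_iter():
                # get_feed_dict maps the padded batch onto the model placeholders
                feed_dict = get_feed_dict(batch_data, model, drop_rate=configs.drop_rate, mode='train')
                _, loss, step = sess.run([model.train_op, model.total_loss, model.global_step],
                                         feed_dict=feed_dict)
```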
-------------------------------------------------------------------------------- /prepare/README.md: -------------------------------------------------------------------------------- 1 | # Extract Features 2 | 3 | - We use the pre-trained 3D ConvNets ([here](https://github.com/piergiaj/pytorch-i3d)) to prepare the visual features; the 4 | extraction code is placed in this folder. Please download the pre-trained weights [`rgb_charades.pt`]( 5 | https://github.com/piergiaj/pytorch-i3d/blob/master/models/rgb_charades.pt) and [`rgb_imagenet.pt`]( 6 | https://github.com/piergiaj/pytorch-i3d/blob/master/models/rgb_imagenet.pt). 7 | - The pre-trained GloVe embeddings are available [here](https://nlp.stanford.edu/projects/glove/); please download 8 | `glove.840B.300d.zip`, unzip it, and put it under the `data/` folder. 9 | 10 | ## Charades STA 11 | The train/test datasets of Charades-STA are available at [[jiyanggao/TALL]](https://github.com/jiyanggao/TALL) 12 | ([`charades_sta_train.txt`](https://drive.google.com/file/d/1ZjG7wJpPSMIBYnW7BAG2u9VVEoNvFm5c/view) and 13 | [`charades_sta_test.txt`](https://drive.google.com/file/d/1QG4MXFkoj6JFU0YK5olTY75xTARKSW5e/view)). 14 | 15 | The `charades.json` file is required ([here](https://github.com/piergiaj/super-events-cvpr18/blob/master/data/charades.json)), 16 | which contains the video length information. Download it and place it in the same directory as the train/test datasets. 17 | 18 | The videos/images for the Charades-STA dataset are available [here](https://allenai.org/plato/charades/); please download 19 | either `RGB frames at 24fps (76 GB)` (image frames) or `Data (original size) (55 GB)` (videos). For the latter, the 20 | extractor will automatically decompose the videos into images. 21 | ```shell script 22 | # download RGB frames 23 | wget http://ai2-website.s3.amazonaws.com/data/Charades_v1_rgb.tar 24 | # or, download videos 25 | wget http://ai2-website.s3.amazonaws.com/data/Charades_v1.zip 26 | ``` 27 | 28 | Extract visual features for Charades-STA: 29 | ```shell script 30 | # use the weights fine-tuned on Charades or the weights pre-trained on ImageNet 31 | python3 extract_charades.py --use_finetuned --load_model <model_dir>/rgb_charades.pt \ # or rgb_imagenet.pt 32 | --video_dir <video_dir> \ 33 | --dataset_dir <dataset_dir> \ 34 | --images_dir <images_dir> \ # if images do not exist, decompose videos into images 35 | --save_dir <save_dir> \ 36 | --fps 24 --strides 24 --remove_images # optionally remove the extracted images to release space 37 | ``` 38 | 39 | ## TACoS 40 | The TACoS dataset is from [[jiyanggao/TALL]](https://github.com/jiyanggao/TALL), while the videos of TACoS are from the MPII 41 | Cooking Composite Activities dataset, which can be downloaded [here]( 42 | https://www.mpi-inf.mpg.de/departments/computer-vision-and-machine-learning/research/human-activity-recognition/mpii-cooking-composite-activities/). 43 | Note that we also use the processed TACoS dataset in [[microsoft/2D-TAN]](https://github.com/microsoft/2D-TAN). 
44 | 45 | Extract visual features for TACoS: 46 | ```shell script 47 | python3 extract_tacos.py --load_model /rgb_imagenet.pt \ 48 | --video_dir \ 49 | --dataset_dir \ 50 | --images_dir \ # if images do not exist, decompose videos into images 51 | --save_dir \ 52 | --strides 16 --remove_images # whether to remove extracted images to release space 53 | ``` 54 | 55 | (Optional) Convert the pre-trained C3D visual features from [[jiyanggao/TALL]](https://github.com/jiyanggao/TALL) 56 | ([Interval64_128_256_512_overlap0.8_c3d_fc6.tar](https://drive.google.com/file/d/1zQp0aYGFCm8PqqHOh4UtXfy2U3pJMBeu/view), 57 | [Interval128_256_overlap0.8_c3d_fc6.tar](https://drive.google.com/file/d/1zC-UrspRf42Qiu5prQw4fQrbgLQfJN-P/view)): 58 | ```shell script 59 | python3 extract_tacos_org.py --data_path \ 60 | --feature_path \ 61 | --save_dir \ 62 | --sample_rate 64 # sliding windows 63 | ``` 64 | 65 | ## ActivityNet Captions 66 | The train/test sets of ActivityNet Captions are available [here]( 67 | https://cs.stanford.edu/people/ranjaykrishna/densevid/). The videos can be downloaded using: 68 | ```shell script 69 | python3 download_activitynet_video.py --video_dir \ 70 | --dataset_dir \ 71 | --bash_file 72 | ``` 73 | This generates a bash file containing the commands to download all the videos. Suppose the generated bash file is 74 | `video_downloader.sh`; simply run `bash video_downloader.sh`, and the videos will be downloaded and saved into 75 | `video_dir` automatically. 76 | 77 | Extract visual features for ActivityNet Captions: 78 | ```shell script 79 | python3 extract_activitynet.py --load_model /rgb_imagenet.pt \ 80 | --video_dir \ 81 | --dataset_dir \ 82 | --images_dir \ # if images do not exist, decompose videos into images 83 | --save_dir \ 84 | --strides 16 --remove_images # whether to remove extracted images to release space 85 | ``` 86 | 87 | (Optional) We also provide code to convert the C3D visual features from the [ActivityNet official website]( 88 | http://activity-net.org/challenges/2016/download.html): 89 | 90 | - download the C3D visual features 91 | ```shell script 92 | wget http://ec2-52-25-205-214.us-west-2.compute.amazonaws.com/data/challenge16/features/c3d/activitynet_v1-3.part-00 93 | wget http://ec2-52-25-205-214.us-west-2.compute.amazonaws.com/data/challenge16/features/c3d/activitynet_v1-3.part-01 94 | wget http://ec2-52-25-205-214.us-west-2.compute.amazonaws.com/data/challenge16/features/c3d/activitynet_v1-3.part-02 95 | wget http://ec2-52-25-205-214.us-west-2.compute.amazonaws.com/data/challenge16/features/c3d/activitynet_v1-3.part-03 96 | wget http://ec2-52-25-205-214.us-west-2.compute.amazonaws.com/data/challenge16/features/c3d/activitynet_v1-3.part-04 97 | wget http://ec2-52-25-205-214.us-west-2.compute.amazonaws.com/data/challenge16/features/c3d/activitynet_v1-3.part-05 98 | cat activitynet_v1-3.part-* > features.zip && unzip features.zip 99 | rm features.zip 100 | rm activitynet_v1-3.part-* 101 | ``` 102 | - convert the features: 103 | ```shell script 104 | python3 extract_activitynet_org.py --dataset_dir \ 105 | --hdf5_file \ 106 | --save_dir 107 | ``` 108 | -------------------------------------------------------------------------------- /util/data_util.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | import json 4 | import pickle 5 | import numpy as np 6 | from tqdm import tqdm 7 | 8 | 9 | def load_json(filename): 10 | with open(filename, mode='r', encoding='utf-8') as f: 11 | data = json.load(f)
12 | return data 13 | 14 | 15 | def save_json(data, filename, save_pretty=False, sort_keys=False): 16 | with open(filename, mode='w', encoding='utf-8') as f: 17 | if save_pretty: 18 | f.write(json.dumps(data, indent=4, sort_keys=sort_keys)) 19 | else: 20 | json.dump(data, f) 21 | 22 | 23 | def load_lines(filename): 24 | with open(filename, mode='r', encoding='utf-8') as f: 25 | return [e.strip("\n") for e in f.readlines()] 26 | 27 | 28 | def save_lines(data, filename): 29 | with open(filename, mode='w', encoding='utf-8') as f: 30 | f.write("\n".join(data)) 31 | 32 | 33 | def load_pickle(filename): 34 | with open(filename, mode='rb') as handle: 35 | data = pickle.load(handle) 36 | return data 37 | 38 | 39 | def save_pickle(data, filename): 40 | with open(filename, mode='wb') as handle: 41 | pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL) 42 | 43 | 44 | def load_video_features(root, max_position_length): 45 | video_features = dict() 46 | filenames = glob.glob(os.path.join(root, "*.npy")) 47 | for filename in tqdm(filenames, total=len(filenames), desc="load video features"): 48 | video_id = filename.split("/")[-1].split(".")[0] 49 | feature = np.load(filename) 50 | if max_position_length is None: 51 | video_features[video_id] = feature 52 | else: 53 | new_feature = visual_feature_sampling(feature, max_num_clips=max_position_length) 54 | video_features[video_id] = new_feature 55 | return video_features 56 | 57 | 58 | def visual_feature_sampling(visual_feature, max_num_clips): 59 | num_clips = visual_feature.shape[0] 60 | if num_clips <= max_num_clips: 61 | return visual_feature 62 | idxs = np.arange(0, max_num_clips + 1, 1.0) / max_num_clips * num_clips 63 | idxs = np.round(idxs).astype(np.int32) 64 | idxs[idxs > num_clips - 1] = num_clips - 1 65 | new_visual_feature = [] 66 | for i in range(max_num_clips): 67 | s_idx, e_idx = idxs[i], idxs[i + 1] 68 | if s_idx < e_idx: 69 | new_visual_feature.append(np.mean(visual_feature[s_idx:e_idx], axis=0)) 70 | else: 71 | new_visual_feature.append(visual_feature[s_idx]) 72 | new_visual_feature = np.asarray(new_visual_feature) 73 | return new_visual_feature 74 | 75 | 76 | def compute_overlap(pred, gt): 77 | # check format 78 | assert isinstance(pred, list) and isinstance(gt, list) 79 | pred_is_list = isinstance(pred[0], list) 80 | gt_is_list = isinstance(gt[0], list) 81 | pred = pred if pred_is_list else [pred] 82 | gt = gt if gt_is_list else [gt] 83 | # compute overlap 84 | pred, gt = np.array(pred), np.array(gt) 85 | inter_left = np.maximum(pred[:, 0, None], gt[None, :, 0]) 86 | inter_right = np.minimum(pred[:, 1, None], gt[None, :, 1]) 87 | inter = np.maximum(0.0, inter_right - inter_left) 88 | union_left = np.minimum(pred[:, 0, None], gt[None, :, 0]) 89 | union_right = np.maximum(pred[:, 1, None], gt[None, :, 1]) 90 | union = np.maximum(1e-12, union_right - union_left) 91 | overlap = 1.0 * inter / union 92 | # reformat output 93 | overlap = overlap if gt_is_list else overlap[:, 0] 94 | overlap = overlap if pred_is_list else overlap[0] 95 | return overlap 96 | 97 | 98 | def time_to_index(start_time, end_time, num_units, duration): 99 | s_times = np.arange(0, num_units).astype(np.float32) / float(num_units) * duration 100 | e_times = np.arange(1, num_units + 1).astype(np.float32) / float(num_units) * duration 101 | candidates = np.stack([np.repeat(s_times[:, None], repeats=num_units, axis=1), 102 | np.repeat(e_times[None, :], repeats=num_units, axis=0)], axis=2).reshape((-1, 2)) 103 | overlaps = compute_overlap(candidates.tolist(), 
[start_time, end_time]).reshape(num_units, num_units) 104 | start_index = np.argmax(overlaps) // num_units 105 | end_index = np.argmax(overlaps) % num_units 106 | return start_index, end_index, overlaps 107 | 108 | 109 | def index_to_time(start_index, end_index, num_units, duration): 110 | s_times = np.arange(0, num_units).astype(np.float32) * duration / float(num_units) 111 | e_times = np.arange(1, num_units + 1).astype(np.float32) * duration / float(num_units) 112 | start_time = s_times[start_index] 113 | end_time = e_times[end_index] 114 | return start_time, end_time 115 | 116 | 117 | def pad_seq(sequences, pad_tok=None, max_length=None): 118 | if pad_tok is None: 119 | pad_tok = 0 # 0: "PAD" for words and chars, "PAD" for tags 120 | if max_length is None: 121 | max_length = max([len(seq) for seq in sequences]) 122 | sequence_padded, sequence_length = [], [] 123 | for seq in sequences: 124 | seq_ = seq[:max_length] + [pad_tok] * max(max_length - len(seq), 0) 125 | sequence_padded.append(seq_) 126 | sequence_length.append(min(len(seq), max_length)) 127 | return sequence_padded, sequence_length 128 | 129 | 130 | def pad_char_seq(sequences, max_length=None, max_length_2=None): 131 | sequence_padded, sequence_length = [], [] 132 | if max_length is None: 133 | max_length = max(map(lambda x: len(x), sequences)) 134 | if max_length_2 is None: 135 | max_length_2 = max([max(map(lambda x: len(x), seq)) for seq in sequences]) 136 | for seq in sequences: 137 | sp, sl = pad_seq(seq, max_length=max_length_2) 138 | sequence_padded.append(sp) 139 | sequence_length.append(sl) 140 | sequence_padded, _ = pad_seq(sequence_padded, pad_tok=[0] * max_length_2, max_length=max_length) 141 | sequence_length, _ = pad_seq(sequence_length, max_length=max_length) 142 | return sequence_padded, sequence_length 143 | 144 | 145 | def pad_video_seq(sequences, max_length=None): 146 | if max_length is None: 147 | max_length = max([vfeat.shape[0] for vfeat in sequences]) 148 | feature_length = sequences[0].shape[1] 149 | sequence_padded, sequence_length = [], [] 150 | for seq in sequences: 151 | add_length = max_length - seq.shape[0] 152 | sequence_length.append(seq.shape[0]) 153 | if add_length > 0: 154 | add_feature = np.zeros(shape=[add_length, feature_length], dtype=np.float32) 155 | seq_ = np.concatenate([seq, add_feature], axis=0) 156 | else: 157 | seq_ = seq 158 | sequence_padded.append(seq_) 159 | return sequence_padded, sequence_length 160 | -------------------------------------------------------------------------------- /prepare/extract_tacos.py: -------------------------------------------------------------------------------- 1 | import os 2 | import cv2 3 | import glob 4 | import json 5 | import torch 6 | import argparse 7 | import subprocess 8 | import numpy as np 9 | from . 
import videotransforms 10 | from .feature_extractor import InceptionI3d 11 | from torchvision import transforms 12 | from torch.autograd import Variable 13 | 14 | parser = argparse.ArgumentParser() 15 | parser.add_argument("--gpu_idx", type=str, default="0", help="gpu index") 16 | parser.add_argument("--load_model", type=str, required=True, help="pre-trained model") 17 | parser.add_argument("--video_dir", type=str, required=True, help="where are located the videos") 18 | parser.add_argument("--images_dir", type=str, required=True, help="where to save extracted images") 19 | parser.add_argument("--save_dir", type=str, required=True, help="where to save extracted features") 20 | parser.add_argument("--fps", type=float, default=None, help="frames per second") # TACoS's default fps is 29.4 21 | parser.add_argument("--video_format", type=str, default="avi", help="video format") 22 | parser.add_argument("--strides", type=int, default=16, help="window size") 23 | parser.add_argument("--remove_images", action="store_true", help="whether remove extract images to release space") 24 | args = parser.parse_args() 25 | 26 | os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" 27 | os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu_idx 28 | 29 | 30 | def load_images(img_dir, vid, start_frame, lengths): 31 | img_frames, raw_height, raw_width = [], None, None 32 | for x in range(start_frame, start_frame + lengths): 33 | image = cv2.imread(os.path.join(img_dir, "{}-{}.jpg".format(vid, str(x).zfill(6))))[:, :, [2, 1, 0]] 34 | width, height, channel = image.shape 35 | raw_width, raw_height = width, height 36 | # resize image 37 | scale = 1 + (224.0 - min(width, height)) / min(width, height) 38 | image = cv2.resize(image, dsize=(0, 0), fx=scale, fy=scale) 39 | # normalize image to [0, 1] 40 | image = (image / 255.0) * 2 - 1 41 | img_frames.append(image) 42 | return img_frames, raw_width, raw_height 43 | 44 | 45 | def extract_features(image_tensor, model, strides): 46 | b, c, t, h, w = image_tensor.shape 47 | extracted_features = [] 48 | for start in range(0, t, strides): 49 | end = min(t - 1, start + strides) 50 | if end - start < strides: 51 | start = max(0, end - strides) 52 | ip = Variable(torch.from_numpy(image_tensor.numpy()[:, :, start:end]).cuda(), volatile=True) 53 | feature = model.extract_features(ip).data.cpu().numpy() 54 | extracted_features.append(feature) 55 | extracted_features = np.concatenate(extracted_features, axis=0) 56 | return extracted_features 57 | 58 | 59 | if not os.path.exists(args.video_dir): 60 | raise ValueError("The video directory '{}' does not exist!!!".format(args.video_dir)) 61 | 62 | if not os.path.exists(args.images_dir): 63 | os.makedirs(args.images_dir) 64 | 65 | if not os.path.exists(args.save_dir): 66 | os.makedirs(args.save_dir) 67 | 68 | # create I3D model and load pre-trained model 69 | i3d_model = InceptionI3d(400, in_channels=3) 70 | i3d_model.load_state_dict(torch.load(args.load_model)) 71 | i3d_model.cuda() 72 | i3d_model.train(False) 73 | video_transforms = transforms.Compose([videotransforms.CenterCrop(224)]) 74 | 75 | # extract images and features 76 | feature_shapes = dict() 77 | video_paths = glob.glob(os.path.join(args.video_dir, "*.{}".format(args.video_format))) 78 | for idx, video_path in enumerate(video_paths): 79 | video_id = os.path.basename(video_path)[0:-4] # remove suffix 80 | image_dir = os.path.join(args.images_dir, video_id) 81 | 82 | print("{} / {}: extract features for video {}".format(idx + 1, len(video_paths), video_id), flush=True) 83 | 84 | if 
os.path.exists(os.path.join(args.save_dir, "{}.npy".format(video_id))): 85 | print("the visual features for video {} are exist in {}...".format(video_id, args.save_dir), flush=True) 86 | continue 87 | 88 | # extract images 89 | if os.path.exists(image_dir): 90 | print("the images for video {} already are exist in {}...".format(video_id, args.images_dir)) 91 | else: 92 | os.makedirs(image_dir) 93 | print("extract images with fps={}...".format(args.fps), flush=True) 94 | subprocess.call("ffmpeg -hide_banner -loglevel panic -i {} {}/{}-%6d.jpg".format(video_path, image_dir, 95 | video_id), shell=True) 96 | 97 | # process extracted images 98 | print("load RGB frames...", flush=True) 99 | num_frames = len(os.listdir(image_dir)) 100 | 101 | if num_frames < 10000: 102 | frames, raw_w, raw_h = load_images(image_dir, video_id, 1, num_frames) 103 | frames = np.asarray(frames, dtype=np.float32) 104 | imgs = video_transforms(frames) 105 | img_tensor = torch.from_numpy(np.expand_dims(imgs.transpose([3, 0, 1, 2]), axis=0)) 106 | print("process images:", (frames.shape[0], raw_w, raw_h, frames.shape[-1]), "-->", frames.shape, "-->", 107 | imgs.shape, "-->", tuple(img_tensor.size()), flush=True) 108 | 109 | print("extract visual features...", flush=True) 110 | features = extract_features(img_tensor, i3d_model, args.strides) 111 | np.save(os.path.join(args.save_dir, video_id), arr=features) 112 | print("extracted features shape: {}".format(features.shape), flush=True) 113 | feature_shapes[video_id] = features.shape[0] 114 | 115 | else: 116 | all_features = [] 117 | for start_idx in range(1, num_frames, 10000): 118 | end_idx = min(start_idx + 10000, num_frames + 1) 119 | cur_num_frames = end_idx - start_idx 120 | if cur_num_frames < args.strides: 121 | cur_num_frames = args.strides 122 | start_idx = end_idx - cur_num_frames 123 | frames, raw_w, raw_h = load_images(image_dir, video_id, start_idx, cur_num_frames) 124 | frames = np.asarray(frames, dtype=np.float32) 125 | imgs = video_transforms(frames) 126 | img_tensor = torch.from_numpy(np.expand_dims(imgs.transpose([3, 0, 1, 2]), axis=0)) 127 | print("process images:", (frames.shape[0], raw_w, raw_h, frames.shape[-1]), "-->", frames.shape, "-->", 128 | imgs.shape, "-->", tuple(img_tensor.size()), flush=True) 129 | print("extract visual features...", flush=True) 130 | features = extract_features(img_tensor, i3d_model, args.strides) 131 | all_features.append(features) 132 | all_features = np.concatenate(all_features, axis=0) 133 | np.save(os.path.join(args.save_dir, video_id), arr=all_features) 134 | print("extracted features shape: {}".format(all_features.shape), flush=True) 135 | feature_shapes[video_id] = all_features.shape[0] 136 | 137 | if args.remove_images: 138 | # remove extract images to release memory space 139 | subprocess.call("rm -rf {}".format(image_dir), shell=True) 140 | 141 | with open(os.path.join(args.save_dir, "feature_shapes.json"), mode="w", encoding="utf-8") as f: 142 | json.dump(feature_shapes, f) 143 | -------------------------------------------------------------------------------- /prepare/extract_activitynet.py: -------------------------------------------------------------------------------- 1 | import os 2 | import cv2 3 | import glob 4 | import json 5 | import torch 6 | import argparse 7 | import subprocess 8 | import numpy as np 9 | from . 
import videotransforms 10 | from .feature_extractor import InceptionI3d 11 | from torchvision import transforms 12 | from torch.autograd import Variable 13 | 14 | parser = argparse.ArgumentParser() 15 | parser.add_argument("--gpu_idx", type=str, default="0", help="gpu index") 16 | parser.add_argument("--load_model", type=str, required=True, help="pre-trained model") 17 | parser.add_argument("--video_dir", type=str, required=True, help="where are located the videos") 18 | parser.add_argument("--images_dir", type=str, required=True, help="where to save extracted images") 19 | parser.add_argument("--save_dir", type=str, required=True, help="where to save extracted features") 20 | parser.add_argument("--fps", type=int, default=None, help="frames per second") 21 | parser.add_argument("--video_format", type=str, default="mp4", help="video format") 22 | parser.add_argument("--strides", type=int, default=16, help="window size") 23 | parser.add_argument("--remove_images", action="store_true", help="whether remove extract images to release space") 24 | args = parser.parse_args() 25 | 26 | os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" 27 | os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu_idx 28 | 29 | 30 | def load_images(img_dir, vid, start_frame, lengths): 31 | img_frames, raw_height, raw_width = [], None, None 32 | for x in range(start_frame, start_frame + lengths): 33 | image = cv2.imread(os.path.join(img_dir, "{}-{}.jpg".format(vid, str(x).zfill(6))))[:, :, [2, 1, 0]] 34 | width, height, channel = image.shape 35 | raw_width, raw_height = width, height 36 | # resize image 37 | scale = 1 + (224.0 - min(width, height)) / min(width, height) 38 | image = cv2.resize(image, dsize=(0, 0), fx=scale, fy=scale) 39 | # normalize image to [0, 1] 40 | image = (image / 255.0) * 2 - 1 41 | img_frames.append(image) 42 | return img_frames, raw_width, raw_height 43 | 44 | 45 | def extract_features(image_tensor, model, strides): 46 | b, c, t, h, w = image_tensor.shape 47 | extracted_features = [] 48 | for start in range(0, t, strides): 49 | end = min(t - 1, start + strides) 50 | if end - start < strides: 51 | start = max(0, end - strides) 52 | ip = Variable(torch.from_numpy(image_tensor.numpy()[:, :, start:end]).cuda(), volatile=True) 53 | feature = model.extract_features(ip).data.cpu().numpy() 54 | extracted_features.append(feature) 55 | extracted_features = np.concatenate(extracted_features, axis=0) 56 | return extracted_features 57 | 58 | 59 | if not os.path.exists(args.video_dir): 60 | raise ValueError("The video directory '{}' does not exist!!!".format(args.video_dir)) 61 | 62 | if not os.path.exists(args.images_dir): 63 | os.makedirs(args.images_dir) 64 | 65 | if not os.path.exists(args.save_dir): 66 | os.makedirs(args.save_dir) 67 | 68 | # create I3D model and load pre-trained model 69 | i3d_model = InceptionI3d(400, in_channels=3) 70 | i3d_model.load_state_dict(torch.load(args.load_model)) 71 | i3d_model.cuda() 72 | i3d_model.train(False) 73 | video_transforms = transforms.Compose([videotransforms.CenterCrop(224)]) 74 | 75 | # extract images and features 76 | feature_shapes = dict() 77 | video_paths = glob.glob(os.path.join(args.video_dir, "*.{}".format(args.video_format))) 78 | for idx, video_path in enumerate(video_paths): 79 | video_id = os.path.basename(video_path)[0:-4] # remove suffix 80 | image_dir = os.path.join(args.images_dir, video_id) 81 | 82 | print("{} / {}: extract features for video {}".format(idx + 1, len(video_paths), video_id), flush=True) 83 | 84 | if 
os.path.exists(os.path.join(args.save_dir, "{}.npy".format(video_id))): 85 | print("the visual features for video {} are exist in {}...\n".format(video_id, args.save_dir), flush=True) 86 | continue 87 | 88 | # extract images 89 | if os.path.exists(image_dir): 90 | print("the images for video {} already are exist in {}...".format(video_id, args.images_dir)) 91 | else: 92 | os.makedirs(image_dir) 93 | print("extract images with fps={}...".format(args.fps), flush=True) 94 | if args.fps is None or args.fps <= 0: 95 | subprocess.call("ffmpeg -hide_banner -loglevel panic -i {} {}/{}-%6d.jpg".format( 96 | video_path, image_dir, video_id), shell=True) 97 | else: 98 | subprocess.call("ffmpeg -hide_banner -loglevel panic -i {} -filter:v fps=fps={} {}/{}-%6d.jpg".format( 99 | video_path, args.fps, image_dir, video_id), shell=True) 100 | 101 | # process extracted images 102 | print("load RGB frames...", flush=True) 103 | num_frames = len(os.listdir(image_dir)) 104 | 105 | if num_frames < 10000: 106 | frames, raw_w, raw_h = load_images(image_dir, video_id, 1, num_frames) 107 | frames = np.asarray(frames, dtype=np.float32) 108 | imgs = video_transforms(frames) 109 | img_tensor = torch.from_numpy(np.expand_dims(imgs.transpose([3, 0, 1, 2]), axis=0)) 110 | print("process images:", (frames.shape[0], raw_w, raw_h, frames.shape[-1]), "-->", frames.shape, "-->", 111 | imgs.shape, "-->", tuple(img_tensor.size()), flush=True) 112 | 113 | print("extract visual features...", flush=True) 114 | features = extract_features(img_tensor, i3d_model, args.strides) 115 | np.save(os.path.join(args.save_dir, video_id), arr=features) 116 | print("extracted features shape: {}".format(features.shape), flush=True) 117 | feature_shapes[video_id] = features.shape[0] 118 | 119 | else: 120 | all_features = [] 121 | for start_idx in range(1, num_frames, 10000): 122 | end_idx = min(start_idx + 10000, num_frames + 1) 123 | cur_num_frames = end_idx - start_idx 124 | if cur_num_frames < args.strides: 125 | cur_num_frames = args.strides 126 | start_idx = end_idx - cur_num_frames 127 | frames, raw_w, raw_h = load_images(image_dir, video_id, start_idx, cur_num_frames) 128 | frames = np.asarray(frames, dtype=np.float32) 129 | imgs = video_transforms(frames) 130 | img_tensor = torch.from_numpy(np.expand_dims(imgs.transpose([3, 0, 1, 2]), axis=0)) 131 | print("process images:", (frames.shape[0], raw_w, raw_h, frames.shape[-1]), "-->", frames.shape, "-->", 132 | imgs.shape, "-->", tuple(img_tensor.size()), flush=True) 133 | print("extract visual features...", flush=True) 134 | features = extract_features(img_tensor, i3d_model, args.strides) 135 | all_features.append(features) 136 | all_features = np.concatenate(all_features, axis=0) 137 | np.save(os.path.join(args.save_dir, video_id), arr=all_features) 138 | print("extracted features shape: {}".format(all_features.shape), flush=True) 139 | feature_shapes[video_id] = all_features.shape[0] 140 | 141 | if args.remove_images: 142 | # remove extract images to release memory space 143 | subprocess.call("rm -rf {}".format(image_dir), shell=True) 144 | 145 | with open(os.path.join(args.save_dir, "feature_shapes.json"), mode="w", encoding="utf-8") as f: 146 | json.dump(feature_shapes, f) 147 | -------------------------------------------------------------------------------- /model/ops.py: -------------------------------------------------------------------------------- 1 | import re 2 | import numpy as np 3 | import tensorflow as tf 4 | 5 | if tf.__version__.startswith('2'): 6 | tf = tf.compat.v1 7 | 
tf.disable_v2_behavior() 8 | tf.disable_eager_execution() 9 | regularizer = tf.keras.regularizers.l2(l2=3e-7) 10 | else: 11 | regularizer = tf.contrib.layers.l2_regularizer(scale=3e-7) 12 | 13 | 14 | def count_params(scope=None): 15 | if scope is None: 16 | return int(np.sum([np.prod(v.get_shape().as_list()) for v in tf.trainable_variables()])) 17 | else: 18 | return int(np.sum([np.prod(v.get_shape().as_list()) for v in tf.trainable_variables(scope)])) 19 | 20 | 21 | def get_shape_list(tensor): 22 | shape = tensor.shape.as_list() 23 | non_static_indexes = [] 24 | for (index, dim) in enumerate(shape): 25 | if dim is None: 26 | non_static_indexes.append(index) 27 | if not non_static_indexes: 28 | return shape 29 | dyn_shape = tf.shape(tensor) 30 | for index in non_static_indexes: 31 | shape[index] = dyn_shape[index] 32 | return shape 33 | 34 | 35 | def mask_logits(inputs, mask, mask_value=-1e30): 36 | mask = tf.cast(mask, tf.float32) 37 | return inputs * mask + mask_value * (1.0 - mask) 38 | 39 | 40 | def trilinear_attention(args, v_maxlen, q_maxlen, drop_rate=0.0, reuse=None, name='efficient_trilinear'): 41 | assert len(args) == 2, 'just use for computing attention with two input' 42 | arg0_shape = args[0].get_shape().as_list() 43 | arg1_shape = args[1].get_shape().as_list() 44 | if len(arg0_shape) != 3 or len(arg1_shape) != 3: 45 | raise ValueError('`args` must be 3 dims (batch_size, len, dimension)') 46 | if arg0_shape[2] != arg1_shape[2]: 47 | raise ValueError('the last dimension of `args` must equal') 48 | arg_size = arg0_shape[2] 49 | dtype = args[0].dtype 50 | drop_args = [tf.nn.dropout(arg, rate=drop_rate) for arg in args] 51 | with tf.variable_scope(name, reuse=reuse): 52 | weights4arg0 = tf.get_variable('linear_kernel4arg0', [arg_size, 1], dtype=dtype, regularizer=regularizer) 53 | weights4arg1 = tf.get_variable('linear_kernel4arg1', [arg_size, 1], dtype=dtype, regularizer=regularizer) 54 | weights4mlu = tf.get_variable('linear_kernel4mul', [1, 1, arg_size], dtype=dtype, regularizer=regularizer) 55 | # compute results 56 | weights4arg0 = tf.tile(tf.expand_dims(weights4arg0, axis=0), multiples=[tf.shape(args[0])[0], 1, 1]) 57 | subres0 = tf.tile(tf.matmul(drop_args[0], weights4arg0), [1, 1, q_maxlen]) 58 | weights4arg1 = tf.tile(tf.expand_dims(weights4arg1, axis=0), multiples=[tf.shape(args[1])[0], 1, 1]) 59 | subres1 = tf.tile(tf.transpose(tf.matmul(drop_args[1], weights4arg1), perm=(0, 2, 1)), [1, v_maxlen, 1]) 60 | subres2 = tf.matmul(drop_args[0] * weights4mlu, tf.transpose(drop_args[1], perm=(0, 2, 1))) 61 | res = subres0 + subres1 + subres2 62 | return res 63 | 64 | 65 | def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, clip_norm=1.0): 66 | """Creates an optimizer training op.""" 67 | global_step = tf.train.get_or_create_global_step() 68 | 69 | learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32) 70 | learning_rate = tf.train.polynomial_decay(learning_rate, global_step, num_train_steps, end_learning_rate=0.0, 71 | power=1.0, cycle=False) 72 | if num_warmup_steps: 73 | global_steps_int = tf.cast(global_step, tf.int32) 74 | warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32) 75 | global_steps_float = tf.cast(global_steps_int, tf.float32) 76 | warmup_steps_float = tf.cast(warmup_steps_int, tf.float32) 77 | warmup_percent_done = global_steps_float / warmup_steps_float 78 | warmup_learning_rate = init_lr * warmup_percent_done 79 | is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32) 80 | learning_rate = ((1.0 - 
is_warmup) * learning_rate + is_warmup * warmup_learning_rate) 81 | optimizer = AdamWeightDecayOptimizer(learning_rate=learning_rate, weight_decay_rate=0.01, beta_1=0.9, beta_2=0.999, 82 | epsilon=1e-6, exclude_from_weight_decay=['LayerNorm', 'layer_norm', 'bias']) 83 | tvars = tf.trainable_variables() 84 | grads = tf.gradients(loss, tvars) 85 | (grads, _) = tf.clip_by_global_norm(grads, clip_norm=clip_norm) 86 | train_op = optimizer.apply_gradients(zip(grads, tvars), global_step=global_step) 87 | # Normally the global step update is done inside of `apply_gradients`. However, `AdamWeightDecayOptimizer` doesn't 88 | # do this. But if you use a different optimizer, you should probably take this line out. 89 | new_global_step = global_step + 1 90 | train_op = tf.group(train_op, [global_step.assign(new_global_step)]) 91 | return train_op 92 | 93 | 94 | class AdamWeightDecayOptimizer(tf.train.Optimizer): 95 | """A basic Adam optimizer that includes "correct" L2 weight decay.""" 96 | 97 | def __init__(self, learning_rate, weight_decay_rate=0.0, beta_1=0.9, beta_2=0.999, epsilon=1e-6, 98 | exclude_from_weight_decay=None, name='AdamWeightDecayOptimizer'): 99 | """Constructs a AdamWeightDecayOptimizer.""" 100 | super(AdamWeightDecayOptimizer, self).__init__(False, name) 101 | self.learning_rate = learning_rate 102 | self.weight_decay_rate = weight_decay_rate 103 | self.beta_1 = beta_1 104 | self.beta_2 = beta_2 105 | self.epsilon = epsilon 106 | self.exclude_from_weight_decay = exclude_from_weight_decay 107 | 108 | def apply_gradients(self, grads_and_vars, global_step=None, name=None): 109 | """See base class.""" 110 | assignments = [] 111 | for (grad, param) in grads_and_vars: 112 | if grad is None or param is None: 113 | continue 114 | param_name = self._get_variable_name(param.name) 115 | m = tf.get_variable(name=param_name + '/adam_m', shape=param.shape.as_list(), dtype=tf.float32, 116 | trainable=False, initializer=tf.zeros_initializer()) 117 | v = tf.get_variable(name=param_name + '/adam_v', shape=param.shape.as_list(), dtype=tf.float32, 118 | trainable=False, initializer=tf.zeros_initializer()) 119 | next_m = (tf.multiply(self.beta_1, m) + tf.multiply(1.0 - self.beta_1, grad)) 120 | next_v = (tf.multiply(self.beta_2, v) + tf.multiply(1.0 - self.beta_2, tf.square(grad))) 121 | update = next_m / (tf.sqrt(next_v) + self.epsilon) 122 | if self._do_use_weight_decay(param_name): 123 | update += self.weight_decay_rate * param 124 | update_with_lr = self.learning_rate * update 125 | next_param = param - update_with_lr 126 | assignments.extend([param.assign(next_param), m.assign(next_m), v.assign(next_v)]) 127 | return tf.group(*assignments, name=name) 128 | 129 | def _do_use_weight_decay(self, param_name): 130 | """Whether to use L2 weight decay for `param_name`.""" 131 | if not self.weight_decay_rate: 132 | return False 133 | if self.exclude_from_weight_decay: 134 | for r in self.exclude_from_weight_decay: 135 | if re.search(r, param_name) is not None: 136 | return False 137 | return True 138 | 139 | @staticmethod 140 | def _get_variable_name(param_name): 141 | """Get the variable name from the tensor name.""" 142 | m = re.match("^(.*):\\d+$", param_name) 143 | if m is not None: 144 | param_name = m.group(1) 145 | return param_name 146 | 147 | def _apply_dense(self, grad, var): 148 | pass 149 | 150 | def _resource_apply_dense(self, grad, handle): 151 | pass 152 | 153 | def _resource_apply_sparse(self, grad, handle, indices): 154 | pass 155 | 156 | def _apply_sparse(self, grad, var): 157 | pass 158 
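As a quick reference for the optimizer code above: `create_optimizer` ramps the learning rate linearly from 0 to `init_lr` over the warmup steps, then decays it linearly (polynomial decay with power 1.0) to 0 over `num_train_steps`, and `AdamWeightDecayOptimizer` skips weight decay for any variable whose name matches `LayerNorm`, `layer_norm`, or `bias`. A small pure-Python sketch of that schedule (the concrete numbers are hypothetical, chosen only to illustrate its shape):

```python
def lr_at_step(step, init_lr, num_train_steps, num_warmup_steps):
    # linear warmup towards init_lr, then linear decay to zero
    if num_warmup_steps and step < num_warmup_steps:
        return init_lr * step / num_warmup_steps
    progress = min(step, num_train_steps) / num_train_steps
    return init_lr * (1.0 - progress)


init_lr, num_train_steps, num_warmup_steps = 1e-4, 10000, 1000  # illustrative values
for step in (0, 500, 1000, 5000, 10000):
    print(step, lr_at_step(step, init_lr, num_train_steps, num_warmup_steps))
# 0 -> 0.0, 500 -> 5e-05, 1000 -> 9e-05, 5000 -> 5e-05, 10000 -> 0.0
```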
| -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import tensorflow as tf 4 | from tqdm import tqdm 5 | from model.VSLNet import VSLNet 6 | from util.data_gen import gen_or_load_dataset 7 | from util.data_util import load_video_features, save_json, load_json 8 | from util.data_loader import TrainLoader, TestLoader 9 | from util.runner_utils import set_tf_config, get_feed_dict, write_tf_summary, eval_test 10 | 11 | if tf.__version__.startswith('2'): 12 | tf = tf.compat.v1 13 | tf.disable_v2_behavior() 14 | tf.disable_eager_execution() 15 | 16 | parser = argparse.ArgumentParser() 17 | # data parameters 18 | parser.add_argument('--save_dir', type=str, default='datasets', help='path to save processed dataset') 19 | parser.add_argument('--task', type=str, default='charades', help='target task') 20 | parser.add_argument('--fv', type=str, default='new', help='[new | org] for visual features') 21 | parser.add_argument('--max_pos_len', type=int, default=128, help='maximal position sequence length allowed') 22 | # model parameters 23 | parser.add_argument("--char_size", type=int, default=None, help="number of characters") 24 | parser.add_argument("--word_dim", type=int, default=300, help="word embedding dimension") 25 | parser.add_argument("--video_feature_dim", type=int, default=1024, help="video feature input dimension") 26 | parser.add_argument("--char_dim", type=int, default=50, help="character dimension, set to 100 for activitynet") 27 | parser.add_argument("--hidden_size", type=int, default=128, help="hidden size") 28 | parser.add_argument("--highlight_lambda", type=float, default=5.0, help="lambda for highlight region") 29 | parser.add_argument("--num_heads", type=int, default=8, help="number of heads") 30 | parser.add_argument("--drop_rate", type=float, default=0.2, help="dropout rate") 31 | parser.add_argument('--predictor', type=str, default='rnn', help='[rnn | transformer]') 32 | # training/evaluation parameters 33 | parser.add_argument("--gpu_idx", type=str, default="0", help="GPU index") 34 | parser.add_argument("--seed", type=int, default=12345, help="random seed") 35 | parser.add_argument("--mode", type=str, default="train", help="[train | test]") 36 | parser.add_argument("--epochs", type=int, default=100, help="number of epochs") 37 | parser.add_argument("--batch_size", type=int, default=16, help="batch size") 38 | parser.add_argument("--num_train_steps", type=int, default=None, help="number of training steps") 39 | parser.add_argument("--init_lr", type=float, default=0.0001, help="initial learning rate") 40 | parser.add_argument("--clip_norm", type=float, default=1.0, help="gradient clip norm") 41 | parser.add_argument("--warmup_proportion", type=float, default=0.0, help="warmup proportion") 42 | parser.add_argument("--extend", type=float, default=0.1, help="highlight region extension") 43 | parser.add_argument("--period", type=int, default=100, help="training loss print period") 44 | parser.add_argument('--model_dir', type=str, default='ckpt', help='path to save trained model weights') 45 | parser.add_argument('--model_name', type=str, default='vslnet', help='model name') 46 | parser.add_argument('--suffix', type=str, default=None, help='set to the last `_xxx` in ckpt repo to eval results') 47 | configs = parser.parse_args() 48 | 49 | # set tensorflow configs 50 | set_tf_config(configs.seed, configs.gpu_idx) 51 | 52 | # 
prepare or load dataset 53 | if tf.__version__.startswith('2'): 54 | configs.save_dir = 'datasets_tf2' # avoid `ValueError: unsupported pickle protocol: 5` 55 | configs.model_dir = 'ckpt_tf2' 56 | dataset = gen_or_load_dataset(configs) 57 | configs.char_size = dataset['n_chars'] 58 | 59 | # get train and test loader 60 | visual_features = load_video_features(os.path.join('data', 'features', configs.task, configs.fv), configs.max_pos_len) 61 | train_loader = TrainLoader(dataset=dataset['train_set'], visual_features=visual_features, configs=configs) 62 | test_loader = TestLoader(datasets=dataset, visual_features=visual_features, configs=configs) 63 | configs.num_train_steps = train_loader.num_batches() * configs.epochs 64 | num_train_batches = train_loader.num_batches() 65 | 66 | # create model dir 67 | home_dir = os.path.join(configs.model_dir, '_'.join([configs.model_name, configs.task, configs.fv, 68 | str(configs.max_pos_len), configs.predictor])) 69 | if configs.suffix is not None: 70 | home_dir = home_dir + '_' + configs.suffix 71 | log_dir = os.path.join(home_dir, "event") 72 | model_dir = os.path.join(home_dir, "model") 73 | 74 | # train and test 75 | if configs.mode.lower() == 'train': 76 | if not os.path.exists(model_dir): 77 | os.makedirs(model_dir) 78 | if not os.path.exists(log_dir): 79 | os.makedirs(log_dir) 80 | eval_period = num_train_batches // 2 81 | save_json(vars(configs), os.path.join(model_dir, 'configs.json'), sort_keys=True, save_pretty=True) 82 | with tf.Graph().as_default() as graph: 83 | model = VSLNet(configs, graph=graph, vectors=dataset['word_vector']) 84 | sess_config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) 85 | sess_config.gpu_options.allow_growth = True 86 | with tf.Session(config=sess_config) as sess: 87 | saver = tf.train.Saver(max_to_keep=3) 88 | writer = tf.summary.FileWriter(log_dir) 89 | sess.run(tf.global_variables_initializer()) 90 | best_r1i7 = -1.0 91 | score_writer = open(os.path.join(model_dir, "eval_results.txt"), mode="w", encoding="utf-8") 92 | for epoch in range(configs.epochs): 93 | for data in tqdm(train_loader.batch_iter(), total=num_train_batches, desc='Epoch %3d / 3%d' % ( 94 | epoch + 1, configs.epochs)): 95 | # run the model 96 | feed_dict = get_feed_dict(data, model, drop_rate=configs.drop_rate) 97 | _, loss, h_loss, global_step = sess.run([model.train_op, model.loss, model.highlight_loss, 98 | model.global_step], feed_dict=feed_dict) 99 | if global_step % configs.period == 0: 100 | write_tf_summary(writer, [("train/loss", loss), ("train/highlight_loss", h_loss)], global_step) 101 | # evaluate 102 | if global_step % eval_period == 0 or global_step % num_train_batches == 0: 103 | r1i3, r1i5, r1i7, mi, value_pairs, score_str = eval_test( 104 | sess=sess, model=model, data_loader=test_loader, epoch=epoch + 1, global_step=global_step, 105 | mode="test") 106 | print('\nEpoch: %2d | Step: %5d | r1i3: %.2f | r1i5: %.2f | r1i7: %.2f | mIoU: %.2f' % ( 107 | epoch + 1, global_step, r1i3, r1i5, r1i7, mi), flush=True) 108 | write_tf_summary(writer, value_pairs, global_step) 109 | score_writer.write(score_str) 110 | score_writer.flush() 111 | if r1i7 > best_r1i7: 112 | best_r1i7 = r1i7 113 | filename = os.path.join(model_dir, "{}_{}.ckpt".format(configs.model_name, global_step)) 114 | saver.save(sess, filename) 115 | score_writer.close() 116 | 117 | elif configs.mode.lower() == 'test': 118 | if not os.path.exists(model_dir): 119 | raise ValueError('No pre-trained weights exist') 120 | # load previous configs 121 | 
pre_configs = load_json(os.path.join(model_dir, "configs.json")) 122 | parser.set_defaults(**pre_configs) 123 | configs = parser.parse_args() 124 | with tf.Graph().as_default() as graph: 125 | model = VSLNet(configs, graph=graph, vectors=dataset['word_vector']) 126 | sess_config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) 127 | sess_config.gpu_options.allow_growth = True 128 | with tf.Session(config=sess_config) as sess: 129 | saver = tf.train.Saver() 130 | sess.run(tf.global_variables_initializer()) 131 | saver.restore(sess, tf.train.latest_checkpoint(model_dir)) 132 | r1i3, r1i5, r1i7, mi, *_ = eval_test(sess, model, data_loader=test_loader, mode="test") 133 | print("\n" + "\x1b[1;31m" + "Rank@1, IoU=0.3:\t{:.2f}".format(r1i3) + "\x1b[0m", flush=True) 134 | print("\x1b[1;31m" + "Rank@1, IoU=0.5:\t{:.2f}".format(r1i5) + "\x1b[0m", flush=True) 135 | print("\x1b[1;31m" + "Rank@1, IoU=0.7:\t{:.2f}".format(r1i7) + "\x1b[0m", flush=True) 136 | print("\x1b[1;31m" + "{}:\t{:.2f}".format("mean IoU".ljust(15), mi) + "\x1b[0m", flush=True) 137 | 138 | else: 139 | raise ValueError("Unknown mode {}!!!".format(configs.mode)) 140 | -------------------------------------------------------------------------------- /main_t7.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import torch 4 | import torch.nn as nn 5 | from tqdm import tqdm 6 | from model.VSLNet_t7 import VSLNet, build_optimizer_and_scheduler 7 | from util.data_util import load_video_features, save_json, load_json 8 | from util.data_gen import gen_or_load_dataset 9 | from util.data_loader_t7 import get_train_loader, get_test_loader 10 | from util.runner_utils_t7 import set_th_config, convert_length_to_mask, eval_test, filter_checkpoints, \ 11 | get_last_checkpoint 12 | 13 | parser = argparse.ArgumentParser() 14 | # data parameters 15 | parser.add_argument('--save_dir', type=str, default='datasets_t7', help='path to save processed dataset') 16 | parser.add_argument('--task', type=str, default='charades', help='target task') 17 | parser.add_argument('--fv', type=str, default='new', help='[new | org] for visual features') 18 | parser.add_argument('--max_pos_len', type=int, default=128, help='maximal position sequence length allowed') 19 | # model parameters 20 | parser.add_argument("--word_size", type=int, default=None, help="number of words") 21 | parser.add_argument("--char_size", type=int, default=None, help="number of characters") 22 | parser.add_argument("--word_dim", type=int, default=300, help="word embedding dimension") 23 | parser.add_argument("--video_feature_dim", type=int, default=1024, help="video feature input dimension") 24 | parser.add_argument("--char_dim", type=int, default=50, help="character dimension, set to 100 for activitynet") 25 | parser.add_argument("--dim", type=int, default=128, help="hidden size") 26 | parser.add_argument("--highlight_lambda", type=float, default=5.0, help="lambda for highlight region") 27 | parser.add_argument("--num_heads", type=int, default=8, help="number of heads") 28 | parser.add_argument("--drop_rate", type=float, default=0.2, help="dropout rate") 29 | parser.add_argument('--predictor', type=str, default='rnn', help='[rnn | transformer]') 30 | # training/evaluation parameters 31 | parser.add_argument("--gpu_idx", type=str, default="0", help="GPU index") 32 | parser.add_argument("--seed", type=int, default=12345, help="random seed") 33 | parser.add_argument("--mode", type=str, default="train", 
help="[train | test]") 34 | parser.add_argument("--epochs", type=int, default=100, help="number of epochs") 35 | parser.add_argument("--batch_size", type=int, default=16, help="batch size") 36 | parser.add_argument("--num_train_steps", type=int, default=None, help="number of training steps") 37 | parser.add_argument("--init_lr", type=float, default=0.0001, help="initial learning rate") 38 | parser.add_argument("--clip_norm", type=float, default=1.0, help="gradient clip norm") 39 | parser.add_argument("--warmup_proportion", type=float, default=0.0, help="warmup proportion") 40 | parser.add_argument("--extend", type=float, default=0.1, help="highlight region extension") 41 | parser.add_argument("--period", type=int, default=100, help="training loss print period") 42 | parser.add_argument('--model_dir', type=str, default='ckpt_t7', help='path to save trained model weights') 43 | parser.add_argument('--model_name', type=str, default='vslnet', help='model name') 44 | parser.add_argument('--suffix', type=str, default=None, help='set to the last `_xxx` in ckpt repo to eval results') 45 | configs = parser.parse_args() 46 | 47 | # set tensorflow configs 48 | set_th_config(configs.seed) 49 | 50 | # prepare or load dataset 51 | dataset = gen_or_load_dataset(configs) 52 | configs.char_size = dataset['n_chars'] 53 | configs.word_size = dataset['n_words'] 54 | 55 | # get train and test loader 56 | visual_features = load_video_features(os.path.join('data', 'features', configs.task, configs.fv), configs.max_pos_len) 57 | train_loader = get_train_loader(dataset=dataset['train_set'], video_features=visual_features, configs=configs) 58 | val_loader = None if dataset['val_set'] is None else get_test_loader(dataset['val_set'], visual_features, configs) 59 | test_loader = get_test_loader(dataset=dataset['test_set'], video_features=visual_features, configs=configs) 60 | configs.num_train_steps = len(train_loader) * configs.epochs 61 | num_train_batches = len(train_loader) 62 | num_val_batches = 0 if val_loader is None else len(val_loader) 63 | num_test_batches = len(test_loader) 64 | 65 | # Device configuration 66 | cuda_str = 'cuda' if configs.gpu_idx is None else 'cuda:{}'.format(configs.gpu_idx) 67 | device = torch.device(cuda_str if torch.cuda.is_available() else 'cpu') 68 | 69 | # create model dir 70 | home_dir = os.path.join(configs.model_dir, '_'.join([configs.model_name, configs.task, configs.fv, 71 | str(configs.max_pos_len), configs.predictor])) 72 | if configs.suffix is not None: 73 | home_dir = home_dir + '_' + configs.suffix 74 | model_dir = os.path.join(home_dir, "model") 75 | 76 | # train and test 77 | if configs.mode.lower() == 'train': 78 | if not os.path.exists(model_dir): 79 | os.makedirs(model_dir) 80 | eval_period = num_train_batches // 2 81 | save_json(vars(configs), os.path.join(model_dir, 'configs.json'), sort_keys=True, save_pretty=True) 82 | # build model 83 | model = VSLNet(configs=configs, word_vectors=dataset['word_vector']).to(device) 84 | optimizer, scheduler = build_optimizer_and_scheduler(model, configs=configs) 85 | # start training 86 | best_r1i7 = -1.0 87 | score_writer = open(os.path.join(model_dir, "eval_results.txt"), mode="w", encoding="utf-8") 88 | print('start training...', flush=True) 89 | global_step = 0 90 | for epoch in range(configs.epochs): 91 | model.train() 92 | for data in tqdm(train_loader, total=num_train_batches, desc='Epoch %3d / %3d' % (epoch + 1, configs.epochs)): 93 | global_step += 1 94 | _, vfeats, vfeat_lens, word_ids, char_ids, s_labels, e_labels, 
h_labels = data 95 | # prepare features 96 | vfeats, vfeat_lens = vfeats.to(device), vfeat_lens.to(device) 97 | word_ids, char_ids = word_ids.to(device), char_ids.to(device) 98 | s_labels, e_labels, h_labels = s_labels.to(device), e_labels.to(device), h_labels.to(device) 99 | # generate mask 100 | query_mask = (torch.zeros_like(word_ids) != word_ids).float().to(device) 101 | video_mask = convert_length_to_mask(vfeat_lens).to(device) 102 | # compute logits 103 | h_score, start_logits, end_logits = model(word_ids, char_ids, vfeats, video_mask, query_mask) 104 | # compute loss 105 | highlight_loss = model.compute_highlight_loss(h_score, h_labels, video_mask) 106 | loc_loss = model.compute_loss(start_logits, end_logits, s_labels, e_labels) 107 | total_loss = loc_loss + configs.highlight_lambda * highlight_loss 108 | # compute and apply gradients 109 | optimizer.zero_grad() 110 | total_loss.backward() 111 | nn.utils.clip_grad_norm_(model.parameters(), configs.clip_norm) # clip gradient 112 | optimizer.step() 113 | scheduler.step() 114 | # evaluate 115 | if global_step % eval_period == 0 or global_step % num_train_batches == 0: 116 | model.eval() 117 | r1i3, r1i5, r1i7, mi, score_str = eval_test(model=model, data_loader=test_loader, device=device, 118 | mode='test', epoch=epoch + 1, global_step=global_step) 119 | print('\nEpoch: %2d | Step: %5d | r1i3: %.2f | r1i5: %.2f | r1i7: %.2f | mIoU: %.2f' % ( 120 | epoch + 1, global_step, r1i3, r1i5, r1i7, mi), flush=True) 121 | score_writer.write(score_str) 122 | score_writer.flush() 123 | if r1i7 >= best_r1i7: 124 | best_r1i7 = r1i7 125 | torch.save(model.state_dict(), os.path.join(model_dir, '{}_{}.t7'.format(configs.model_name, 126 | global_step))) 127 | # only keep the top-3 model checkpoints 128 | filter_checkpoints(model_dir, suffix='t7', max_to_keep=3) 129 | model.train() 130 | score_writer.close() 131 | 132 | elif configs.mode.lower() == 'test': 133 | if not os.path.exists(model_dir): 134 | raise ValueError('No pre-trained weights exist') 135 | # load previous configs 136 | pre_configs = load_json(os.path.join(model_dir, "configs.json")) 137 | parser.set_defaults(**pre_configs) 138 | configs = parser.parse_args() 139 | # build model 140 | model = VSLNet(configs=configs, word_vectors=dataset['word_vector']).to(device) 141 | # get last checkpoint file 142 | filename = get_last_checkpoint(model_dir, suffix='t7') 143 | model.load_state_dict(torch.load(filename)) 144 | model.eval() 145 | r1i3, r1i5, r1i7, mi, _ = eval_test(model=model, data_loader=test_loader, device=device, mode='test') 146 | print("\n" + "\x1b[1;31m" + "Rank@1, IoU=0.3:\t{:.2f}".format(r1i3) + "\x1b[0m", flush=True) 147 | print("\x1b[1;31m" + "Rank@1, IoU=0.5:\t{:.2f}".format(r1i5) + "\x1b[0m", flush=True) 148 | print("\x1b[1;31m" + "Rank@1, IoU=0.7:\t{:.2f}".format(r1i7) + "\x1b[0m", flush=True) 149 | print("\x1b[1;31m" + "{}:\t{:.2f}".format("mean IoU".ljust(15), mi) + "\x1b[0m", flush=True) 150 | -------------------------------------------------------------------------------- /util/data_gen.py: -------------------------------------------------------------------------------- 1 | import os 2 | import codecs 3 | import numpy as np 4 | from tqdm import tqdm 5 | from collections import Counter 6 | from nltk.tokenize import word_tokenize 7 | from util.data_util import load_json, load_lines, load_pickle, save_pickle, time_to_index 8 | 9 | PAD, UNK = "", "" 10 | 11 | 12 | class CharadesProcessor: 13 | def __init__(self): 14 | super(CharadesProcessor, self).__init__() 15 | 
self.idx_counter = 0 16 | 17 | def reset_idx_counter(self): 18 | self.idx_counter = 0 19 | 20 | def process_data(self, data, charades, scope): 21 | results = [] 22 | for line in tqdm(data, total=len(data), desc='process charades-sta {}'.format(scope)): 23 | line = line.lstrip().rstrip() 24 | if len(line) == 0: 25 | continue 26 | video_info, sentence = line.split('##') 27 | vid, start_time, end_time = video_info.split(' ') 28 | duration = float(charades[vid]['duration']) 29 | start_time = max(0.0, float(start_time)) 30 | end_time = min(float(end_time), duration) 31 | words = word_tokenize(sentence.strip().lower(), language="english") 32 | record = {'sample_id': self.idx_counter, 'vid': str(vid), 's_time': start_time, 'e_time': end_time, 33 | 'duration': duration, 'words': words} 34 | results.append(record) 35 | self.idx_counter += 1 36 | return results 37 | 38 | def convert(self, data_dir): 39 | self.reset_idx_counter() 40 | if not os.path.exists(data_dir): 41 | raise ValueError('data dir {} does not exist'.format(data_dir)) 42 | # load raw data 43 | charades = load_json(os.path.join(data_dir, 'charades.json')) 44 | train_data = load_lines(os.path.join(data_dir, 'charades_sta_train.txt')) 45 | test_data = load_lines(os.path.join(data_dir, 'charades_sta_test.txt')) 46 | # process data 47 | train_set = self.process_data(train_data, charades, scope='train') 48 | test_set = self.process_data(test_data, charades, scope='test') 49 | return train_set, None, test_set # train/val/test 50 | 51 | 52 | class ActivityNetProcessor: 53 | def __init__(self): 54 | super(ActivityNetProcessor, self).__init__() 55 | self.idx_counter = 0 56 | 57 | def reset_idx_counter(self): 58 | self.idx_counter = 0 59 | 60 | def process_data(self, data, scope): 61 | results = [] 62 | for vid, data_item in tqdm(data.items(), total=len(data), desc='process activitynet {}'.format(scope)): 63 | duration = float(data_item['duration']) 64 | for timestamp, sentence in zip(data_item["timestamps"], data_item["sentences"]): 65 | start_time = max(0.0, float(timestamp[0])) 66 | end_time = min(float(timestamp[1]), duration) 67 | words = word_tokenize(sentence.strip().lower(), language="english") 68 | record = {'sample_id': self.idx_counter, 'vid': str(vid), 's_time': start_time, 'e_time': end_time, 69 | 'duration': duration, 'words': words} 70 | results.append(record) 71 | self.idx_counter += 1 72 | return results 73 | 74 | def convert(self, data_dir): 75 | self.reset_idx_counter() 76 | if not os.path.exists(data_dir): 77 | raise ValueError('data dir {} does not exist'.format(data_dir)) 78 | # load raw data 79 | train_data = load_json(os.path.join(data_dir, 'train.json')) 80 | val_data = load_json(os.path.join(data_dir, 'val_2.json')) 81 | test_data = load_json(os.path.join(data_dir, 'val_1.json')) 82 | # process data 83 | train_set = self.process_data(train_data, scope='train') 84 | val_set = self.process_data(val_data, scope='val') 85 | test_set = self.process_data(test_data, scope='test') 86 | return train_set, val_set, test_set 87 | 88 | 89 | class TACoSProcessor: 90 | def __init__(self): 91 | super(TACoSProcessor, self).__init__() 92 | self.idx_counter = 0 93 | 94 | def reset_idx_counter(self): 95 | self.idx_counter = 0 96 | 97 | def process_data_tan(self, data, scope): 98 | results = [] 99 | for vid, data_item in tqdm(data.items(), total=len(data), desc='process tacos {}'.format(scope)): 100 | if vid.endswith('.avi'): 101 | vid = vid[0:-4] 102 | fps = float(data_item['fps']) 103 | duration = float(data_item['num_frames']) / fps 104 
| for timestamp, sentence in zip(data_item['timestamps'], data_item['sentences']): 105 | start_time = max(0.0, float(timestamp[0]) / fps) 106 | end_time = min(float(timestamp[1]) / fps, duration) 107 | words = word_tokenize(sentence.strip().lower(), language="english") 108 | record = {'sample_id': self.idx_counter, 'vid': str(vid), 's_time': start_time, 'e_time': end_time, 109 | 'duration': duration, 'words': words} 110 | results.append(record) 111 | self.idx_counter += 1 112 | return results 113 | 114 | def convert(self, data_dir): 115 | self.reset_idx_counter() 116 | if not os.path.exists(data_dir): 117 | raise ValueError('data dir {} does not exist'.format(data_dir)) 118 | # load raw data 119 | train_data = load_json(os.path.join(data_dir, 'train.json')) 120 | val_data = load_json(os.path.join(data_dir, 'val.json')) 121 | test_data = load_json(os.path.join(data_dir, 'test.json')) 122 | # process data 123 | train_set = self.process_data_tan(train_data, scope='train') 124 | val_set = self.process_data_tan(val_data, scope='val') 125 | test_set = self.process_data_tan(test_data, scope='test') 126 | return train_set, val_set, test_set 127 | 128 | 129 | def load_glove(glove_path): 130 | vocab = list() 131 | with codecs.open(glove_path, mode="r", encoding="utf-8") as f: 132 | for line in tqdm(f, total=2196018, desc="load glove vocabulary"): 133 | line = line.lstrip().rstrip().split(" ") 134 | if len(line) == 2 or len(line) != 301: 135 | continue 136 | word = line[0] 137 | vocab.append(word) 138 | return set(vocab) 139 | 140 | 141 | def filter_glove_embedding(word_dict, glove_path): 142 | vectors = np.zeros(shape=[len(word_dict), 300], dtype=np.float32) 143 | with codecs.open(glove_path, mode="r", encoding="utf-8") as f: 144 | for line in tqdm(f, total=2196018, desc="load glove embeddings"): 145 | line = line.lstrip().rstrip().split(" ") 146 | if len(line) == 2 or len(line) != 301: 147 | continue 148 | word = line[0] 149 | if word in word_dict: 150 | vector = [float(x) for x in line[1:]] 151 | word_index = word_dict[word] 152 | vectors[word_index] = np.asarray(vector) 153 | return np.asarray(vectors) 154 | 155 | 156 | def vocab_emb_gen(datasets, emb_path): 157 | # generate word dict and vectors 158 | emb_vocab = load_glove(emb_path) 159 | word_counter, char_counter = Counter(), Counter() 160 | for data in datasets: 161 | for record in data: 162 | for word in record['words']: 163 | word_counter[word] += 1 164 | for char in list(word): 165 | char_counter[char] += 1 166 | word_vocab = list() 167 | for word, _ in word_counter.most_common(): 168 | if word in emb_vocab: 169 | word_vocab.append(word) 170 | tmp_word_dict = dict([(word, index) for index, word in enumerate(word_vocab)]) 171 | vectors = filter_glove_embedding(tmp_word_dict, emb_path) 172 | word_vocab = [PAD, UNK] + word_vocab 173 | word_dict = dict([(word, idx) for idx, word in enumerate(word_vocab)]) 174 | # generate character dict 175 | char_vocab = [PAD, UNK] + [char for char, count in char_counter.most_common() if count >= 5] 176 | char_dict = dict([(char, idx) for idx, char in enumerate(char_vocab)]) 177 | return word_dict, char_dict, vectors 178 | 179 | 180 | def dataset_gen(data, vfeat_lens, word_dict, char_dict, max_pos_len, scope): 181 | dataset = list() 182 | for record in tqdm(data, total=len(data), desc='process {} data'.format(scope)): 183 | vid = record['vid'] 184 | if vid not in vfeat_lens: 185 | continue 186 | s_ind, e_ind, _ = time_to_index(record['s_time'], record['e_time'], vfeat_lens[vid], record['duration']) 187 | 
word_ids, char_ids = [], [] 188 | for word in record['words'][0:max_pos_len]: 189 | word_id = word_dict[word] if word in word_dict else word_dict[UNK] 190 | char_id = [char_dict[char] if char in char_dict else char_dict[UNK] for char in word] 191 | word_ids.append(word_id) 192 | char_ids.append(char_id) 193 | result = {'sample_id': record['sample_id'], 'vid': record['vid'], 's_time': record['s_time'], 194 | 'e_time': record['e_time'], 'duration': record['duration'], 'words': record['words'], 195 | 's_ind': int(s_ind), 'e_ind': int(e_ind), 'v_len': vfeat_lens[vid], 'w_ids': word_ids, 196 | 'c_ids': char_ids} 197 | dataset.append(result) 198 | return dataset 199 | 200 | 201 | def gen_or_load_dataset(configs): 202 | if not os.path.exists(configs.save_dir): 203 | os.makedirs(configs.save_dir) 204 | data_dir = os.path.join('data', 'dataset', configs.task) 205 | feature_dir = os.path.join('data', 'features', configs.task, configs.fv) 206 | if configs.suffix is None: 207 | save_path = os.path.join(configs.save_dir, '_'.join([configs.task, configs.fv, str(configs.max_pos_len)]) + 208 | '.pkl') 209 | else: 210 | save_path = os.path.join(configs.save_dir, '_'.join([configs.task, configs.fv, str(configs.max_pos_len), 211 | configs.suffix]) + '.pkl') 212 | if os.path.exists(save_path): 213 | dataset = load_pickle(save_path) 214 | return dataset 215 | feat_len_path = os.path.join(feature_dir, 'feature_shapes.json') 216 | emb_path = os.path.join('data', 'features', 'glove.840B.300d.txt') 217 | # load video feature length 218 | vfeat_lens = load_json(feat_len_path) 219 | for vid, vfeat_len in vfeat_lens.items(): 220 | vfeat_lens[vid] = min(configs.max_pos_len, vfeat_len) 221 | # load data 222 | if configs.task == 'charades': 223 | processor = CharadesProcessor() 224 | elif configs.task == 'activitynet': 225 | processor = ActivityNetProcessor() 226 | elif configs.task == 'tacos': 227 | processor = TACoSProcessor() 228 | else: 229 | raise ValueError('Unknown task {}!!!'.format(configs.task)) 230 | train_data, val_data, test_data = processor.convert(data_dir) 231 | # generate dataset 232 | data_list = [train_data, test_data] if val_data is None else [train_data, val_data, test_data] 233 | word_dict, char_dict, vectors = vocab_emb_gen(data_list, emb_path) 234 | train_set = dataset_gen(train_data, vfeat_lens, word_dict, char_dict, configs.max_pos_len, 'train') 235 | val_set = None if val_data is None else dataset_gen(val_data, vfeat_lens, word_dict, char_dict, 236 | configs.max_pos_len, 'val') 237 | test_set = dataset_gen(test_data, vfeat_lens, word_dict, char_dict, configs.max_pos_len, 'test') 238 | # save dataset 239 | n_val = 0 if val_set is None else len(val_set) 240 | dataset = {'train_set': train_set, 'val_set': val_set, 'test_set': test_set, 'word_dict': word_dict, 241 | 'char_dict': char_dict, 'word_vector': vectors, 'n_train': len(train_set), 'n_val': n_val, 242 | 'n_test': len(test_set), 'n_words': len(word_dict), 'n_chars': len(char_dict)} 243 | save_pickle(dataset, save_path) 244 | return dataset 245 | -------------------------------------------------------------------------------- /prepare/feature_extractor.py: -------------------------------------------------------------------------------- 1 | """ 2 | Downloaded from https://github.com/piergiaj/pytorch-i3d/blob/master/pytorch_i3d.py 3 | Minor modification are applied to fit our requirements 4 | """ 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | 9 | 10 | class MaxPool3dSamePadding(nn.MaxPool3d): 11 | 12 | def 
compute_pad(self, dim, s): 13 | if s % self.stride[dim] == 0: 14 | return max(self.kernel_size[dim] - self.stride[dim], 0) 15 | else: 16 | return max(self.kernel_size[dim] - (s % self.stride[dim]), 0) 17 | 18 | def forward(self, x): 19 | # compute 'same' padding 20 | (batch, channel, t, h, w) = x.size() 21 | pad_t = self.compute_pad(0, t) 22 | pad_h = self.compute_pad(1, h) 23 | pad_w = self.compute_pad(2, w) 24 | 25 | pad_t_f = pad_t // 2 26 | pad_t_b = pad_t - pad_t_f 27 | pad_h_f = pad_h // 2 28 | pad_h_b = pad_h - pad_h_f 29 | pad_w_f = pad_w // 2 30 | pad_w_b = pad_w - pad_w_f 31 | 32 | pad = [pad_w_f, pad_w_b, pad_h_f, pad_h_b, pad_t_f, pad_t_b] 33 | x = F.pad(x, pad) 34 | return super(MaxPool3dSamePadding, self).forward(x) 35 | 36 | 37 | class Unit3D(nn.Module): 38 | 39 | def __init__(self, in_channels, 40 | output_channels, 41 | kernel_shape=(1, 1, 1), 42 | stride=(1, 1, 1), 43 | padding=0, 44 | activation_fn=None, 45 | use_batch_norm=True, 46 | use_bias=False, 47 | name='unit_3d'): 48 | 49 | """Initializes Unit3D module.""" 50 | super(Unit3D, self).__init__() 51 | 52 | self._output_channels = output_channels 53 | self._kernel_shape = kernel_shape 54 | self._stride = stride 55 | self._use_batch_norm = use_batch_norm 56 | self._activation_fn = activation_fn 57 | self._use_bias = use_bias 58 | self.name = name 59 | self.padding = padding 60 | 61 | self.conv3d = nn.Conv3d(in_channels=in_channels, 62 | out_channels=self._output_channels, 63 | kernel_size=self._kernel_shape, 64 | stride=self._stride, 65 | padding=0, 66 | # we always want padding to be 0 here. We will dynamically pad based on input size 67 | # in forward function 68 | bias=self._use_bias) 69 | 70 | if self._use_batch_norm: 71 | self.bn = nn.BatchNorm3d(self._output_channels, eps=0.001, momentum=0.01) 72 | 73 | def compute_pad(self, dim, s): 74 | if s % self._stride[dim] == 0: 75 | return max(self._kernel_shape[dim] - self._stride[dim], 0) 76 | else: 77 | return max(self._kernel_shape[dim] - (s % self._stride[dim]), 0) 78 | 79 | def forward(self, x): 80 | # compute 'same' padding 81 | (batch, channel, t, h, w) = x.size() 82 | pad_t = self.compute_pad(0, t) 83 | pad_h = self.compute_pad(1, h) 84 | pad_w = self.compute_pad(2, w) 85 | 86 | pad_t_f = pad_t // 2 87 | pad_t_b = pad_t - pad_t_f 88 | pad_h_f = pad_h // 2 89 | pad_h_b = pad_h - pad_h_f 90 | pad_w_f = pad_w // 2 91 | pad_w_b = pad_w - pad_w_f 92 | 93 | pad = [pad_w_f, pad_w_b, pad_h_f, pad_h_b, pad_t_f, pad_t_b] 94 | x = F.pad(x, pad) 95 | 96 | x = self.conv3d(x) 97 | if self._use_batch_norm: 98 | x = self.bn(x) 99 | if self._activation_fn is not None: 100 | x = self._activation_fn(x) 101 | return x 102 | 103 | 104 | class InceptionModule(nn.Module): 105 | def __init__(self, in_channels, out_channels, name): 106 | super(InceptionModule, self).__init__() 107 | 108 | self.b0 = Unit3D(in_channels=in_channels, output_channels=out_channels[0], kernel_shape=[1, 1, 1], padding=0, 109 | activation_fn=F.relu, name=name + '/Branch_0/Conv3d_0a_1x1') 110 | self.b1a = Unit3D(in_channels=in_channels, output_channels=out_channels[1], kernel_shape=[1, 1, 1], padding=0, 111 | activation_fn=F.relu, name=name + '/Branch_1/Conv3d_0a_1x1') 112 | self.b1b = Unit3D(in_channels=out_channels[1], output_channels=out_channels[2], kernel_shape=[3, 3, 3], 113 | activation_fn=F.relu, name=name + '/Branch_1/Conv3d_0b_3x3') 114 | self.b2a = Unit3D(in_channels=in_channels, output_channels=out_channels[3], kernel_shape=[1, 1, 1], padding=0, 115 | activation_fn=F.relu, name=name + 
'/Branch_2/Conv3d_0a_1x1') 116 | self.b2b = Unit3D(in_channels=out_channels[3], output_channels=out_channels[4], kernel_shape=[3, 3, 3], 117 | activation_fn=F.relu, name=name + '/Branch_2/Conv3d_0b_3x3') 118 | self.b3a = MaxPool3dSamePadding(kernel_size=[3, 3, 3], 119 | stride=(1, 1, 1), padding=0) 120 | self.b3b = Unit3D(in_channels=in_channels, output_channels=out_channels[5], kernel_shape=[1, 1, 1], padding=0, 121 | activation_fn=F.relu, name=name + '/Branch_3/Conv3d_0b_1x1') 122 | self.name = name 123 | 124 | def forward(self, x): 125 | b0 = self.b0(x) 126 | b1 = self.b1b(self.b1a(x)) 127 | b2 = self.b2b(self.b2a(x)) 128 | b3 = self.b3b(self.b3a(x)) 129 | return torch.cat([b0, b1, b2, b3], dim=1) 130 | 131 | 132 | class InceptionI3d(nn.Module): 133 | """Inception-v1 I3D architecture. 134 | The model is introduced in: 135 | Quo Vadis, Action Recognition? A New Model and the Kinetics Dataset 136 | Joao Carreira, Andrew Zisserman 137 | https://arxiv.org/pdf/1705.07750v1.pdf. 138 | See also the Inception architecture, introduced in: 139 | Going deeper with convolutions 140 | Christian Szegedy, Wei Liu, Yangqing Jia, Pierre Sermanet, Scott Reed, 141 | Dragomir Anguelov, Dumitru Erhan, Vincent Vanhoucke, Andrew Rabinovich. 142 | http://arxiv.org/pdf/1409.4842v1.pdf. 143 | """ 144 | 145 | # Endpoints of the model in order. During construction, all the endpoints up 146 | # to a designated `final_endpoint` are returned in a dictionary as the 147 | # second return value. 148 | VALID_ENDPOINTS = ( 149 | 'Conv3d_1a_7x7', 150 | 'MaxPool3d_2a_3x3', 151 | 'Conv3d_2b_1x1', 152 | 'Conv3d_2c_3x3', 153 | 'MaxPool3d_3a_3x3', 154 | 'Mixed_3b', 155 | 'Mixed_3c', 156 | 'MaxPool3d_4a_3x3', 157 | 'Mixed_4b', 158 | 'Mixed_4c', 159 | 'Mixed_4d', 160 | 'Mixed_4e', 161 | 'Mixed_4f', 162 | 'MaxPool3d_5a_2x2', 163 | 'Mixed_5b', 164 | 'Mixed_5c', 165 | 'Logits', 166 | 'Predictions', 167 | ) 168 | 169 | def __init__(self, num_classes=400, spatial_squeeze=True, 170 | final_endpoint='Logits', name='inception_i3d', in_channels=3, dropout_keep_prob=0.5): 171 | """Initializes I3D model instance. 172 | Args: 173 | num_classes: The number of outputs in the logit layer (default 400, which 174 | matches the Kinetics dataset). 175 | spatial_squeeze: Whether to squeeze the spatial dimensions for the logits 176 | before returning (default True). 177 | final_endpoint: The model contains many possible endpoints. 178 | `final_endpoint` specifies the last endpoint for the model to be built 179 | up to. In addition to the output at `final_endpoint`, all the outputs 180 | at endpoints up to `final_endpoint` will also be returned, in a 181 | dictionary. `final_endpoint` must be one of 182 | InceptionI3d.VALID_ENDPOINTS (default 'Logits'). 183 | name: A string (optional). The name of this module. 184 | Raises: 185 | ValueError: if `final_endpoint` is not recognized. 
186 | """ 187 | 188 | if final_endpoint not in self.VALID_ENDPOINTS: 189 | raise ValueError('Unknown final endpoint %s' % final_endpoint) 190 | 191 | super(InceptionI3d, self).__init__() 192 | self._num_classes = num_classes 193 | self._spatial_squeeze = spatial_squeeze 194 | self._final_endpoint = final_endpoint 195 | self.logits = None 196 | 197 | if self._final_endpoint not in self.VALID_ENDPOINTS: 198 | raise ValueError('Unknown final endpoint %s' % self._final_endpoint) 199 | 200 | self.end_points = {} 201 | end_point = 'Conv3d_1a_7x7' 202 | self.end_points[end_point] = Unit3D(in_channels=in_channels, output_channels=64, kernel_shape=[7, 7, 7], 203 | activation_fn=F.relu, stride=(2, 2, 2), padding=3, # padding=(3, 3, 3), 204 | name=name + end_point) 205 | if self._final_endpoint == end_point: 206 | return 207 | 208 | end_point = 'MaxPool3d_2a_3x3' 209 | self.end_points[end_point] = MaxPool3dSamePadding(kernel_size=[1, 3, 3], stride=(1, 2, 2), 210 | padding=0) 211 | if self._final_endpoint == end_point: 212 | return 213 | 214 | end_point = 'Conv3d_2b_1x1' 215 | self.end_points[end_point] = Unit3D(in_channels=64, output_channels=64, kernel_shape=[1, 1, 1], padding=0, 216 | activation_fn=F.relu, name=name + end_point) 217 | if self._final_endpoint == end_point: 218 | return 219 | 220 | end_point = 'Conv3d_2c_3x3' 221 | self.end_points[end_point] = Unit3D(in_channels=64, output_channels=192, kernel_shape=[3, 3, 3], padding=1, 222 | activation_fn=F.relu, name=name + end_point) 223 | if self._final_endpoint == end_point: 224 | return 225 | 226 | end_point = 'MaxPool3d_3a_3x3' 227 | self.end_points[end_point] = MaxPool3dSamePadding(kernel_size=[1, 3, 3], stride=(1, 2, 2), 228 | padding=0) 229 | if self._final_endpoint == end_point: 230 | return 231 | 232 | end_point = 'Mixed_3b' 233 | self.end_points[end_point] = InceptionModule(192, [64, 96, 128, 16, 32, 32], name + end_point) 234 | if self._final_endpoint == end_point: 235 | return 236 | 237 | end_point = 'Mixed_3c' 238 | self.end_points[end_point] = InceptionModule(256, [128, 128, 192, 32, 96, 64], name + end_point) 239 | if self._final_endpoint == end_point: 240 | return 241 | 242 | end_point = 'MaxPool3d_4a_3x3' 243 | self.end_points[end_point] = MaxPool3dSamePadding(kernel_size=[3, 3, 3], stride=(2, 2, 2), 244 | padding=0) 245 | if self._final_endpoint == end_point: 246 | return 247 | 248 | end_point = 'Mixed_4b' 249 | self.end_points[end_point] = InceptionModule(128 + 192 + 96 + 64, [192, 96, 208, 16, 48, 64], name + end_point) 250 | if self._final_endpoint == end_point: 251 | return 252 | 253 | end_point = 'Mixed_4c' 254 | self.end_points[end_point] = InceptionModule(192 + 208 + 48 + 64, [160, 112, 224, 24, 64, 64], name + end_point) 255 | if self._final_endpoint == end_point: 256 | return 257 | 258 | end_point = 'Mixed_4d' 259 | self.end_points[end_point] = InceptionModule(160 + 224 + 64 + 64, [128, 128, 256, 24, 64, 64], name + end_point) 260 | if self._final_endpoint == end_point: 261 | return 262 | 263 | end_point = 'Mixed_4e' 264 | self.end_points[end_point] = InceptionModule(128 + 256 + 64 + 64, [112, 144, 288, 32, 64, 64], name + end_point) 265 | if self._final_endpoint == end_point: 266 | return 267 | 268 | end_point = 'Mixed_4f' 269 | self.end_points[end_point] = InceptionModule(112 + 288 + 64 + 64, [256, 160, 320, 32, 128, 128], 270 | name + end_point) 271 | if self._final_endpoint == end_point: 272 | return 273 | 274 | end_point = 'MaxPool3d_5a_2x2' 275 | self.end_points[end_point] = MaxPool3dSamePadding(kernel_size=[2, 2, 
2], stride=(2, 2, 2), 276 | padding=0) 277 | if self._final_endpoint == end_point: 278 | return 279 | 280 | end_point = 'Mixed_5b' 281 | self.end_points[end_point] = InceptionModule(256 + 320 + 128 + 128, [256, 160, 320, 32, 128, 128], 282 | name + end_point) 283 | if self._final_endpoint == end_point: 284 | return 285 | 286 | end_point = 'Mixed_5c' 287 | self.end_points[end_point] = InceptionModule(256 + 320 + 128 + 128, [384, 192, 384, 48, 128, 128], 288 | name + end_point) 289 | if self._final_endpoint == end_point: 290 | return 291 | 292 | # end_point = 'Logits' 293 | self.avg_pool = nn.AvgPool3d(kernel_size=[2, 7, 7], stride=(1, 1, 1)) 294 | self.dropout = nn.Dropout(dropout_keep_prob) 295 | self.logits = Unit3D(in_channels=384 + 384 + 128 + 128, output_channels=self._num_classes, 296 | kernel_shape=[1, 1, 1], 297 | padding=0, 298 | use_batch_norm=False, 299 | use_bias=True, 300 | name='logits') 301 | 302 | self.build() 303 | 304 | def replace_logits(self, num_classes): 305 | self._num_classes = num_classes 306 | self.logits = Unit3D(in_channels=384 + 384 + 128 + 128, output_channels=self._num_classes, 307 | kernel_shape=[1, 1, 1], 308 | padding=0, 309 | use_batch_norm=False, 310 | use_bias=True, 311 | name='logits') 312 | 313 | def build(self): 314 | for k in self.end_points.keys(): 315 | self.add_module(k, self.end_points[k]) 316 | 317 | def forward(self, x): 318 | for end_point in self.VALID_ENDPOINTS: 319 | if end_point in self.end_points: 320 | x = self._modules[end_point](x) # use _modules to work with data parallel 321 | x = self.avg_pool(x) 322 | logits = self.logits(self.dropout(x)) 323 | if self._spatial_squeeze: 324 | logits = x.squeeze(3).squeeze(3) 325 | # logits is batch X time X classes, which is what we want to work with 326 | return logits 327 | 328 | def extract_features(self, x): 329 | for end_point in self.VALID_ENDPOINTS: 330 | if end_point in self.end_points: 331 | x = self._modules[end_point](x) 332 | # x = [batch_size, channels, time, height, width] 333 | x = self.avg_pool(x) # 384 + 384 + 128 + 128 = 1024 334 | x = x.squeeze(0).permute(1, 2, 3, 0) # x = [time, height, width, channels] 335 | x = x.squeeze(1).squeeze(1) # x = [time, channels] 336 | return x 337 | -------------------------------------------------------------------------------- /model/layers.py: -------------------------------------------------------------------------------- 1 | import math 2 | import tensorflow as tf 3 | from model.ops import get_shape_list, mask_logits, trilinear_attention, regularizer 4 | 5 | if tf.__version__.startswith('2'): 6 | tf = tf.compat.v1 7 | tf.disable_v2_behavior() 8 | tf.disable_eager_execution() 9 | 10 | 11 | def layer_norm(inputs, epsilon=1e-6, reuse=None, name='layer_norm'): 12 | """Layer normalize the tensor x, averaging over the last dimension.""" 13 | with tf.variable_scope(name, default_name="layer_norm", values=[inputs], reuse=reuse): 14 | dim = get_shape_list(inputs)[-1] 15 | scale = tf.get_variable("layer_norm_scale", [dim], initializer=tf.ones_initializer(), regularizer=regularizer) 16 | bias = tf.get_variable("layer_norm_bias", [dim], initializer=tf.zeros_initializer(), regularizer=regularizer) 17 | mean = tf.reduce_mean(inputs, axis=[-1], keep_dims=True) 18 | variance = tf.reduce_mean(tf.square(inputs - mean), axis=[-1], keep_dims=True) 19 | norm_inputs = (inputs - mean) * tf.rsqrt(variance + epsilon) 20 | result = norm_inputs * scale + bias 21 | return result 22 | 23 | 24 | def word_embedding_lookup(word_ids, dim, vectors, drop_rate=0.0, 
finetune=False, reuse=None, name='word_embeddings'): 25 | with tf.variable_scope(name, reuse=reuse): 26 | table = tf.Variable(vectors, name='word_table', dtype=tf.float32, trainable=finetune) 27 | unk = tf.get_variable(name='unk', shape=[1, dim], dtype=tf.float32, trainable=True) 28 | zero = tf.zeros(shape=[1, dim], dtype=tf.float32) 29 | word_table = tf.concat([zero, unk, table], axis=0) 30 | word_emb = tf.nn.embedding_lookup(word_table, word_ids) 31 | word_emb = tf.nn.dropout(word_emb, rate=drop_rate) 32 | return word_emb 33 | 34 | 35 | def char_embedding_lookup(char_ids, char_size, dim, kernels, filters, drop_rate=0.0, activation=tf.nn.relu, 36 | padding='VALID', reuse=None, name='char_embeddings'): 37 | with tf.variable_scope(name, reuse=reuse): 38 | # char embeddings lookup 39 | table = tf.get_variable(name='char_table', shape=[char_size - 1, dim], dtype=tf.float32, trainable=True) 40 | zero = tf.zeros(shape=[1, dim], dtype=tf.float32) 41 | char_table = tf.concat([zero, table], axis=0) 42 | char_emb = tf.nn.embedding_lookup(char_table, char_ids) 43 | char_emb = tf.nn.dropout(char_emb, rate=drop_rate) 44 | # char-level cnn 45 | outputs = [] 46 | for i, (kernel, channel) in enumerate(zip(kernels, filters)): 47 | weight = tf.get_variable('filter_%d' % i, shape=[1, kernel, dim, channel], dtype=tf.float32, 48 | regularizer=regularizer) 49 | bias = tf.get_variable('bias_%d' % i, shape=[channel], dtype=tf.float32, initializer=tf.zeros_initializer(), 50 | regularizer=regularizer) 51 | output = tf.nn.conv2d(char_emb, weight, strides=[1, 1, 1, 1], padding=padding, name='conv_%d' % i) 52 | output = tf.nn.bias_add(output, bias=bias) 53 | output = tf.reduce_max(activation(output), axis=2) 54 | outputs.append(output) 55 | outputs = tf.concat(values=outputs, axis=-1) 56 | return outputs 57 | 58 | 59 | def conv1d(inputs, dim, kernel_size=1, use_bias=False, activation=None, padding='VALID', reuse=None, name='conv1d'): 60 | with tf.variable_scope(name, reuse=reuse): 61 | shapes = get_shape_list(inputs) 62 | kernel = tf.get_variable(name='kernel', shape=[kernel_size, shapes[-1], dim], dtype=tf.float32, 63 | regularizer=regularizer) 64 | outputs = tf.nn.conv1d(inputs, filters=kernel, stride=1, padding=padding) 65 | if use_bias: 66 | bias = tf.get_variable(name='bias', shape=[1, 1, dim], dtype=tf.float32, initializer=tf.zeros_initializer(), 67 | regularizer=regularizer) 68 | outputs += bias 69 | if activation is not None: 70 | return activation(outputs) 71 | else: 72 | return outputs 73 | 74 | 75 | def depthwise_separable_conv(inputs, kernel_size, dim, use_bias=True, reuse=None, activation=tf.nn.relu, 76 | name='depthwise_separable_conv'): 77 | with tf.variable_scope(name, reuse=reuse): 78 | shapes = get_shape_list(inputs) 79 | depthwise_filter = tf.get_variable(name='depthwise_filter', dtype=tf.float32, regularizer=regularizer, 80 | shape=[kernel_size[0], kernel_size[1], shapes[-1], 1]) 81 | pointwise_filter = tf.get_variable(name='pointwise_filter', shape=[1, 1, shapes[-1], dim], dtype=tf.float32, 82 | regularizer=regularizer) 83 | outputs = tf.nn.separable_conv2d(inputs, depthwise_filter, pointwise_filter, strides=[1, 1, 1, 1], 84 | padding='SAME') 85 | if use_bias: 86 | b = tf.get_variable('bias', outputs.shape[-1], initializer=tf.zeros_initializer(), regularizer=regularizer) 87 | outputs += b 88 | outputs = activation(outputs) 89 | return outputs 90 | 91 | 92 | def add_positional_embedding(inputs, max_position_length, reuse=None, name='positional_embedding'): 93 | with tf.variable_scope(name, 
reuse=reuse): 94 | batch_size, seq_length, dim = get_shape_list(inputs) 95 | assert_op = tf.assert_less_equal(seq_length, max_position_length) 96 | with tf.control_dependencies([assert_op]): 97 | full_position_embeddings = tf.get_variable(name='position_embeddings', shape=[max_position_length, dim], 98 | dtype=tf.float32) 99 | position_embeddings = tf.slice(full_position_embeddings, [0, 0], [seq_length, -1]) 100 | num_dims = len(inputs.shape.as_list()) 101 | position_broadcast_shape = [] 102 | for _ in range(num_dims - 2): 103 | position_broadcast_shape.append(1) 104 | position_broadcast_shape.extend([seq_length, dim]) 105 | position_embeddings = tf.reshape(position_embeddings, shape=position_broadcast_shape) 106 | outputs = inputs + position_embeddings 107 | return outputs 108 | 109 | 110 | def conv_block(inputs, kernel_size, dim, num_layers, drop_rate=0.0, reuse=None, name='conv_block'): 111 | with tf.variable_scope(name, reuse=reuse): 112 | outputs = tf.expand_dims(inputs, axis=2) 113 | for layer_idx in range(num_layers): 114 | residual = outputs 115 | outputs = layer_norm(outputs, reuse=reuse, name='layer_norm_%d' % layer_idx) 116 | outputs = depthwise_separable_conv(outputs, kernel_size=(kernel_size, 1), dim=dim, use_bias=True, 117 | activation=tf.nn.relu, name='depthwise_conv_layers_%d' % layer_idx, 118 | reuse=reuse) 119 | outputs = tf.nn.dropout(outputs, rate=drop_rate) + residual 120 | return tf.squeeze(outputs, 2) 121 | 122 | 123 | def multihead_attention(inputs, dim, num_heads, mask=None, drop_rate=0.0, reuse=None, name='multihead_attention'): 124 | with tf.variable_scope(name, reuse=reuse): 125 | if dim % num_heads != 0: 126 | raise ValueError('The hidden size (%d) is not a multiple of the attention heads (%d)' % (dim, num_heads)) 127 | batch_size, seq_length, _ = get_shape_list(inputs) 128 | head_size = dim // num_heads 129 | 130 | def transpose_for_scores(input_tensor, batch_size_, seq_length_, num_heads_, head_size_): 131 | output_tensor = tf.reshape(input_tensor, shape=[batch_size_, seq_length_, num_heads_, head_size_]) 132 | output_tensor = tf.transpose(output_tensor, perm=[0, 2, 1, 3]) 133 | return output_tensor 134 | 135 | # projection 136 | query = conv1d(inputs, dim=dim, use_bias=True, reuse=reuse, name='query') 137 | key = conv1d(inputs, dim=dim, use_bias=True, reuse=reuse, name='key') 138 | value = conv1d(inputs, dim=dim, use_bias=True, reuse=reuse, name='value') 139 | # reshape & transpose: (batch_size, seq_length, dim) --> (batch_size, num_heads, seq_length, head_size) 140 | query = transpose_for_scores(query, batch_size, seq_length, num_heads, head_size) 141 | key = transpose_for_scores(key, batch_size, seq_length, num_heads, head_size) 142 | value = transpose_for_scores(value, batch_size, seq_length, num_heads, head_size) 143 | # compute attention score 144 | query = tf.multiply(query, 1.0 / math.sqrt(float(head_size))) 145 | attention_score = tf.matmul(query, key, transpose_b=True) 146 | if mask is not None: 147 | shapes = get_shape_list(attention_score) 148 | mask = tf.cast(tf.reshape(mask, shape=[shapes[0], 1, 1, shapes[-1]]), dtype=tf.float32) 149 | attention_score += (1.0 - mask) * -1e30 150 | attention_score = tf.nn.softmax(attention_score) # shape = (batch_size, num_heads, seq_length, seq_length) 151 | attention_score = tf.nn.dropout(attention_score, rate=drop_rate) 152 | # compute value 153 | value = tf.matmul(attention_score, value) # shape = (batch_size, num_heads, seq_length, head_size) 154 | value = tf.transpose(value, perm=[0, 2, 1, 3]) 155 | value = 
tf.reshape(value, shape=[batch_size, seq_length, num_heads * head_size]) 156 | return value 157 | 158 | 159 | def multihead_attention_block(inputs, dim, num_heads, mask=None, use_bias=True, drop_rate=0.0, reuse=None, 160 | name='multihead_attention_block'): 161 | with tf.variable_scope(name, reuse=reuse): 162 | # multihead attention layer 163 | outputs = layer_norm(inputs, reuse=reuse, name='layer_norm_1') 164 | outputs = tf.nn.dropout(outputs, rate=drop_rate) 165 | outputs = multihead_attention(outputs, dim=dim, num_heads=num_heads, mask=mask, drop_rate=drop_rate, 166 | name='multihead_attention') 167 | outputs = tf.nn.dropout(outputs, rate=drop_rate) 168 | residual = outputs + inputs 169 | # feed forward layer 170 | outputs = layer_norm(residual, reuse=reuse, name='layer_norm_2') 171 | outputs = tf.nn.dropout(outputs, rate=drop_rate) 172 | outputs = conv1d(outputs, dim=dim, use_bias=use_bias, activation=None, reuse=reuse, name='dense') 173 | outputs = tf.nn.dropout(outputs, rate=drop_rate) 174 | outputs = outputs + residual 175 | return outputs 176 | 177 | 178 | def feature_encoder(inputs, hidden_size, num_heads, max_position_length, drop_rate, mask, reuse=None, 179 | name='feature_encoder'): 180 | with tf.variable_scope(name, reuse=reuse): 181 | features = add_positional_embedding(inputs, max_position_length=max_position_length, reuse=reuse, 182 | name='positional_embedding') 183 | features = conv_block(features, kernel_size=7, dim=hidden_size, num_layers=4, reuse=reuse, drop_rate=drop_rate, 184 | name='conv_block') 185 | features = multihead_attention_block(features, dim=hidden_size, num_heads=num_heads, mask=mask, use_bias=True, 186 | drop_rate=drop_rate, reuse=False, name='multihead_attention_block') 187 | return features 188 | 189 | 190 | def video_query_attention(video_features, query_features, v_mask, q_mask, drop_rate=0.0, reuse=None, 191 | name='video_query_attention'): 192 | with tf.variable_scope(name, reuse=reuse): 193 | dim = get_shape_list(video_features)[-1] 194 | v_maxlen = tf.reduce_max(tf.reduce_sum(v_mask, axis=1)) 195 | q_maxlen = tf.reduce_max(tf.reduce_sum(q_mask, axis=1)) 196 | score = trilinear_attention([video_features, query_features], v_maxlen=v_maxlen, q_maxlen=q_maxlen, 197 | drop_rate=drop_rate, reuse=reuse, name='efficient_trilinear') 198 | mask_q = tf.expand_dims(q_mask, 1) 199 | score_ = tf.nn.softmax(mask_logits(score, mask=mask_q)) 200 | mask_v = tf.expand_dims(v_mask, 2) 201 | score_t = tf.transpose(tf.nn.softmax(mask_logits(score, mask=mask_v), dim=1), perm=[0, 2, 1]) 202 | v2q = tf.matmul(score_, query_features) 203 | q2v = tf.matmul(tf.matmul(score_, score_t), video_features) 204 | attention_outputs = tf.concat([video_features, v2q, video_features * v2q, video_features * q2v], axis=-1) 205 | outputs = conv1d(attention_outputs, dim=dim, use_bias=False, activation=None, reuse=reuse, name='dense') 206 | return outputs, score 207 | 208 | 209 | def context_query_concat(inputs, qfeats, q_mask, reuse=None, name='context_query_concat'): 210 | with tf.variable_scope(name, reuse=reuse): 211 | dim = get_shape_list(qfeats)[-1] 212 | # compute pooled query feature 213 | weight = tf.get_variable(name='weight', shape=[dim, 1], dtype=tf.float32, regularizer=regularizer) 214 | x = tf.tensordot(qfeats, weight, axes=1) # shape = (batch_size, seq_length, 1) 215 | q_mask = tf.expand_dims(q_mask, axis=-1) # shape = (batch_size, seq_length, 1) 216 | x = mask_logits(x, mask=q_mask) 217 | alphas = tf.nn.softmax(x, axis=1) 218 | q_pooled = tf.matmul(tf.transpose(qfeats, 
perm=[0, 2, 1]), alphas) 219 | q_pooled = tf.squeeze(q_pooled, axis=-1) # shape = (batch_size, dim) 220 | # concatenation 221 | q_pooled = tf.tile(tf.expand_dims(q_pooled, axis=1), multiples=[1, tf.shape(inputs)[1], 1]) 222 | outputs = tf.concat([inputs, q_pooled], axis=-1) 223 | outputs = conv1d(outputs, dim=dim, use_bias=True, reuse=False, name='dense') 224 | return outputs 225 | 226 | 227 | def highlight_layer(inputs, labels, mask, epsilon=1e-12, reuse=None, name='highlight_layer'): 228 | with tf.variable_scope(name, reuse=reuse): 229 | logits = conv1d(inputs, dim=1, use_bias=True, padding='VALID', reuse=reuse, name='dense') 230 | logits = tf.squeeze(logits, axis=-1) # (batch_size, seq_length) 231 | logits = mask_logits(logits, mask=mask) 232 | # prepare labels and weights 233 | labels = tf.cast(labels, dtype=logits.dtype) 234 | weights = tf.where(tf.equal(labels, 0.0), x=labels + 1.0, y=labels * 2.0) 235 | # binary cross entropy with sigmoid activation 236 | loss_per_location = tf.nn.sigmoid_cross_entropy_with_logits(labels=labels, logits=logits) 237 | loss_per_location = loss_per_location * weights 238 | mask = tf.cast(mask, dtype=logits.dtype) 239 | loss = tf.reduce_sum(loss_per_location * mask) / (tf.reduce_sum(mask) + epsilon) 240 | # compute scores 241 | scores = tf.sigmoid(logits) 242 | return loss, scores 243 | 244 | 245 | def dynamic_rnn(inputs, seq_len, dim, reuse=None, name='dynamic_rnn'): 246 | with tf.variable_scope(name, reuse=reuse): 247 | cell = tf.nn.rnn_cell.LSTMCell(num_units=dim, use_peepholes=False, name='lstm_cell') 248 | outputs, _ = tf.nn.dynamic_rnn(cell, inputs, sequence_length=seq_len, dtype=tf.float32) 249 | return outputs 250 | 251 | 252 | def conditioned_predictor(inputs, hidden_size, seq_len, mask, num_heads, max_position_length, drop_rate, mode='rnn', 253 | reuse=None, name='conditioned_predictor'): 254 | with tf.variable_scope(name, reuse=reuse): 255 | if mode == 'rnn': 256 | start_features = dynamic_rnn(inputs, seq_len, dim=hidden_size, reuse=False, name='start_rnn') 257 | end_features = dynamic_rnn(start_features, seq_len, dim=hidden_size, reuse=False, name='end_rnn') 258 | else: 259 | start_features = feature_encoder(inputs, hidden_size=hidden_size, num_heads=num_heads, mask=mask, 260 | max_position_length=max_position_length, drop_rate=drop_rate, reuse=False, 261 | name='feature_encoder') 262 | end_features = feature_encoder(start_features, hidden_size=hidden_size, num_heads=num_heads, mask=mask, 263 | max_position_length=max_position_length, drop_rate=drop_rate, reuse=True, 264 | name='feature_encoder') 265 | start_features = layer_norm(start_features, reuse=False, name='s_layer_norm') 266 | end_features = layer_norm(end_features, reuse=False, name='e_layer_norm') 267 | start_features = conv1d(tf.concat([start_features, inputs], axis=-1), dim=hidden_size, use_bias=True, 268 | reuse=False, activation=tf.nn.relu, name='start_hidden') 269 | end_features = conv1d(tf.concat([end_features, inputs], axis=-1), dim=hidden_size, use_bias=True, reuse=False, 270 | activation=tf.nn.relu, name='end_hidden') 271 | start_logits = conv1d(start_features, dim=1, use_bias=True, reuse=reuse, name='start_dense') 272 | end_logits = conv1d(end_features, dim=1, use_bias=True, reuse=reuse, name='end_dense') 273 | start_logits = mask_logits(tf.squeeze(start_logits, axis=-1), mask=mask) # shape = (batch_size, seq_length) 274 | end_logits = mask_logits(tf.squeeze(end_logits, axis=-1), mask=mask) # shape = (batch_size, seq_length) 275 | return start_logits, end_logits 276 | 277 
| 278 | def localization_loss(start_logits, end_logits, y1, y2): 279 | start_prob = tf.nn.softmax(start_logits, axis=1) 280 | end_prob = tf.nn.softmax(end_logits, axis=1) 281 | outer = tf.matmul(tf.expand_dims(start_prob, axis=2), tf.expand_dims(end_prob, axis=1)) 282 | outer = tf.matrix_band_part(outer, num_lower=0, num_upper=-1) 283 | start_index = tf.argmax(tf.reduce_max(outer, axis=2), axis=1) 284 | end_index = tf.argmax(tf.reduce_max(outer, axis=1), axis=1) 285 | start_losses = tf.nn.softmax_cross_entropy_with_logits_v2(logits=start_logits, labels=y1) 286 | end_losses = tf.nn.softmax_cross_entropy_with_logits_v2(logits=end_logits, labels=y2) 287 | loss = tf.reduce_mean(start_losses + end_losses) 288 | return start_prob, end_prob, start_index, end_index, loss 289 | -------------------------------------------------------------------------------- /model/layers_t7.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | 6 | 7 | def mask_logits(inputs, mask, mask_value=-1e30): 8 | mask = mask.type(torch.float32) 9 | return inputs + (1.0 - mask) * mask_value 10 | 11 | 12 | class Conv1D(nn.Module): 13 | def __init__(self, in_dim, out_dim, kernel_size=1, stride=1, padding=0, bias=True): 14 | super(Conv1D, self).__init__() 15 | self.conv1d = nn.Conv1d(in_channels=in_dim, out_channels=out_dim, kernel_size=kernel_size, padding=padding, 16 | stride=stride, bias=bias) 17 | 18 | def forward(self, x): 19 | # suppose all the input with shape (batch_size, seq_len, dim) 20 | x = x.transpose(1, 2) # (batch_size, dim, seq_len) 21 | x = self.conv1d(x) 22 | return x.transpose(1, 2) # (batch_size, seq_len, dim) 23 | 24 | 25 | class WordEmbedding(nn.Module): 26 | def __init__(self, num_words, word_dim, drop_rate, word_vectors=None): 27 | super(WordEmbedding, self).__init__() 28 | self.is_pretrained = False if word_vectors is None else True 29 | if self.is_pretrained: 30 | self.pad_vec = nn.Parameter(torch.zeros(size=(1, word_dim), dtype=torch.float32), requires_grad=False) 31 | unk_vec = torch.empty(size=(1, word_dim), requires_grad=True, dtype=torch.float32) 32 | nn.init.xavier_uniform_(unk_vec) 33 | self.unk_vec = nn.Parameter(unk_vec, requires_grad=True) 34 | self.glove_vec = nn.Parameter(torch.tensor(word_vectors, dtype=torch.float32), requires_grad=False) 35 | else: 36 | self.word_emb = nn.Embedding(num_words, word_dim, padding_idx=0) 37 | self.dropout = nn.Dropout(p=drop_rate) 38 | 39 | def forward(self, word_ids): 40 | if self.is_pretrained: 41 | word_emb = F.embedding(word_ids, torch.cat([self.pad_vec, self.unk_vec, self.glove_vec], dim=0), 42 | padding_idx=0) 43 | else: 44 | word_emb = self.word_emb(word_ids) 45 | return self.dropout(word_emb) 46 | 47 | 48 | class CharacterEmbedding(nn.Module): 49 | def __init__(self, num_chars, char_dim, drop_rate): 50 | super(CharacterEmbedding, self).__init__() 51 | self.char_emb = nn.Embedding(num_chars, char_dim, padding_idx=0) 52 | kernels, channels = [1, 2, 3, 4], [10, 20, 30, 40] 53 | self.char_convs = nn.ModuleList([ 54 | nn.Sequential( 55 | nn.Conv2d(in_channels=char_dim, out_channels=channel, kernel_size=(1, kernel), stride=(1, 1), padding=0, 56 | bias=True), 57 | nn.ReLU() 58 | ) for kernel, channel in zip(kernels, channels) 59 | ]) 60 | self.dropout = nn.Dropout(p=drop_rate) 61 | 62 | def forward(self, char_ids): 63 | char_emb = self.char_emb(char_ids) # (batch_size, w_seq_len, c_seq_len, char_dim) 64 | char_emb = 
self.dropout(char_emb) 65 | char_emb = char_emb.permute(0, 3, 1, 2) # (batch_size, char_dim, w_seq_len, c_seq_len) 66 | char_outputs = [] 67 | for conv_layer in self.char_convs: 68 | output = conv_layer(char_emb) 69 | output, _ = torch.max(output, dim=3, keepdim=False) # reduce max (batch_size, channel, w_seq_len) 70 | char_outputs.append(output) 71 | char_output = torch.cat(char_outputs, dim=1) # (batch_size, sum(channels), w_seq_len) 72 | return char_output.permute(0, 2, 1) # (batch_size, w_seq_len, sum(channels)) 73 | 74 | 75 | class Embedding(nn.Module): 76 | def __init__(self, num_words, num_chars, word_dim, char_dim, drop_rate, out_dim, word_vectors=None): 77 | super(Embedding, self).__init__() 78 | self.word_emb = WordEmbedding(num_words, word_dim, drop_rate, word_vectors=word_vectors) 79 | self.char_emb = CharacterEmbedding(num_chars, char_dim, drop_rate) 80 | # output linear layer 81 | self.linear = Conv1D(in_dim=word_dim + 100, out_dim=out_dim, kernel_size=1, stride=1, padding=0, bias=True) 82 | 83 | def forward(self, word_ids, char_ids): 84 | word_emb = self.word_emb(word_ids) # (batch_size, w_seq_len, word_dim) 85 | char_emb = self.char_emb(char_ids) # (batch_size, w_seq_len, 100) 86 | emb = torch.cat([word_emb, char_emb], dim=2) # (batch_size, w_seq_len, word_dim + 100) 87 | emb = self.linear(emb) # (batch_size, w_seq_len, dim) 88 | return emb 89 | 90 | 91 | class PositionalEmbedding(nn.Module): 92 | """Construct the embeddings from word, position and token_type embeddings.""" 93 | def __init__(self, num_embeddings, embedding_dim): 94 | super(PositionalEmbedding, self).__init__() 95 | self.position_embeddings = nn.Embedding(num_embeddings, embedding_dim) 96 | 97 | def forward(self, inputs): 98 | bsz, seq_length = inputs.shape[:2] 99 | position_ids = torch.arange(seq_length, dtype=torch.long, device=inputs.device) 100 | position_ids = position_ids.unsqueeze(0).repeat(bsz, 1) # (N, L) 101 | position_embeddings = self.position_embeddings(position_ids) 102 | return position_embeddings 103 | 104 | 105 | class VisualProjection(nn.Module): 106 | def __init__(self, visual_dim, dim, drop_rate=0.0): 107 | super(VisualProjection, self).__init__() 108 | self.drop = nn.Dropout(p=drop_rate) 109 | self.linear = Conv1D(in_dim=visual_dim, out_dim=dim, kernel_size=1, stride=1, bias=True, padding=0) 110 | 111 | def forward(self, visual_features): 112 | # the input visual feature with shape (batch_size, seq_len, visual_dim) 113 | visual_features = self.drop(visual_features) 114 | output = self.linear(visual_features) # (batch_size, seq_len, dim) 115 | return output 116 | 117 | 118 | class DepthwiseSeparableConvBlock(nn.Module): 119 | def __init__(self, dim, kernel_size, drop_rate, num_layers=4): 120 | super(DepthwiseSeparableConvBlock, self).__init__() 121 | self.depthwise_separable_conv = nn.ModuleList([ 122 | nn.Sequential( 123 | nn.Conv1d(in_channels=dim, out_channels=dim, kernel_size=kernel_size, groups=dim, 124 | padding=kernel_size // 2, bias=False), 125 | nn.Conv1d(in_channels=dim, out_channels=dim, kernel_size=1, padding=0, bias=True), 126 | nn.ReLU(), 127 | ) for _ in range(num_layers)]) 128 | self.layer_norms = nn.ModuleList([nn.LayerNorm(dim, eps=1e-6) for _ in range(num_layers)]) 129 | self.dropout = nn.Dropout(p=drop_rate) 130 | 131 | def forward(self, x): 132 | output = x # (batch_size, seq_len, dim) 133 | for idx, conv_layer in enumerate(self.depthwise_separable_conv): 134 | residual = output 135 | output = self.layer_norms[idx](output) # (batch_size, seq_len, dim) 136 | output = 
output.transpose(1, 2) # (batch_size, dim, seq_len) 137 | output = conv_layer(output) 138 | output = self.dropout(output) 139 | output = output.transpose(1, 2) + residual # (batch_size, seq_len, dim) 140 | return output 141 | 142 | 143 | class MultiHeadAttentionBlock(nn.Module): 144 | def __init__(self, dim, num_heads, drop_rate): 145 | super(MultiHeadAttentionBlock, self).__init__() 146 | assert dim % num_heads == 0, 'The channels (%d) is not a multiple of attention heads (%d)' % (dim, num_heads) 147 | self.head_size, self.num_heads, self.dim = int(dim / num_heads), num_heads, dim 148 | self.dropout = nn.Dropout(p=drop_rate) 149 | self.query = Conv1D(in_dim=dim, out_dim=dim, kernel_size=1, stride=1, padding=0, bias=True) 150 | self.key = Conv1D(in_dim=dim, out_dim=dim, kernel_size=1, stride=1, padding=0, bias=True) 151 | self.value = Conv1D(in_dim=dim, out_dim=dim, kernel_size=1, stride=1, padding=0, bias=True) 152 | self.layer_norm1 = nn.LayerNorm(dim, eps=1e-6) 153 | self.layer_norm2 = nn.LayerNorm(dim, eps=1e-6) 154 | self.out_layer = Conv1D(in_dim=dim, out_dim=dim, kernel_size=1, stride=1, padding=0, bias=True) 155 | 156 | def transpose_for_scores(self, x): 157 | new_x_shape = x.size()[:-1] + (self.num_heads, self.head_size) 158 | x = x.view(*new_x_shape) 159 | return x.permute(0, 2, 1, 3) # (batch_size, num_heads, w_seq_len, head_size) 160 | 161 | @staticmethod 162 | def combine_last_two_dim(x): 163 | old_shape = list(x.size()) 164 | new_shape = old_shape[:-2] + [old_shape[-2] * old_shape[-1]] 165 | return x.reshape(shape=new_shape) 166 | 167 | def forward(self, x, mask=None): 168 | output = self.layer_norm1(x) # (batch_size, seq_len, dim) 169 | output = self.dropout(output) 170 | # multi-head attention layer 171 | query = self.transpose_for_scores(self.query(output)) # (batch_size, num_heads, seq_len, head_size) 172 | key = self.transpose_for_scores(self.key(output)) 173 | value = self.transpose_for_scores(self.value(output)) 174 | attention_scores = torch.matmul(query, key.transpose(-1, -2)) # (batch_size, num_heads, seq_len, seq_len) 175 | attention_scores = attention_scores / math.sqrt(self.head_size) 176 | if mask is not None: # masking 177 | mask = mask.unsqueeze(1).unsqueeze(2) # (batch_size, 1, 1, seq_len) 178 | attention_scores = mask_logits(attention_scores, mask) 179 | attention_probs = nn.Softmax(dim=-1)(attention_scores) # (batch_size, num_heads, seq_len, seq_len) 180 | attention_probs = self.dropout(attention_probs) 181 | value = torch.matmul(attention_probs, value) # (batch_size, num_heads, seq_len, head_size) 182 | value = self.combine_last_two_dim(value.permute(0, 2, 1, 3)) # (batch_size, seq_len, dim) 183 | # intermediate layer 184 | output = self.dropout(value) 185 | residual = output + x 186 | output = self.layer_norm2(residual) 187 | output = self.dropout(output) 188 | output = self.out_layer(output) 189 | output = self.dropout(output) + residual 190 | return output 191 | 192 | 193 | class FeatureEncoder(nn.Module): 194 | def __init__(self, dim, num_heads, max_pos_len, kernel_size=7, num_layers=4, drop_rate=0.0): 195 | super(FeatureEncoder, self).__init__() 196 | self.pos_embedding = PositionalEmbedding(num_embeddings=max_pos_len, embedding_dim=dim) 197 | self.conv_block = DepthwiseSeparableConvBlock(dim=dim, kernel_size=kernel_size, drop_rate=drop_rate, 198 | num_layers=num_layers) 199 | self.attention_block = MultiHeadAttentionBlock(dim=dim, num_heads=num_heads, drop_rate=drop_rate) 200 | 201 | def forward(self, x, mask=None): 202 | features = x + 
self.pos_embedding(x) # (batch_size, seq_len, dim) 203 | features = self.conv_block(features) # (batch_size, seq_len, dim) 204 | features = self.attention_block(features, mask=mask) # (batch_size, seq_len, dim) 205 | return features 206 | 207 | 208 | class CQAttention(nn.Module): 209 | def __init__(self, dim, drop_rate=0.0): 210 | super(CQAttention, self).__init__() 211 | w4C = torch.empty(dim, 1) 212 | w4Q = torch.empty(dim, 1) 213 | w4mlu = torch.empty(1, 1, dim) 214 | nn.init.xavier_uniform_(w4C) 215 | nn.init.xavier_uniform_(w4Q) 216 | nn.init.xavier_uniform_(w4mlu) 217 | self.w4C = nn.Parameter(w4C, requires_grad=True) 218 | self.w4Q = nn.Parameter(w4Q, requires_grad=True) 219 | self.w4mlu = nn.Parameter(w4mlu, requires_grad=True) 220 | self.dropout = nn.Dropout(p=drop_rate) 221 | self.cqa_linear = Conv1D(in_dim=4 * dim, out_dim=dim, kernel_size=1, stride=1, padding=0, bias=True) 222 | 223 | def forward(self, context, query, c_mask, q_mask): 224 | score = self.trilinear_attention(context, query) # (batch_size, c_seq_len, q_seq_len) 225 | score_ = nn.Softmax(dim=2)(mask_logits(score, q_mask.unsqueeze(1))) # (batch_size, c_seq_len, q_seq_len) 226 | score_t = nn.Softmax(dim=1)(mask_logits(score, c_mask.unsqueeze(2))) # (batch_size, c_seq_len, q_seq_len) 227 | score_t = score_t.transpose(1, 2) # (batch_size, q_seq_len, c_seq_len) 228 | c2q = torch.matmul(score_, query) # (batch_size, c_seq_len, dim) 229 | q2c = torch.matmul(torch.matmul(score_, score_t), context) # (batch_size, c_seq_len, dim) 230 | output = torch.cat([context, c2q, torch.mul(context, c2q), torch.mul(context, q2c)], dim=2) 231 | output = self.cqa_linear(output) # (batch_size, c_seq_len, dim) 232 | return output 233 | 234 | def trilinear_attention(self, context, query): 235 | batch_size, c_seq_len, dim = context.shape 236 | batch_size, q_seq_len, dim = query.shape 237 | context = self.dropout(context) 238 | query = self.dropout(query) 239 | subres0 = torch.matmul(context, self.w4C).expand([-1, -1, q_seq_len]) # (batch_size, c_seq_len, q_seq_len) 240 | subres1 = torch.matmul(query, self.w4Q).transpose(1, 2).expand([-1, c_seq_len, -1]) 241 | subres2 = torch.matmul(context * self.w4mlu, query.transpose(1, 2)) 242 | res = subres0 + subres1 + subres2 # (batch_size, c_seq_len, q_seq_len) 243 | return res 244 | 245 | 246 | class WeightedPool(nn.Module): 247 | def __init__(self, dim): 248 | super(WeightedPool, self).__init__() 249 | weight = torch.empty(dim, 1) 250 | nn.init.xavier_uniform_(weight) 251 | self.weight = nn.Parameter(weight, requires_grad=True) 252 | 253 | def forward(self, x, mask): 254 | alpha = torch.tensordot(x, self.weight, dims=1) # shape = (batch_size, seq_length, 1) 255 | alpha = mask_logits(alpha, mask=mask.unsqueeze(2)) 256 | alphas = nn.Softmax(dim=1)(alpha) 257 | pooled_x = torch.matmul(x.transpose(1, 2), alphas) # (batch_size, dim, 1) 258 | pooled_x = pooled_x.squeeze(2) 259 | return pooled_x 260 | 261 | 262 | class CQConcatenate(nn.Module): 263 | def __init__(self, dim): 264 | super(CQConcatenate, self).__init__() 265 | self.weighted_pool = WeightedPool(dim=dim) 266 | self.conv1d = Conv1D(in_dim=2 * dim, out_dim=dim, kernel_size=1, stride=1, padding=0, bias=True) 267 | 268 | def forward(self, context, query, q_mask): 269 | pooled_query = self.weighted_pool(query, q_mask) # (batch_size, dim) 270 | _, c_seq_len, _ = context.shape 271 | pooled_query = pooled_query.unsqueeze(1).repeat(1, c_seq_len, 1) # (batch_size, c_seq_len, dim) 272 | output = torch.cat([context, pooled_query], dim=2) # (batch_size, 
c_seq_len, 2*dim) 273 | output = self.conv1d(output) 274 | return output 275 | 276 | 277 | class HighLightLayer(nn.Module): 278 | def __init__(self, dim): 279 | super(HighLightLayer, self).__init__() 280 | self.conv1d = Conv1D(in_dim=dim, out_dim=1, kernel_size=1, stride=1, padding=0, bias=True) 281 | 282 | def forward(self, x, mask): 283 | # compute logits 284 | logits = self.conv1d(x) 285 | logits = logits.squeeze(2) 286 | logits = mask_logits(logits, mask) 287 | # compute score 288 | scores = nn.Sigmoid()(logits) 289 | return scores 290 | 291 | @staticmethod 292 | def compute_loss(scores, labels, mask, epsilon=1e-12): 293 | labels = labels.type(torch.float32) 294 | weights = torch.where(labels == 0.0, labels + 1.0, 2.0 * labels) 295 | loss_per_location = nn.BCELoss(reduction='none')(scores, labels) 296 | loss_per_location = loss_per_location * weights 297 | mask = mask.type(torch.float32) 298 | loss = torch.sum(loss_per_location * mask) / (torch.sum(mask) + epsilon) 299 | return loss 300 | 301 | 302 | class DynamicRNN(nn.Module): 303 | def __init__(self, dim): 304 | super(DynamicRNN, self).__init__() 305 | self.lstm = nn.LSTM(input_size=dim, hidden_size=dim, num_layers=1, bias=True, batch_first=True, 306 | bidirectional=False) 307 | 308 | def forward(self, x, mask): 309 | out, _ = self.lstm(x) # (bsz, seq_len, dim) 310 | mask = mask.type(torch.float32) 311 | mask = mask.unsqueeze(2) 312 | out = out * mask 313 | return out 314 | 315 | 316 | class ConditionedPredictor(nn.Module): 317 | def __init__(self, dim, num_heads, max_pos_len, drop_rate=0.0, predictor='rnn'): 318 | super(ConditionedPredictor, self).__init__() 319 | self.predictor = predictor 320 | if predictor == 'rnn': 321 | self.start_encoder = DynamicRNN(dim=dim) 322 | self.end_encoder = DynamicRNN(dim=dim) 323 | else: 324 | self.encoder = FeatureEncoder(dim=dim, num_heads=num_heads, kernel_size=7, num_layers=4, 325 | max_pos_len=max_pos_len, drop_rate=drop_rate) 326 | self.start_layer_norm = nn.LayerNorm(dim, eps=1e-6) 327 | self.end_layer_norm = nn.LayerNorm(dim, eps=1e-6) 328 | 329 | self.start_block = nn.Sequential( 330 | Conv1D(in_dim=2 * dim, out_dim=dim, kernel_size=1, stride=1, padding=0, bias=True), 331 | nn.ReLU(), 332 | Conv1D(in_dim=dim, out_dim=1, kernel_size=1, stride=1, padding=0, bias=True) 333 | ) 334 | self.end_block = nn.Sequential( 335 | Conv1D(in_dim=2 * dim, out_dim=dim, kernel_size=1, stride=1, padding=0, bias=True), 336 | nn.ReLU(), 337 | Conv1D(in_dim=dim, out_dim=1, kernel_size=1, stride=1, padding=0, bias=True) 338 | ) 339 | 340 | def forward(self, x, mask): 341 | if self.predictor == 'rnn': 342 | start_features = self.start_encoder(x, mask) # (batch_size, seq_len, dim) 343 | end_features = self.end_encoder(start_features, mask) 344 | else: 345 | start_features = self.encoder(x, mask) 346 | end_features = self.encoder(start_features, mask) 347 | start_features = self.start_layer_norm(start_features) 348 | end_features = self.end_layer_norm(end_features) 349 | start_features = self.start_block(torch.cat([start_features, x], dim=2)) # (batch_size, seq_len, 1) 350 | end_features = self.end_block(torch.cat([end_features, x], dim=2)) 351 | start_logits = mask_logits(start_features.squeeze(2), mask=mask) 352 | end_logits = mask_logits(end_features.squeeze(2), mask=mask) 353 | return start_logits, end_logits 354 | 355 | @staticmethod 356 | def extract_index(start_logits, end_logits): 357 | start_prob = nn.Softmax(dim=1)(start_logits) 358 | end_prob = nn.Softmax(dim=1)(end_logits) 359 | outer = 
torch.matmul(start_prob.unsqueeze(dim=2), end_prob.unsqueeze(dim=1)) 360 | outer = torch.triu(outer, diagonal=0) 361 | _, start_index = torch.max(torch.max(outer, dim=2)[0], dim=1) # (batch_size, ) 362 | _, end_index = torch.max(torch.max(outer, dim=1)[0], dim=1) # (batch_size, ) 363 | return start_index, end_index 364 | 365 | @staticmethod 366 | def compute_cross_entropy_loss(start_logits, end_logits, start_labels, end_labels): 367 | start_loss = nn.CrossEntropyLoss(reduction='mean')(start_logits, start_labels) 368 | end_loss = nn.CrossEntropyLoss(reduction='mean')(end_logits, end_labels) 369 | return start_loss + end_loss 370 | --------------------------------------------------------------------------------
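The PyTorch modules defined in model/layers_t7.py are building blocks that main_t7.py and model/VSLNet_t7.py assemble into the full network. Below is a minimal composition sketch showing the expected call order and tensor shapes of those blocks. It is an illustration only, not the repository's VSLNet_t7.py: the class name VSLNetSketch, the choice to share one FeatureEncoder across both modalities, the gating of the fused features by the highlight scores, and every hyperparameter value are assumptions made for the sketch.

import torch
import torch.nn as nn
from model.layers_t7 import (Embedding, VisualProjection, FeatureEncoder, CQAttention,
                             CQConcatenate, HighLightLayer, ConditionedPredictor)


class VSLNetSketch(nn.Module):
    """Illustrative wiring of the layers_t7 building blocks (not the official model)."""
    def __init__(self, num_words, num_chars, word_vectors=None, visual_dim=1024, dim=128,
                 word_dim=300, char_dim=50, num_heads=8, max_pos_len=128, drop_rate=0.2):
        super(VSLNetSketch, self).__init__()
        self.embedding = Embedding(num_words, num_chars, word_dim, char_dim, drop_rate,
                                   out_dim=dim, word_vectors=word_vectors)
        self.video_proj = VisualProjection(visual_dim=visual_dim, dim=dim, drop_rate=drop_rate)
        self.encoder = FeatureEncoder(dim=dim, num_heads=num_heads, max_pos_len=max_pos_len,
                                      kernel_size=7, num_layers=4, drop_rate=drop_rate)
        self.cq_attention = CQAttention(dim=dim, drop_rate=drop_rate)
        self.cq_concat = CQConcatenate(dim=dim)
        self.highlight = HighLightLayer(dim=dim)
        self.predictor = ConditionedPredictor(dim=dim, num_heads=num_heads,
                                              max_pos_len=max_pos_len, drop_rate=drop_rate,
                                              predictor='rnn')

    def forward(self, word_ids, char_ids, video_features, v_mask, q_mask):
        query = self.embedding(word_ids, char_ids)   # (batch_size, q_seq_len, dim)
        video = self.video_proj(video_features)      # (batch_size, v_seq_len, dim)
        # a single FeatureEncoder (shared weights) encodes both modalities (assumption)
        query = self.encoder(query, mask=q_mask)
        video = self.encoder(video, mask=v_mask)
        # query-guided video representation
        features = self.cq_attention(video, query, v_mask, q_mask)  # (batch_size, v_seq_len, dim)
        features = self.cq_concat(features, query, q_mask)          # (batch_size, v_seq_len, dim)
        # highlight scores gate the fused features (illustrative choice)
        h_scores = self.highlight(features, v_mask)                 # (batch_size, v_seq_len)
        features = features * h_scores.unsqueeze(2)
        # conditioned start/end span prediction
        start_logits, end_logits = self.predictor(features, mask=v_mask)
        return h_scores, start_logits, end_logits


if __name__ == '__main__':
    # smoke test with random inputs; all sizes are arbitrary
    batch_size, q_len, v_len = 2, 12, 64
    model = VSLNetSketch(num_words=8000, num_chars=60)
    word_ids = torch.randint(0, 8000, (batch_size, q_len))
    char_ids = torch.randint(0, 60, (batch_size, q_len, 10))
    video_feats = torch.rand(batch_size, v_len, 1024)
    v_mask = torch.ones(batch_size, v_len)
    q_mask = torch.ones(batch_size, q_len)
    h_scores, s_logits, e_logits = model(word_ids, char_ids, video_feats, v_mask, q_mask)
    s_idx, e_idx = ConditionedPredictor.extract_index(s_logits, e_logits)  # each (batch_size,)

Training would additionally combine HighLightLayer.compute_loss on the highlight scores with ConditionedPredictor.compute_cross_entropy_loss on the span logits, as the static methods defined above suggest.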