├── LICENSE ├── README.md ├── Thumos14-Annotations ├── Ambiguous_test.txt ├── classlist.npy ├── duration.npy ├── labels.npy ├── labels_all.npy ├── segments.npy ├── subset.npy └── videoname.npy ├── Thumos14reduced-Annotations ├── Ambiguous_test.txt ├── classlist.npy ├── duration.npy ├── extracted_fps.npy ├── labels.npy ├── labels_all.npy ├── original_fps.npy ├── segments.npy ├── subset.npy └── videoname.npy ├── classificationMAP.py ├── detectionMAP.py ├── main.py ├── model.py ├── options.py ├── test.py ├── test_set_meta.mat ├── train.py ├── utils.py └── video_dataset.py /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Sujoy Paul 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # W-TALC: Weakly-supervised Temporal Activity Localization and Classification 2 | 3 | ## Overview 4 | This package is a PyTorch implementation of the paper [W-TALC: Weakly-supervised Temporal Activity Localization and Classification](http://openaccess.thecvf.com/content_ECCV_2018/papers/Sujoy_Paul_W-TALC_Weakly-supervised_Temporal_ECCV_2018_paper.pdf) by [Sujoy Paul](http://www.ee.ucr.edu/~supaul/), 5 | Sourya Roy and [Amit K Roy-Chowdhury](http://www.ee.ucr.edu/~amitrc/), published at [ECCV 2018](https://eccv2018.org/). The TensorFlow implementation can be found [here](https://github.com/sujoyp/wtalc-tensorflow). 6 | 7 | ## Dependencies 8 | This package depends on the following packages: 9 | 1. PyTorch 0.4.1, Tensorboard Logger 0.1.0 10 | 2. Python 3.6 11 | 3. numpy and scipy, among others 12 | 13 | ## Data 14 | The features for the Thumos14 and ActivityNet1.2 datasets can be downloaded [here](https://emailucr-my.sharepoint.com/:f:/g/personal/sujoy_paul_email_ucr_edu/Es1zbHQY4PxKhUkdgvWHtU0BK-_yugaSjXK84kWsB0XD0w?e=I836Fl). The annotations are included with this package. 15 | 16 | ## Running 17 | This code can be run on two different datasets - Thumos14 and Thumos14reduced. The latter contains only those videos of Thumos14 that have temporal boundary annotations. The two feature types (I3D and UNT) are available only for Thumos14reduced. The dataset name, along with other parameters, can be changed in options.py. The file to be executed is main.py. 
The results can be viewed using the tensorboard logger or the text file named `<dataset-name>-results.log` that is generated during execution. The options for I3D features are the defaults mentioned in options.py. For UNT features, the options to be used are as follows: 18 | 19 | ```bash 20 | python main.py --max-seqlen 1200 --lr 0.00001 --feature-type UNT 21 | ```
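For I3D features, a run therefore reduces to the defaults; the invocation below simply restates the default values from options.py for illustration:

```bash
python main.py --feature-type I3D --max-seqlen 750 --lr 0.0001
```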
 22 | 23 | ## Citation 24 | Please cite the following work if you use this package. 25 | ```bibtex 26 | @inproceedings{paul2018w, 27 | title={W-TALC: Weakly-supervised Temporal Activity Localization and Classification}, 28 | author={Paul, Sujoy and Roy, Sourya and Roy-Chowdhury, Amit K}, 29 | booktitle={Proceedings of the European Conference on Computer Vision (ECCV)}, 30 | pages={563--579}, 31 | year={2018} 32 | } 33 | ``` 34 | 35 | ## Contact 36 | Please contact the first author of the associated paper, Sujoy Paul (supaul@ece.ucr.edu), for any further queries. 37 | 38 | 39 | -------------------------------------------------------------------------------- /Thumos14-Annotations/Ambiguous_test.txt: -------------------------------------------------------------------------------- 1 | video_test_0000278 0.0 1.4 2 | video_test_0000278 95.7 97.2 3 | video_test_0000293 50.6 54.6 4 | video_test_0000293 67.4 71.7 5 | video_test_0000293 99.7 106.4 6 | video_test_0000293 118.1 126.4 7 | video_test_0000293 145.8 149.8 8 | video_test_0000293 162.9 168.2 9 | video_test_0000293 181.3 184.0 10 | video_test_0000367 56.4 63.8 11 | video_test_0000367 167.8 170.7 12 | video_test_0000405 15.7 18.4 13 | video_test_0000426 17.8 18.6 14 | video_test_0000426 24.0 24.8 15 | video_test_0000426 40.1 41.8 16 | video_test_0000426 113.9 115.0 17 | video_test_0000426 118.8 119.7 18 | video_test_0000426 124.0 125.2 19 | video_test_0000426 135.2 136.9 20 | video_test_0000437 1.6 12.1 21 | video_test_0000437 47.2 48.4 22 | video_test_0000437 53.2 54.0 23 | video_test_0000437 65.9 67.8 24 | video_test_0000448 42.2 53.8 25 | video_test_0000461 14.0 16.7 26 | video_test_0000549 28.2 33.6 27 | video_test_0000549 14.4 17.4 28 | video_test_0000549 55.0 57.1 29 | video_test_0000593 23.6 29.0 30 | video_test_0000593 36.9 44.3 31 | video_test_0000611 43.0 45.9 32 | video_test_0000611 55.6 58.8 33 | video_test_0000611 59.9 71.0 34 | video_test_0000615 136.9 142.5 35 | video_test_0000615 152.7 159.8 36 | video_test_0000615 164.7 168.0 37 | video_test_0000624 2.5 6.8 38 | video_test_0000664 4.2 5.6 39 | video_test_0000691 36.3 80.8 40 | video_test_0000691 123.9 151.0 41 | video_test_0000714 136.7 137.5 42 | video_test_0000718 13.3 15.5 43 | video_test_0000847 33.7 35.4 44 | video_test_0000847 46.0 52.0 45 | video_test_0000847 58.7 67.0 46 | video_test_0000847 82.8 98.1 47 | video_test_0000847 136.2 171.2 48 | video_test_0000847 175.0 178.5 49 | video_test_0000847 204.5 212.8 50 | video_test_0000940 90.3 92.0 51 | video_test_0000989 170.6 188.4 52 | video_test_0001075 12.3 13.3 53 | video_test_0001075 142.6 143.9 54 | video_test_0001076 17.0 18.4 55 | video_test_0001076 23.8 25.9 56 | video_test_0001076 47.5 57.8 57 | video_test_0001079 335.1 342.8 58 | video_test_0001079 416.0 420.7 59 | video_test_0001127 76.1 161.2 60 | video_test_0001134 2.6 4.7 61 | video_test_0001134 21.2 22.8 62 | video_test_0001134 30.2 36.1 63 | video_test_0001134 41.4 45.4 64 | video_test_0001134 72.4 73.0 65 | video_test_0001168 52.6 78.2 66 | video_test_0001201 122.4 125.3 67 | video_test_0001209 122.9 141.3 68 | video_test_0001267 81.7 84.8 69 | video_test_0001292 78.4 113.2 70 | video_test_0001292 39.9 47.4 71 | video_test_0001292 141.2 154.1 72 | video_test_0001343 224.9 226.6 73 | video_test_0001343 241.0 244.3 74 | video_test_0001433 16.4 17.7 75 | video_test_0001496 22.2 23.9 76 | video_test_0001496 41.9 44.1 77 | video_test_0001496 54.8 56.6 78 | video_test_0001496 62.1 64.4 79 | video_test_0001496 71.9 73.4 80 | video_test_0001496 119.4 121.0 81 | video_test_0001496 124.2 126.3 82 | video_test_0001496 136.0 137.7 83 | video_test_0001496 145.2 147.1 84 | video_test_0001508 11.2 13.1 85 | video_test_0001508 19.4 23.3 86 | video_test_0001508 23.9 27.2 87 | video_test_0001508 29.2 32.2 88 | video_test_0001508 33.2 36.6 89 | video_test_0001508 43.0 45.6 90 | video_test_0001508 46.5 48.5 91 | video_test_0001508 131.4 132.5 92 | video_test_0001508 139.7 141.7 93 | video_test_0001508 149.1 151.9 94 | video_test_0001508 153.4 155.3 95 | video_test_0001508 160.8 166.0 96 | video_test_0001512 7.5 10.0 97 | video_test_0001532 57.3 121.4 98 | video_test_0001549 15.9 26.7 99 | video_test_0001549 75.0 100.2 100 | -------------------------------------------------------------------------------- /Thumos14-Annotations/classlist.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sujoyp/wtalc-pytorch/81ea98264a8456881eb7e98df4872b66d307c6cb/Thumos14-Annotations/classlist.npy -------------------------------------------------------------------------------- /Thumos14-Annotations/duration.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sujoyp/wtalc-pytorch/81ea98264a8456881eb7e98df4872b66d307c6cb/Thumos14-Annotations/duration.npy -------------------------------------------------------------------------------- /Thumos14-Annotations/labels.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sujoyp/wtalc-pytorch/81ea98264a8456881eb7e98df4872b66d307c6cb/Thumos14-Annotations/labels.npy -------------------------------------------------------------------------------- /Thumos14-Annotations/labels_all.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sujoyp/wtalc-pytorch/81ea98264a8456881eb7e98df4872b66d307c6cb/Thumos14-Annotations/labels_all.npy -------------------------------------------------------------------------------- /Thumos14-Annotations/segments.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sujoyp/wtalc-pytorch/81ea98264a8456881eb7e98df4872b66d307c6cb/Thumos14-Annotations/segments.npy -------------------------------------------------------------------------------- /Thumos14-Annotations/subset.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sujoyp/wtalc-pytorch/81ea98264a8456881eb7e98df4872b66d307c6cb/Thumos14-Annotations/subset.npy -------------------------------------------------------------------------------- /Thumos14-Annotations/videoname.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sujoyp/wtalc-pytorch/81ea98264a8456881eb7e98df4872b66d307c6cb/Thumos14-Annotations/videoname.npy -------------------------------------------------------------------------------- /Thumos14reduced-Annotations/Ambiguous_test.txt: -------------------------------------------------------------------------------- 1 | video_test_0000278 0.0 1.4 2 | 
video_test_0000278 95.7 97.2 3 | video_test_0000293 50.6 54.6 4 | video_test_0000293 67.4 71.7 5 | video_test_0000293 99.7 106.4 6 | video_test_0000293 118.1 126.4 7 | video_test_0000293 145.8 149.8 8 | video_test_0000293 162.9 168.2 9 | video_test_0000293 181.3 184.0 10 | video_test_0000367 56.4 63.8 11 | video_test_0000367 167.8 170.7 12 | video_test_0000405 15.7 18.4 13 | video_test_0000426 17.8 18.6 14 | video_test_0000426 24.0 24.8 15 | video_test_0000426 40.1 41.8 16 | video_test_0000426 113.9 115.0 17 | video_test_0000426 118.8 119.7 18 | video_test_0000426 124.0 125.2 19 | video_test_0000426 135.2 136.9 20 | video_test_0000437 1.6 12.1 21 | video_test_0000437 47.2 48.4 22 | video_test_0000437 53.2 54.0 23 | video_test_0000437 65.9 67.8 24 | video_test_0000448 42.2 53.8 25 | video_test_0000461 14.0 16.7 26 | video_test_0000549 28.2 33.6 27 | video_test_0000549 14.4 17.4 28 | video_test_0000549 55.0 57.1 29 | video_test_0000593 23.6 29.0 30 | video_test_0000593 36.9 44.3 31 | video_test_0000611 43.0 45.9 32 | video_test_0000611 55.6 58.8 33 | video_test_0000611 59.9 71.0 34 | video_test_0000615 136.9 142.5 35 | video_test_0000615 152.7 159.8 36 | video_test_0000615 164.7 168.0 37 | video_test_0000624 2.5 6.8 38 | video_test_0000664 4.2 5.6 39 | video_test_0000691 36.3 80.8 40 | video_test_0000691 123.9 151.0 41 | video_test_0000714 136.7 137.5 42 | video_test_0000718 13.3 15.5 43 | video_test_0000847 33.7 35.4 44 | video_test_0000847 46.0 52.0 45 | video_test_0000847 58.7 67.0 46 | video_test_0000847 82.8 98.1 47 | video_test_0000847 136.2 171.2 48 | video_test_0000847 175.0 178.5 49 | video_test_0000847 204.5 212.8 50 | video_test_0000940 90.3 92.0 51 | video_test_0000989 170.6 188.4 52 | video_test_0001075 12.3 13.3 53 | video_test_0001075 142.6 143.9 54 | video_test_0001076 17.0 18.4 55 | video_test_0001076 23.8 25.9 56 | video_test_0001076 47.5 57.8 57 | video_test_0001079 335.1 342.8 58 | video_test_0001079 416.0 420.7 59 | video_test_0001127 76.1 161.2 60 | video_test_0001134 2.6 4.7 61 | video_test_0001134 21.2 22.8 62 | video_test_0001134 30.2 36.1 63 | video_test_0001134 41.4 45.4 64 | video_test_0001134 72.4 73.0 65 | video_test_0001168 52.6 78.2 66 | video_test_0001201 122.4 125.3 67 | video_test_0001209 122.9 141.3 68 | video_test_0001267 81.7 84.8 69 | video_test_0001292 78.4 113.2 70 | video_test_0001292 39.9 47.4 71 | video_test_0001292 141.2 154.1 72 | video_test_0001343 224.9 226.6 73 | video_test_0001343 241.0 244.3 74 | video_test_0001433 16.4 17.7 75 | video_test_0001496 22.2 23.9 76 | video_test_0001496 41.9 44.1 77 | video_test_0001496 54.8 56.6 78 | video_test_0001496 62.1 64.4 79 | video_test_0001496 71.9 73.4 80 | video_test_0001496 119.4 121.0 81 | video_test_0001496 124.2 126.3 82 | video_test_0001496 136.0 137.7 83 | video_test_0001496 145.2 147.1 84 | video_test_0001508 11.2 13.1 85 | video_test_0001508 19.4 23.3 86 | video_test_0001508 23.9 27.2 87 | video_test_0001508 29.2 32.2 88 | video_test_0001508 33.2 36.6 89 | video_test_0001508 43.0 45.6 90 | video_test_0001508 46.5 48.5 91 | video_test_0001508 131.4 132.5 92 | video_test_0001508 139.7 141.7 93 | video_test_0001508 149.1 151.9 94 | video_test_0001508 153.4 155.3 95 | video_test_0001508 160.8 166.0 96 | video_test_0001512 7.5 10.0 97 | video_test_0001532 57.3 121.4 98 | video_test_0001549 15.9 26.7 99 | video_test_0001549 75.0 100.2 100 | -------------------------------------------------------------------------------- /Thumos14reduced-Annotations/classlist.npy: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/sujoyp/wtalc-pytorch/81ea98264a8456881eb7e98df4872b66d307c6cb/Thumos14reduced-Annotations/classlist.npy -------------------------------------------------------------------------------- /Thumos14reduced-Annotations/duration.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sujoyp/wtalc-pytorch/81ea98264a8456881eb7e98df4872b66d307c6cb/Thumos14reduced-Annotations/duration.npy -------------------------------------------------------------------------------- /Thumos14reduced-Annotations/extracted_fps.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sujoyp/wtalc-pytorch/81ea98264a8456881eb7e98df4872b66d307c6cb/Thumos14reduced-Annotations/extracted_fps.npy -------------------------------------------------------------------------------- /Thumos14reduced-Annotations/labels.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sujoyp/wtalc-pytorch/81ea98264a8456881eb7e98df4872b66d307c6cb/Thumos14reduced-Annotations/labels.npy -------------------------------------------------------------------------------- /Thumos14reduced-Annotations/labels_all.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sujoyp/wtalc-pytorch/81ea98264a8456881eb7e98df4872b66d307c6cb/Thumos14reduced-Annotations/labels_all.npy -------------------------------------------------------------------------------- /Thumos14reduced-Annotations/original_fps.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sujoyp/wtalc-pytorch/81ea98264a8456881eb7e98df4872b66d307c6cb/Thumos14reduced-Annotations/original_fps.npy -------------------------------------------------------------------------------- /Thumos14reduced-Annotations/segments.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sujoyp/wtalc-pytorch/81ea98264a8456881eb7e98df4872b66d307c6cb/Thumos14reduced-Annotations/segments.npy -------------------------------------------------------------------------------- /Thumos14reduced-Annotations/subset.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sujoyp/wtalc-pytorch/81ea98264a8456881eb7e98df4872b66d307c6cb/Thumos14reduced-Annotations/subset.npy -------------------------------------------------------------------------------- /Thumos14reduced-Annotations/videoname.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sujoyp/wtalc-pytorch/81ea98264a8456881eb7e98df4872b66d307c6cb/Thumos14reduced-Annotations/videoname.npy -------------------------------------------------------------------------------- /classificationMAP.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | def getAP(conf,labels): 4 | assert len(conf)==len(labels) 5 | sortind = np.argsort(-conf) 6 | tp = labels[sortind]==1; fp = labels[sortind]!=1 7 | npos = np.sum(labels) 8 | 9 | fp = np.cumsum(fp).astype('float32'); tp = np.cumsum(tp).astype('float32') 10 | rec = tp/npos; prec = tp/(fp+tp) 11 | tmp = (labels[sortind]==1).astype('float32') 12 | # AP: mean of the precision values at the ranks of the true positives 13 | return np.sum(tmp*prec)/npos 14 | 15 | def getClassificationMAP(confidence,labels): 16 | ''' confidence and labels are of dimension n_samples x n_label ''' 17 | 18 | AP = [] 19 | for i in range(np.shape(labels)[1]): 20 | AP.append(getAP(confidence[:,i], labels[:,i])) 21 | return 100*sum(AP)/len(AP) 22 | --------------------------------------------------------------------------------
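A quick way to sanity-check getClassificationMAP in isolation (dummy arrays with illustrative shapes, not repository data):

```python
import numpy as np
from classificationMAP import getClassificationMAP

# 5 videos, 3 classes: random confidences against binary multi-labels
confidence = np.random.rand(5, 3)
labels = np.array([[1, 0, 0],
                   [0, 1, 0],
                   [1, 0, 1],
                   [0, 0, 1],
                   [0, 1, 0]], dtype='float32')
print(getClassificationMAP(confidence, labels))  # mAP in percent (0-100)
```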
 /detectionMAP.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import time 3 | from scipy.signal import savgol_filter 4 | import sys 5 | import scipy.io as sio 6 | 7 | def str2ind(categoryname,classlist): 8 | return [i for i in range(len(classlist)) if categoryname==classlist[i]][0] 9 | 10 | def smooth(v): 11 | return v 12 | #l = min(351, len(v)); l = l - (1-l%2) 13 | #if len(v) <= 3: 14 | # return v 15 | #return savgol_filter(v, l, 1) #savgol_filter(v, l, 1) #0.5*(np.concatenate([v[1:],v[-1:]],axis=0) + v) 16 | 17 | def filter_segments(segment_predict, videonames, ambilist, factor): 18 | ind = np.zeros(np.shape(segment_predict)[0]) 19 | for i in range(np.shape(segment_predict)[0]): 20 | vn = videonames[int(segment_predict[i,0])] 21 | for a in ambilist: 22 | if a[0]==vn: 23 | gt = range(int(round(float(a[2])*factor)), int(round(float(a[3])*factor))) 24 | pd = range(int(segment_predict[i][1]),int(segment_predict[i][2])) 25 | IoU = float(len(set(gt).intersection(set(pd))))/float(len(set(gt).union(set(pd)))) 26 | if IoU > 0: 27 | ind[i] = 1 28 | s = [segment_predict[i,:] for i in range(np.shape(segment_predict)[0]) if ind[i]==0] 29 | return np.array(s) 30 | 31 | def getLocMAP(predictions, th, annotation_path, args): 32 | 33 | gtsegments = np.load(annotation_path + '/segments.npy') 34 | gtlabels = np.load(annotation_path + '/labels.npy') 35 | 36 | videoname = np.load(annotation_path + '/videoname.npy'); videoname = np.array([v.decode('utf-8') for v in videoname]) 37 | subset = np.load(annotation_path + '/subset.npy'); subset = np.array([s.decode('utf-8') for s in subset]) 38 | classlist = np.load(annotation_path + '/classlist.npy'); classlist = np.array([c.decode('utf-8') for c in classlist]) 39 | duration = np.load(annotation_path + '/duration.npy') 40 | ambilist = annotation_path + '/Ambiguous_test.txt' 41 | if args.feature_type == 'UNT': 42 | factor = 10.0/4.0 43 | else: 44 | factor = 25.0/16.0 45 | 46 | ambilist = list(open(ambilist,'r')) 47 | ambilist = [a.strip('\n').split(' ') for a in ambilist] 48 | 49 | # keep training gtlabels for plotting 50 | gtltr = [] 51 | for i,s in enumerate(subset): 52 | if subset[i]=='validation' and len(gtsegments[i]): 53 | gtltr.append(gtlabels[i]) 54 | gtlabelstr = gtltr 55 | 56 | # Keep only the test subset annotations 57 | gts, gtl, vn, dn = [], [], [], [] 58 | for i, s in enumerate(subset): 59 | if subset[i]=='test': 60 | gts.append(gtsegments[i]) 61 | gtl.append(gtlabels[i]) 62 | vn.append(videoname[i]) 63 | dn.append(duration[i,0]) 64 | gtsegments = gts 65 | gtlabels = gtl 66 | videoname = vn 67 | duration = dn 68 | 69 | # keep ground truth and predictions for instances with temporal annotations 70 | gts, gtl, vn, pred, dn = [], [], [], [], [] 71 | for i, s in enumerate(gtsegments): 72 | if len(s): 73 | gts.append(gtsegments[i]) 74 | gtl.append(gtlabels[i]) 75 | vn.append(videoname[i]) 76 | pred.append(predictions[i]) 77 | dn.append(duration[i]) 78 | gtsegments = gts 79 | gtlabels = gtl 80 | videoname = vn 81 | predictions = pred 82 | 83 | # which categories have
temporal labels? 84 | templabelcategories = sorted(list(set([l for gtl in gtlabels for l in gtl]))) 85 | 86 | # the number index for those categories. 87 | templabelidx = [] 88 | for t in templabelcategories: 89 | templabelidx.append(str2ind(t,classlist)) 90 | 91 | 92 | # keep detections only for classes whose video-level score exceeds a threshold 93 | predictions_mod = [] 94 | c_score = [] 95 | for p in predictions: 96 | pp = - p; [pp[:,i].sort() for i in range(np.shape(pp)[1])]; pp=-pp # sort each class column in descending order 97 | c_s = np.mean(pp[:int(np.shape(pp)[0]/8),:],axis=0) # mean of the top 1/8 scores per class 98 | ind = c_s > 0.0 99 | c_score.append(c_s) 100 | # zero out the classes whose top-k mean score is not positive 101 | predictions_mod.append(p*ind) 102 | predictions = predictions_mod 103 | 104 | detection_results = [] 105 | for i,vn in enumerate(videoname): 106 | detection_results.append([]) 107 | detection_results[i].append(vn) 108 | 109 | ap = [] 110 | for c in templabelidx: 111 | segment_predict = [] 112 | # Get list of all predictions for class c 113 | for i in range(len(predictions)): 114 | tmp = smooth(predictions[i][:,c]) 115 | threshold = np.max(tmp) - (np.max(tmp) - np.min(tmp))*0.5 116 | vid_pred = np.concatenate([np.zeros(1),(tmp>threshold).astype('float32'),np.zeros(1)], axis=0) 117 | vid_pred_diff = [vid_pred[idt]-vid_pred[idt-1] for idt in range(1,len(vid_pred))] 118 | s = [idk for idk,item in enumerate(vid_pred_diff) if item==1] 119 | e = [idk for idk,item in enumerate(vid_pred_diff) if item==-1] 120 | for j in range(len(s)): 121 | aggr_score = np.max(tmp[s[j]:e[j]]) + 0.7*c_score[i][c] 122 | if e[j]-s[j]>=2: 123 | segment_predict.append([i,s[j],e[j],aggr_score]) 124 | detection_results[i].append([classlist[c], s[j], e[j], aggr_score]) 125 | segment_predict = np.array(segment_predict) 126 | segment_predict = filter_segments(segment_predict, videoname, ambilist, factor) 127 | 128 | # Sort the list of predictions for class c based on score 129 | if len(segment_predict) == 0: 130 | return 0 131 | segment_predict = segment_predict[np.argsort(-segment_predict[:,3])] 132 | 133 | # Create gt list 134 | segment_gt = [[i, gtsegments[i][j][0], gtsegments[i][j][1]] for i in range(len(gtsegments)) for j in range(len(gtsegments[i])) if str2ind(gtlabels[i][j],classlist)==c] 135 | gtpos = len(segment_gt) 136 | 137 | # Compare predictions and gt 138 | tp, fp = [], [] 139 | for i in range(len(segment_predict)): 140 | flag = 0. 141 | for j in range(len(segment_gt)): 142 | if segment_predict[i][0]==segment_gt[j][0]: 143 | gt = range(int(round(segment_gt[j][1]*factor)), int(round(segment_gt[j][2]*factor))) 144 | p = range(int(segment_predict[i][1]),int(segment_predict[i][2])) 145 | IoU = float(len(set(gt).intersection(set(p))))/float(len(set(gt).union(set(p)))) 146 | if IoU >= th: 147 | flag = 1. 148 | del segment_gt[j] 149 | break 150 | tp.append(flag) 151 | fp.append(1.-flag) 152 | tp_c = np.cumsum(tp) 153 | fp_c = np.cumsum(fp) 154 | if sum(tp)==0: 155 | prc = 0. 156 | else: 157 | prc = np.sum((tp_c/(fp_c+tp_c))*tp)/gtpos 158 | ap.append(prc) 159 | 160 | return 100*np.mean(ap) 161 | 162 | 163 | def getDetectionMAP(predictions, annotation_path, args): 164 | iou_list = [0.1, 0.2, 0.3, 0.4, 0.5] 165 | dmap_list = [] 166 | for iou in iou_list: 167 | print('Testing for IoU %f' %iou) 168 | dmap_list.append(getLocMAP(predictions, iou, annotation_path, args)) 169 | 170 | return dmap_list, iou_list 171 | 172 | --------------------------------------------------------------------------------
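A note on units in getLocMAP: ground-truth segments are annotated in seconds and multiplied by `factor` to land in feature-snippet indices, while predicted segments are already in snippet indices; overlap is then a discrete IoU over index ranges. A tiny standalone illustration of that IoU computation (hypothetical numbers, not repository data):

```python
# Discrete IoU over snippet-index ranges, mirroring the computation above
gt = set(range(10, 30))            # ground truth occupies snippets 10..29
pd = set(range(20, 40))            # prediction occupies snippets 20..39
iou = len(gt & pd) / len(gt | pd)  # |intersection| / |union|
print(iou)                         # 10/30 = 0.333...
```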
 /main.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import argparse 3 | import os 4 | import torch 5 | from model import Model 6 | from video_dataset import Dataset 7 | from test import test 8 | from train import train 9 | from tensorboard_logger import Logger 10 | import options 11 | torch.set_default_tensor_type('torch.cuda.FloatTensor') 12 | import torch.optim as optim 13 | 14 | if __name__ == '__main__': 15 | 16 | args = options.parser.parse_args() 17 | torch.manual_seed(args.seed) 18 | device = torch.device("cuda") 19 | 20 | dataset = Dataset(args) 21 | if not os.path.exists('./ckpt/'): 22 | os.makedirs('./ckpt/') 23 | if not os.path.exists('./logs/' + args.model_name): 24 | os.makedirs('./logs/' + args.model_name) 25 | logger = Logger('./logs/' + args.model_name) 26 | 27 | model = Model(dataset.feature_size, dataset.num_class).to(device) 28 | 29 | if args.pretrained_ckpt is not None: 30 | model.load_state_dict(torch.load(args.pretrained_ckpt)) 31 | 32 | optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=0.0005) 33 | 34 | for itr in range(args.max_iter): 35 | train(itr, dataset, args, model, optimizer, logger, device) 36 | if itr % 500 == 0 and itr != 0: 37 | torch.save(model.state_dict(), './ckpt/' + args.model_name + '.pkl') 38 | test(itr, dataset, args, model, logger, device) 39 | 40 | -------------------------------------------------------------------------------- /model.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import torch.nn.init as torch_init 6 | torch.set_default_tensor_type('torch.cuda.FloatTensor') 7 | 8 | def weights_init(m): 9 | classname = m.__class__.__name__ 10 | if classname.find('Conv') != -1 or classname.find('Linear') != -1: 11 | torch_init.xavier_uniform_(m.weight) 12 | m.bias.data.fill_(0) 13 | 14 | 15 | class Model(torch.nn.Module): 16 | def __init__(self, n_feature, n_class): 17 | super(Model, self).__init__() 18 | 19 | self.fc = nn.Linear(n_feature, n_feature) 20 | self.fc1 = nn.Linear(n_feature, n_feature) 21 | self.classifier = nn.Linear(n_feature, n_class) 22 | self.dropout = nn.Dropout(0.7) 23 | 24 | self.apply(weights_init) 25 | 26 | #self.train() 27 | 28 | def forward(self, inputs, is_training=True): 29 | 30 | x = F.relu(self.fc(inputs)) 31 | if is_training: 32 | x = self.dropout(x) 33 | #x = F.relu(self.fc1(x)) 34 | #if is_training: 35 | # x = self.dropout(x) 36 | 37 | 38 | return x, self.classifier(x) 39 | -------------------------------------------------------------------------------- /options.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | parser = argparse.ArgumentParser(description='WTALC') 4 | parser.add_argument('--lr', type=float, default=0.0001, help='learning rate (default: 0.0001)') 5 | 
parser.add_argument('--batch-size', type=int, default=10, help='number of instances in a batch of data (default: 10)') 6 | parser.add_argument('--model-name', default='weakloc', help='name to save model') 7 | parser.add_argument('--pretrained-ckpt', default=None, help='ckpt for pretrained model') 8 | parser.add_argument('--feature-size', type=int, default=2048, help='size of feature (default: 2048)') 9 | parser.add_argument('--num-class', type=int, default=20, help='number of classes (default: 20)') 10 | parser.add_argument('--dataset-name', default='Thumos14reduced', help='dataset to train on (default: Thumos14reduced)') 11 | parser.add_argument('--max-seqlen', type=int, default=750, help='maximum sequence length during training (default: 750)') 12 | parser.add_argument('--Lambda', type=float, default=0.5, help='weight on Co-Activity Loss (default: 0.5)') 13 | parser.add_argument('--num-similar', type=int, default=3, help='number of similar pairs in a batch of data (default: 3)') 14 | parser.add_argument('--seed', type=int, default=1, help='random seed (default: 1)') 15 | parser.add_argument('--max-iter', type=int, default=100000, help='maximum iteration to train (default: 100000)') 16 | parser.add_argument('--feature-type', type=str, default='I3D', help='type of feature to be used, I3D or UNT (default: I3D)') 17 | 18 | 19 | -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | import torch.optim as optim 4 | from model import Model 5 | from video_dataset import Dataset 6 | from tensorboard_logger import log_value 7 | import utils 8 | import numpy as np 9 | from torch.autograd import Variable 10 | from classificationMAP import getClassificationMAP as cmAP 11 | from detectionMAP import getDetectionMAP as dmAP 12 | import scipy.io as sio 13 | torch.set_default_tensor_type('torch.cuda.FloatTensor') 14 | 15 | def test(itr, dataset, args, model, logger, device): 16 | 17 | done = False 18 | instance_logits_stack = [] 19 | element_logits_stack = [] 20 | labels_stack = [] 21 | while not done: 22 | if dataset.currenttestidx % 100 == 0: 23 | print('Testing test data point %d of %d' %(dataset.currenttestidx, len(dataset.testidx))) 24 | 25 | features, labels, done = dataset.load_data(is_training=False) 26 | features = torch.from_numpy(features).float().to(device) 27 | 28 | with torch.no_grad(): 29 | _, element_logits = model(Variable(features), is_training=False) 30 | tmp = F.softmax(torch.mean(torch.topk(element_logits, k=int(np.ceil(len(features)/8)), dim=0)[0], dim=0), dim=0).cpu().data.numpy() 31 | element_logits = element_logits.cpu().data.numpy() 32 | 33 | instance_logits_stack.append(tmp) 34 | element_logits_stack.append(element_logits) 35 | labels_stack.append(labels) 36 | 37 | instance_logits_stack = np.array(instance_logits_stack) 38 | labels_stack = np.array(labels_stack) 39 | 40 | dmap, iou = dmAP(element_logits_stack, dataset.path_to_annotations, args) 41 | 42 | if args.dataset_name == 'Thumos14': 43 | test_set = sio.loadmat('test_set_meta.mat')['test_videos'][0] 44 | for i in range(np.shape(labels_stack)[0]): 45 | if test_set[i]['background_video'] == 'YES': 46 | labels_stack[i,:] = np.zeros_like(labels_stack[i,:]) 47 | 48 | cmap = cmAP(instance_logits_stack, labels_stack) 49 | print('Classification map %f' %cmap) 50 | print('Detection map @ %f = %f' %(iou[0], dmap[0])) 51 | print('Detection map @ %f = %f' %(iou[1], dmap[1])) 52 | print('Detection map @ %f = %f' 
%(iou[2], dmap[2])) 53 | print('Detection map @ %f = %f' %(iou[3], dmap[3])) 54 | print('Detection map @ %f = %f' %(iou[4], dmap[4])) 55 | 56 | logger.log_value('Test Classification mAP', cmap, itr) 57 | for item in list(zip(dmap,iou)): 58 | logger.log_value('Test Detection mAP @ IoU = ' + str(item[1]), item[0], itr) 59 | 60 | utils.write_to_file(args.dataset_name, dmap, cmap, itr) 61 | 62 | -------------------------------------------------------------------------------- /test_set_meta.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sujoyp/wtalc-pytorch/81ea98264a8456881eb7e98df4872b66d307c6cb/test_set_meta.mat -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | import torch.optim as optim 4 | from model import Model 5 | from video_dataset import Dataset 6 | from tensorboard_logger import log_value 7 | import utils 8 | import numpy as np 9 | from torch.autograd import Variable 10 | from classificationMAP import getClassificationMAP as cmAP 11 | import time 12 | torch.set_default_tensor_type('torch.cuda.FloatTensor') 13 | 14 | 15 | def MILL(element_logits, seq_len, batch_size, labels, device): 16 | ''' element_logits should be torch tensor of dimension (B, n_element, n_class), 17 | seq_len should be a numpy array of dimension (B,) giving the number of valid elements per video; the top k = ceil(seq_len/8) logits are averaged per class, 18 | labels should be a numpy array of dimension (B, n_class) of 1 or 0 19 | returns the scalar multiple-instance learning loss ''' 20 | 21 | k = np.ceil(seq_len/8).astype('int32') 22 | labels = labels / torch.sum(labels, dim=1, keepdim=True) 23 | instance_logits = torch.zeros(0).to(device) 24 | for i in range(batch_size): 25 | tmp, _ = torch.topk(element_logits[i][:seq_len[i]], k=int(k[i]), dim=0) 26 | instance_logits = torch.cat([instance_logits, torch.mean(tmp, 0, keepdim=True)], dim=0) 27 | milloss = -torch.mean(torch.sum(Variable(labels) * F.log_softmax(instance_logits, dim=1), dim=1), dim=0) 28 | return milloss 29 | 30 | def CASL(x, element_logits, seq_len, n_similar, labels, device): 31 | ''' x is the torch tensor of features from the last layer of the model, of dimension (B, n_element, n_feature), 32 | element_logits should be torch tensor of dimension (B, n_element, n_class), 33 | seq_len should be numpy array of dimension (B,), 34 | labels should be a numpy array of dimension (B, n_class) of 1 or 0 ''' 35 | 36 | sim_loss = 0. 37 | n_tmp = 0. 
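# Batches are built so that videos (2i, 2i+1) share at least one class label (see video_dataset.load_data). # In the loop below, Hf1/Hf2 aggregate each video's features under high attention (softmaxed logits over time) # and Lf1/Lf2 under the complementary low attention; d1, d2, d3 are cosine distances, and the hinge terms # enforce a margin of 0.5 between the within-pair high-attention distance and the cross high/low distances. 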
38 | for i in range(0, n_similar*2, 2): 39 | atn1 = F.softmax(element_logits[i][:seq_len[i]], dim=0) 40 | atn2 = F.softmax(element_logits[i+1][:seq_len[i+1]], dim=0) 41 | 42 | n1 = torch.FloatTensor([np.maximum(seq_len[i]-1, 1)]).to(device) 43 | n2 = torch.FloatTensor([np.maximum(seq_len[i+1]-1, 1)]).to(device) 44 | Hf1 = torch.mm(torch.transpose(x[i][:seq_len[i]], 1, 0), atn1) 45 | Hf2 = torch.mm(torch.transpose(x[i+1][:seq_len[i+1]], 1, 0), atn2) 46 | Lf1 = torch.mm(torch.transpose(x[i][:seq_len[i]], 1, 0), (1 - atn1)/n1) 47 | Lf2 = torch.mm(torch.transpose(x[i+1][:seq_len[i+1]], 1, 0), (1 - atn2)/n2) 48 | 49 | d1 = 1 - torch.sum(Hf1*Hf2, dim=0) / (torch.norm(Hf1, 2, dim=0) * torch.norm(Hf2, 2, dim=0)) 50 | d2 = 1 - torch.sum(Hf1*Lf2, dim=0) / (torch.norm(Hf1, 2, dim=0) * torch.norm(Lf2, 2, dim=0)) 51 | d3 = 1 - torch.sum(Hf2*Lf1, dim=0) / (torch.norm(Hf2, 2, dim=0) * torch.norm(Lf1, 2, dim=0)) 52 | 53 | sim_loss = sim_loss + 0.5*torch.sum(torch.max(d1-d2+0.5, torch.FloatTensor([0.]).to(device))*Variable(labels[i,:])*Variable(labels[i+1,:])) 54 | sim_loss = sim_loss + 0.5*torch.sum(torch.max(d1-d3+0.5, torch.FloatTensor([0.]).to(device))*Variable(labels[i,:])*Variable(labels[i+1,:])) 55 | n_tmp = n_tmp + torch.sum(Variable(labels[i,:])*Variable(labels[i+1,:])) 56 | sim_loss = sim_loss / n_tmp 57 | return sim_loss 58 | 59 | 60 | def train(itr, dataset, args, model, optimizer, logger, device): 61 | 62 | features, labels = dataset.load_data(n_similar=args.num_similar) 63 | seq_len = np.sum(np.max(np.abs(features), axis=2) > 0, axis=1) 64 | features = features[:,:np.max(seq_len),:] 65 | 66 | features = torch.from_numpy(features).float().to(device) 67 | labels = torch.from_numpy(labels).float().to(device) 68 | 69 | final_features, element_logits = model(Variable(features)) 70 | 71 | milloss = MILL(element_logits, seq_len, args.batch_size, labels, device) 72 | casloss = CASL(final_features, element_logits, seq_len, args.num_similar, labels, device) 73 | 74 | total_loss = args.Lambda * milloss + (1-args.Lambda) * casloss 75 | 76 | logger.log_value('milloss', milloss, itr) 77 | logger.log_value('casloss', casloss, itr) 78 | logger.log_value('total_loss', total_loss, itr) 79 | 80 | print('Iteration: %d, Loss: %.3f' %(itr, total_loss.data.cpu().numpy())) 81 | 82 | optimizer.zero_grad() 83 | total_loss.backward() 84 | #torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) 85 | optimizer.step() 86 | 87 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | def str2ind(categoryname,classlist): 4 | return [i for i in range(len(classlist)) if categoryname==classlist[i].decode('utf-8')][0] 5 | 6 | def strlist2indlist(strlist, classlist): 7 | return [str2ind(s,classlist) for s in strlist] 8 | 9 | def strlist2multihot(strlist, classlist): 10 | return np.sum(np.eye(len(classlist))[strlist2indlist(strlist,classlist)], axis=0) 11 | 12 | def idx2multihot(id_list,num_class): 13 | return np.sum(np.eye(num_class)[id_list], axis=0) 14 | 15 | def random_extract(feat, t_max): 16 | r = np.random.randint(len(feat)-t_max) 17 | return feat[r:r+t_max] 18 | 19 | def pad(feat, min_len): 20 | if np.shape(feat)[0] <= min_len: 21 | return np.pad(feat, ((0,min_len-np.shape(feat)[0]), (0,0)), mode='constant', constant_values=0) 22 | else: 23 | return feat 24 | 25 | def process_feat(feat, length): 26 | if len(feat) > length: 27 | return random_extract(feat, 
length) 28 | else: 29 | return pad(feat, length) 30 | 31 | def write_to_file(dname, dmap, cmap, itr): 32 | fid = open(dname + '-results.log', 'a+') 33 | string_to_write = str(itr) 34 | for item in dmap: 35 | string_to_write += ' ' + '%.2f' %item 36 | string_to_write += ' ' + '%.2f' %cmap 37 | fid.write(string_to_write + '\n') 38 | fid.close() 39 | 40 | -------------------------------------------------------------------------------- /video_dataset.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import glob 3 | import utils 4 | import time 5 | 6 | class Dataset(): 7 | def __init__(self, args): 8 | self.dataset_name = args.dataset_name 9 | self.num_class = args.num_class 10 | self.feature_size = args.feature_size 11 | self.path_to_features = '%s-%s-JOINTFeatures.npy' %(args.dataset_name, args.feature_type) 12 | self.path_to_annotations = self.dataset_name + '-Annotations/' 13 | self.features = np.load(self.path_to_features, encoding='bytes') 14 | self.segments = np.load(self.path_to_annotations + 'segments.npy') 15 | self.labels = np.load(self.path_to_annotations + 'labels_all.npy') # Specific to Thumos14 16 | self.classlist = np.load(self.path_to_annotations + 'classlist.npy') 17 | self.subset = np.load(self.path_to_annotations + 'subset.npy') 18 | self.batch_size = args.batch_size 19 | self.t_max = args.max_seqlen 20 | self.trainidx = [] 21 | self.testidx = [] 22 | self.classwiseidx = [] 23 | self.currenttestidx = 0 24 | self.labels_multihot = [utils.strlist2multihot(labs,self.classlist) for labs in self.labels] 25 | 26 | self.train_test_idx() 27 | self.classwise_feature_mapping() 28 | 29 | 30 | def train_test_idx(self): 31 | for i, s in enumerate(self.subset): 32 | if s.decode('utf-8') == 'validation': # Specific to Thumos14 33 | self.trainidx.append(i) 34 | else: 35 | self.testidx.append(i) 36 | 37 | def classwise_feature_mapping(self): 38 | for category in self.classlist: 39 | idx = [] 40 | for i in self.trainidx: 41 | for label in self.labels[i]: 42 | if label == category.decode('utf-8'): 43 | idx.append(i); break 44 | self.classwiseidx.append(idx) 45 | 46 | 47 | def load_data(self, n_similar=3, is_training=True): 48 | if is_training: 49 | features = [] 50 | labels = [] 51 | idx = [] 52 | 53 | # Load pairs of videos that share at least one class label 54 | rand_classid = np.random.choice(len(self.classwiseidx), size=n_similar) 55 | for rid in rand_classid: 56 | rand_sampleid = np.random.choice(len(self.classwiseidx[rid]), size=2) 57 | idx.append(self.classwiseidx[rid][rand_sampleid[0]]) 58 | idx.append(self.classwiseidx[rid][rand_sampleid[1]]) 59 | 60 | # Load the remaining videos of the batch at random 61 | rand_sampleid = np.random.choice(len(self.trainidx), size=self.batch_size-2*n_similar) 62 | for r in rand_sampleid: 63 | idx.append(self.trainidx[r]) 64 | 65 | return np.array([utils.process_feat(self.features[i], self.t_max) for i in idx]), np.array([self.labels_multihot[i] for i in idx]) 66 | 67 | else: 68 | labs = self.labels_multihot[self.testidx[self.currenttestidx]] 69 | feat = self.features[self.testidx[self.currenttestidx]] 70 | 71 | if self.currenttestidx == len(self.testidx)-1: 72 | done = True; self.currenttestidx = 0 73 | else: 74 | done = False; self.currenttestidx += 1 75 | 76 | return np.array(feat), np.array(labs), done 77 | 78 | --------------------------------------------------------------------------------
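To see how the pieces fit together without the feature files on disk, here is a minimal smoke test of the model and the top-k pooling used in train.py/test.py. It is a sketch only - the sizes are the options.py defaults, the input is random, and a CUDA device is required because model.py sets the default tensor type to cuda:

```python
import numpy as np
import torch
import torch.nn.functional as F
from model import Model

model = Model(2048, 20)            # feature size and class count from the options.py defaults
features = torch.rand(750, 2048)   # one video: 750 snippets of 2048-d features (random stand-in)

x, element_logits = model(features, is_training=False)  # element_logits: (750, 20)

# Top-k MIL pooling as in test.py: average the k = ceil(T/8) highest logits per class,
# then softmax over classes to get a video-level score distribution.
k = int(np.ceil(features.shape[0] / 8))
video_scores = F.softmax(torch.topk(element_logits, k=k, dim=0)[0].mean(dim=0), dim=0)
print(video_scores.shape)          # torch.Size([20])
```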