├── .gitignore ├── LICENSE ├── README.md ├── code ├── TW_FINCH │ ├── __pycache__ │ │ └── finch.cpython-38.pyc │ ├── bestMap.m │ ├── compute_clustScores.m │ ├── confusionmat.m │ ├── eval_fs.m │ ├── evaluate.m │ ├── finch.py │ ├── findgroups.m │ ├── hungarian.m │ └── main.py ├── helpers │ ├── Split_Videos │ │ ├── README.md │ │ ├── driver.py │ │ ├── make_splits.py │ │ └── parse_subtitles.py │ └── Trim_Intro │ │ ├── concat_subs.py │ │ ├── concat_video.sh │ │ ├── concat_videos.py │ │ ├── cut_intro.py │ │ ├── cut_intro.sh │ │ └── cut_srt.py └── lecture_aware_embds │ ├── args.py │ ├── avlectures_dataloader.py │ ├── eval.py │ ├── extract_feats.py │ ├── loss.py │ ├── loss_ce.py │ ├── loss_milnce.py │ ├── loss_mms.py │ ├── metrics.py │ ├── model.py │ ├── model_ef.py │ ├── stop_words.py │ ├── train.py │ ├── video_feature_extractor │ ├── LICENSE │ ├── README.md │ ├── create_feature_csv.py │ ├── create_feature_csv_indi.py │ ├── create_feature_csv_seg.py │ ├── create_pickle.py │ ├── create_pickle_indi.py │ ├── create_pickle_ocr.py │ ├── create_pickle_prevnext.py │ ├── create_pickle_seg2.py │ ├── create_pickle_seg2.sh │ ├── create_pickle_seg2_55.sh │ ├── create_pickle_seg2_mp.py │ ├── create_pickle_seg2_mp_55.sh │ ├── create_pickle_seg2_mp_92.sh │ ├── create_pickle_seg3.py │ ├── create_pickle_segmentation.py │ ├── create_pkl_tst.py │ ├── extract.py │ ├── extract_features_2d_indi.sh │ ├── extract_features_3d_indi.sh │ ├── helper_pkl.py │ ├── lec_list.py │ ├── merge_and_bert.py │ ├── merge_and_bert.sh │ ├── merge_and_bert_mp.py │ ├── model.py │ ├── ocr_bert_pickle.py │ ├── preprocessing.py │ ├── random_sequence_shuffler.py │ ├── readme.txt │ ├── video_loader.py │ └── videocnn │ │ ├── .gitignore │ │ ├── .opts.py.swp │ │ ├── LICENSE │ │ ├── README.md │ │ ├── class_names_list │ │ ├── classify.py │ │ ├── dataset.py │ │ ├── generate_result_video │ │ ├── README.md │ │ ├── SourceSansPro-Regular.ttf │ │ └── generate_result_video.py │ │ ├── input │ │ ├── main.py │ │ ├── mean.py │ │ ├── model.py │ │ ├── models │ │ ├── densenet.py │ │ ├── pre_act_resnet.py │ │ ├── resnet.py │ │ ├── resnext.py │ │ └── wide_resnet.py │ │ ├── opts.py │ │ ├── spatial_transforms.py │ │ ├── temporal_transforms.py │ │ ├── test.py │ │ ├── train.py │ │ └── validation.py │ └── we_embd.py └── figures └── AVLectures_stats.jpg /.gitignore: -------------------------------------------------------------------------------- 1 | *.npy 2 | *.pickle 3 | *.pth 4 | *.pkl 5 | 6 | # Byte-compiled / optimized / DLL files 7 | __pycache__/ 8 | *.py[cod] 9 | *$py.class 10 | 11 | # C extensions 12 | *.so 13 | 14 | # Distribution / packaging 15 | .Python 16 | build/ 17 | develop-eggs/ 18 | dist/ 19 | downloads/ 20 | eggs/ 21 | .eggs/ 22 | lib/ 23 | lib64/ 24 | parts/ 25 | sdist/ 26 | var/ 27 | wheels/ 28 | share/python-wheels/ 29 | *.egg-info/ 30 | .installed.cfg 31 | *.egg 32 | MANIFEST 33 | 34 | # PyInstaller 35 | # Usually these files are written by a python script from a template 36 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
37 | *.manifest 38 | *.spec 39 | 40 | # Installer logs 41 | pip-log.txt 42 | pip-delete-this-directory.txt 43 | 44 | # Unit test / coverage reports 45 | htmlcov/ 46 | .tox/ 47 | .nox/ 48 | .coverage 49 | .coverage.* 50 | .cache 51 | nosetests.xml 52 | coverage.xml 53 | *.cover 54 | .hypothesis/ 55 | .pytest_cache/ 56 | 57 | # Translations 58 | *.mo 59 | *.pot 60 | 61 | # Django stuff: 62 | *.log 63 | local_settings.py 64 | db.sqlite3 65 | 66 | # Flask stuff: 67 | instance/ 68 | .webassets-cache 69 | 70 | # Scrapy stuff: 71 | .scrapy 72 | 73 | # Sphinx documentation 74 | docs/_build/ 75 | 76 | # PyBuilder 77 | target/ 78 | 79 | # Jupyter Notebook 80 | code/lecture_aware_embds/.ipynb_checkpoints 81 | 82 | # IPython 83 | profile_default/ 84 | ipython_config.py 85 | 86 | # pyenv 87 | .python-version 88 | 89 | # celery beat schedule file 90 | celerybeat-schedule 91 | 92 | # SageMath parsed files 93 | *.sage.py 94 | 95 | # Environments 96 | .env 97 | .venv 98 | env/ 99 | venv/ 100 | ENV/ 101 | env.bak/ 102 | venv.bak/ 103 | 104 | # Spyder project settings 105 | .spyderproject 106 | .spyproject 107 | 108 | # Rope project settings 109 | .ropeproject 110 | 111 | # mkdocs documentation 112 | /site 113 | 114 | # mypy 115 | .mypy_cache/ 116 | .dmypy.json 117 | dmypy.json 118 | 119 | # Pyre type checker 120 | .pyre/ 121 | 122 | .prof 123 | 124 | code/lecture_aware_embds/test.py 125 | code/lecture_aware_embds/test_dataloader.py 126 | 127 | code/lecture_aware_embds/video_feature_extractor/model/ 128 | code/lecture_aware_embds/video_feature_extractor/extract_features_2d.sh 129 | code/lecture_aware_embds/video_feature_extractor/extract_features_3d.sh 130 | code/lecture_aware_embds/video_feature_extractor/create_pickle.sh 131 | code/lecture_aware_embds/video_feature_extractor/slurm-*.out 132 | 133 | code/lecture_aware_embds/*.sh 134 | code/lecture_aware_embds/slurm-*.out 135 | code/lecture_aware_embds/*.ipynb 136 | -------------------------------------------------------------------------------- /code/TW_FINCH/__pycache__/finch.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Darshansingh11/AVLectures/d5452d90d29961f28a89c5d1ff7bef88c3f66ca0/code/TW_FINCH/__pycache__/finch.cpython-38.pyc -------------------------------------------------------------------------------- /code/TW_FINCH/bestMap.m: -------------------------------------------------------------------------------- 1 | function [newL2] = bestMap(L1,L2) 2 | %bestmap: permute labels of L2 to match L1 as good as possible 3 | % [newL2] = bestMap(L1,L2); 4 | % 5 | % version 2.0 --May/2007 6 | % version 1.0 --November/2003 7 | % 8 | % Written by Deng Cai (dengcai AT gmail.com) 9 | 10 | 11 | %=========== 12 | 13 | L1 = L1(:); 14 | L2 = L2(:); 15 | if size(L1) ~= size(L2) 16 | error('size(L1) must == size(L2)'); 17 | end 18 | 19 | Label1 = unique(L1); 20 | nClass1 = length(Label1); 21 | Label2 = unique(L2); 22 | nClass2 = length(Label2); 23 | 24 | nClass = max(nClass1,nClass2); 25 | G = zeros(nClass); 26 | for i=1:nClass1 27 | for j=1:nClass2 28 | G(i,j) = length(find((L1 == Label1(i)) & (L2 == Label2(j)))); 29 | end 30 | end 31 | 32 | %% Compute Hungarian Match Matrix 33 | [c,t] = hungarian(-G); 34 | 35 | newL2 = zeros(size(L2)); 36 | 37 | for i=1:length(Label2) 38 | if c(i) <= nClass1 39 | newL2(L2 == Label2(i)) = Label1(c(i)); 40 | else 41 | newL2(L2 == Label2(i)) = c(i); 42 | end 43 | end 44 | -------------------------------------------------------------------------------- 
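The bestMap.m routine above remaps the predicted cluster labels onto the ground-truth labels via a Hungarian assignment over their overlap counts. A minimal Python sketch of the same idea is given below; it assumes NumPy and SciPy are available and is not part of the repository (which ships only the MATLAB version together with hungarian.m):

import numpy as np
from scipy.optimize import linear_sum_assignment

def best_map(labels_true, labels_pred):
    # Permute predicted cluster labels so they agree with the ground-truth
    # labels as much as possible (Hungarian assignment on overlap counts).
    labels_true = np.asarray(labels_true).ravel()
    labels_pred = np.asarray(labels_pred).ravel()
    true_ids = np.unique(labels_true)
    pred_ids = np.unique(labels_pred)
    n = max(len(true_ids), len(pred_ids))
    overlap = np.zeros((n, n), dtype=np.int64)
    for i, t in enumerate(true_ids):
        for j, p in enumerate(pred_ids):
            overlap[i, j] = np.sum((labels_true == t) & (labels_pred == p))
    # linear_sum_assignment minimises cost, so negate to maximise total overlap.
    row_ind, col_ind = linear_sum_assignment(-overlap)
    mapping = {pred_ids[j]: true_ids[i]
               for i, j in zip(row_ind, col_ind)
               if i < len(true_ids) and j < len(pred_ids)}
    return np.array([mapping.get(p, p) for p in labels_pred])

For example, best_map([1, 1, 2, 2], [5, 5, 7, 7]) returns array([1, 1, 2, 2]).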
/code/TW_FINCH/compute_clustScores.m: -------------------------------------------------------------------------------- 1 | function [fscore, precision, recall]=compute_clustScores(confusionMat) 2 | 3 | % confusionMat=confusionmat(true_label,predicted_label); 4 | 5 | acc = trace(confusionMat)/sum(confusionMat(:)); 6 | 7 | recall = diag(confusionMat)./sum(confusionMat,2); 8 | 9 | precision = diag(confusionMat)./sum(confusionMat,1)'; 10 | 11 | f1Scores = 2*(precision.*recall)./(precision +recall); 12 | 13 | precision(isnan(precision))=0; 14 | recall(isnan(recall))=0; 15 | f1Scores(isnan(f1Scores))=0; 16 | 17 | 18 | precision =mean(precision); 19 | recall= mean(recall); 20 | fscore = mean(f1Scores); 21 | 22 | end 23 | 24 | 25 | -------------------------------------------------------------------------------- /code/TW_FINCH/eval_fs.m: -------------------------------------------------------------------------------- 1 | function [mof, iou, fscore, res] = eval_fs(label_pre, label_gt, datasets_path) 2 | % Evaluates 50Salads dataset in Eval Mode 3 | 4 | % compute Hungarian based accuracy and IOU score 5 | res = bestMap(label_gt(:), label_pre(:)); 6 | 7 | label_gt = fs_eval_mode_map(label_gt, datasets_path); 8 | res = fs_eval_mode_map(res, datasets_path); 9 | 10 | 11 | 12 | mof = length(find(label_gt(:) == res(:)))/length(label_gt(:)); 13 | 14 | k = length(unique(res)); 15 | % compute IOU 16 | try 17 | iou= jaccard(categorical(label_gt), categorical(res)); 18 | catch 19 | iou= jaccard(label_gt, res); 20 | end 21 | % penalize under/over clustering equally in iou 22 | iou(isnan(iou))=0; 23 | iou= sum(iou)/ k; 24 | 25 | % compute fscore (compute_clustScores expects a confusion matrix) 26 | [fscore, ~, ~]=compute_clustScores(confusionmat(label_gt(:), res(:))); 27 | 28 | 29 | function labs = fs_eval_mode_map(labs, datasets_path) 30 | mapping_path = fullfile(datasets_path, '50Salads', 'mapping'); 31 | map=readtable(fullfile(mapping_path, 'mapping.txt')); 32 | label_str = map.Var2(labs); 33 | 34 | map_val=readtable(fullfile(mapping_path, 'mappingeval.txt')); 35 | map2=table([1:numel(map_val.Var2)]', 'RowNames', map_val.Var2); 36 | mapped_label = table2array(map2(label_str,1)); 37 | grp = map_val.Var1 +1; 38 | labs = grp(mapped_label); 39 | end 40 | end -------------------------------------------------------------------------------- /code/TW_FINCH/evaluate.m: -------------------------------------------------------------------------------- 1 | function [mof, iou] = evaluate(label_pre, label_gt, res, iou) 2 | % compute Hungarian based accuracy and IOU score 3 | % res = bestMap(label_gt(:), label_pre(:)); 4 | mof = length(find(label_gt(:) == res(:)))/length(label_gt(:)); 5 | 6 | k = length(unique(label_pre)); 7 | % % compute IOU 8 | % try 9 | % iou= jaccard(categorical(label_gt), categorical(res)); 10 | % catch 11 | % iou= jaccard(label_gt, res); 12 | % end 13 | % % penalize under/over clustering equally in iou 14 | iou(isnan(iou))=0; 15 | iou= sum(iou)/ k; 16 | % compute fscore 17 | % [fscore, ~, ~]=compute_clustScores(label_gt, res); 18 | 19 | end -------------------------------------------------------------------------------- /code/TW_FINCH/findgroups.m: -------------------------------------------------------------------------------- 1 | function [gnums, varargout] = findgroups(varargin) 2 | %FINDGROUPS Find groups and return group numbers 3 | % G = FINDGROUPS(A) returns G, a vector of group numbers created from the 4 | % grouping variable A. G contains integer values from 1 to N, indicating 5 | % N distinct groups for the N unique values in A.
6 | % 7 | % A is a categorical, numeric, logical, string, datetime, duration, 8 | % or calendarDuration vector, or a cell array of character vectors. 9 | % G has the same length as A. 10 | % 11 | % [G,ID] = FINDGROUPS(A) also returns ID, a vector of the N unique values 12 | % that identify each group in A. ID has the same type as A. 13 | % 14 | % [G,ID1,ID2,...] = FINDGROUPS(A1,A2,...) returns group numbers created 15 | % from one or more grouping variables A1,A2,... . Each group is defined 16 | % by a unique combination of values across A1,A2,... . 17 | % [ID1(J),ID2(J),...] contains the values that define the J-th group. 18 | % 19 | % [G,TID] = FINDGROUPS(T) returns group numbers created from the 20 | % variables in the table T. The length of G equals the number of rows of 21 | % T. Each group is defined by a unique combination of values in the rows 22 | % of T. TID is a table where TID(J,:) contains the values that define the 23 | % J-th group. 24 | % 25 | % FINDGROUPS returns NaNs for corresponding missing elements in A. 26 | % Examples of missing elements are: 27 | % * NaN in a double array 28 | % * '' in a cell array of character vectors 29 | % * Any element that displays as , without quotes 30 | % For more information on missing elements type "help ismissing". 31 | % 32 | % Examples: 33 | % % Load patients data. 34 | % % List Weight, Gender, and Smoker variables for patients. 35 | % load patients; 36 | % whos Weight Gender Smoker 37 | % 38 | % % Find the mean weights by gender. 39 | % G = findgroups(Gender); 40 | % Y = splitapply(@mean,Weight,G) 41 | % 42 | % % Find the median weights by gender. Create a table containing the 43 | % % results. 44 | % [G,gender] = findgroups(Gender); 45 | % medianWeight = splitapply(@median,Weight,G) 46 | % results = table(gender,medianWeight) 47 | % 48 | % % Find the mean weights for all four groups of patients. 49 | % G = findgroups(Gender,Smoker); 50 | % Y = splitapply(@mean,Weight,G) 51 | % 52 | % % Find the mean weights for the four groups of patients. Create a table 53 | % % containing the results. 54 | % [G,gender,smoker] = findgroups(Gender,Smoker); 55 | % meanWeight = splitapply(@mean,Weight,G); 56 | % results = table(gender,smoker,meanWeight) 57 | % 58 | % % Read power loss data into a table. 59 | % % Find the maximum power loss in each region and by cause of power 60 | % % outage. Specify the grouping variables using table indexing. 61 | % % Return the maximum power losses in a table. 62 | % T = readtable('outages.csv'); 63 | % summary(T) 64 | % [G,powerLoss] = findgroups(T(:,{'Region','Cause'})); 65 | % powerLoss.maxLoss = splitapply(@max,T.Loss,G) 66 | % 67 | % See also SPLITAPPLY, UNIQUE, ISMEMBER, ISMISSING 68 | 69 | % Copyright 2015-2019 The MathWorks, Inc. 70 | 71 | narginchk(1,inf); 72 | nargoutchk(0, nargin+1); 73 | 74 | % Parse inputs into grouping variables. 
Remember which grouping variables 75 | % come from table input and the corresponding variable names 76 | [groupVars, outVarIdx, tOutTemplate] = parseInput(varargin); 77 | 78 | % Call into the grouping helper function used by groupsummary 79 | inclnan = false; 80 | inclempty = false; 81 | if nargout <=1 82 | gnums = matlab.internal.math.mgrp2idx(groupVars,0,inclnan,inclempty); 83 | if isrow(groupVars{1}) 84 | gnums = gnums'; 85 | end 86 | return; 87 | else 88 | [gnums,~,gnames] = matlab.internal.math.mgrp2idx(groupVars,0,inclnan,inclempty); 89 | if isrow(groupVars{1}) 90 | gnums = gnums'; 91 | gnames = cellfun(@transpose,gnames,'UniformOutput',false); 92 | end 93 | end 94 | 95 | % Build output for group IDs 96 | varargout = cell(1, nargin); 97 | for i = 1:nargin 98 | if istable(tOutTemplate{i}) 99 | gnames_i = gnames(outVarIdx{i}); 100 | varargout{i} = table(gnames_i{:}); 101 | varargout{i}.Properties = tOutTemplate{i}.Properties; 102 | else 103 | varargout(i) = gnames(outVarIdx{i}); 104 | end 105 | end 106 | end 107 | 108 | %------------------------------------------------------------------------------- 109 | function [groupVars, outVarIdx, tOutTemplate] = parseInput(userInput) 110 | % ParseInput Extract grouping variables from user inputs. 111 | % [GROUPVARS, OUTVARIDX, TOUTTEMPLATE] = PARSEINPUT(USERINPUT) extracts 112 | % into GROUPVARS a cell array of grouping variables from USERINPUT. 113 | % PARSEINPUT does not verify types of USERINPUT. Variables in table 114 | % entries in USERINPUT are extracted as individual grouping variables; 115 | % non-table entries are treated as grouping variables on their own. 116 | % 117 | % OUTVARIDX is a cell array of indices the same length as USERINPUT. Each 118 | % cell contains indices into GROUPVARS. These indices indicate which 119 | % grouping variables in GROUPVARS correspond to each element in 120 | % USERINPUT. Cells of OUTVARIDX that correspond to table entries in 121 | % USERINPUT have the same number of indices as there are variables in the 122 | % table. 123 | % 124 | % TOUTTEMPLATE is a cell array the same length as USERINPUT. Cells that 125 | % corresponds to table entries in USERINPUT contain a 0-by-N sub-table 126 | % where N is the number of variables in that table. Cells that 127 | % corresponds to non-table entries in USERINPUT will be empty. 
128 | 129 | % Total number of grouping variables equal sum of non-table inputs and 130 | % total number of variables across all table inputs 131 | isTabularInput = cellfun(@matlab.internal.datatypes.istabular, userInput); 132 | nGrpVars = sum(cellfun(@width, userInput(isTabularInput))) + sum(~isTabularInput); 133 | 134 | groupVars = cell(1, nGrpVars); 135 | outVarIdx = cell(size(userInput)); 136 | tOutTemplate = cell(size(userInput)); 137 | 138 | % Extract grouping variables from userInput 139 | groupVarIdx = 0; % loop invariant: number of grouping variable already extracted 140 | for i = 1:length(userInput) 141 | if isTabularInput(i) 142 | t = userInput{i}; 143 | if istimetable(t), t = timetable2table(t,'ConvertRowTimes',false); end 144 | varIndices = groupVarIdx + (1:width(t)); 145 | for k = 1:numel(varIndices) 146 | groupVars{varIndices(k)} = t{:,k}; 147 | end 148 | tOutTemplate{i} = t([],:); 149 | tOutTemplate{i}.Properties.RowNames = {}; % clear rowNames for use as output template 150 | else 151 | varIndices = groupVarIdx + 1; 152 | groupVars(varIndices) = userInput(i); 153 | end 154 | 155 | outVarIdx{i} = varIndices; 156 | 157 | % Update loop invariant: number of grouping variable already extracted 158 | groupVarIdx = groupVarIdx + length(outVarIdx{i}); 159 | end 160 | 161 | if isempty(groupVars) 162 | throwAsCaller(MException(message('MATLAB:findgroups:GroupingVarNotVector'))); 163 | end 164 | 165 | end 166 | -------------------------------------------------------------------------------- /code/TW_FINCH/main.py: -------------------------------------------------------------------------------- 1 | from finch import FINCH 2 | import numpy as np 3 | import os 4 | import cv2 5 | import sys 6 | import pickle as pkl 7 | from tqdm import tqdm 8 | 9 | def check_clusters(cluster): 10 | prev_cluster = cluster[0] 11 | for c in cluster: 12 | if (c == prev_cluster) or (c == prev_cluster + 1): 13 | prev_cluster = c 14 | else: 15 | return False 16 | 17 | return True 18 | 19 | # learned lecture aware embeddings on which you want to perform clustering using TW-FINCH 20 | a = pkl.load(open('/ssd_scratch/cvit/darshan_2/seg_embds/2d3dOCR_ss_test50ft50.pkl', 'rb')) 21 | b = pkl.load(open('/home2/darshan.singh/combined_stats.pkl', 'rb')) 22 | 23 | d = {} 24 | cnt = 0 25 | cnt_2, cnt_3, cnt_4 = 0, 0, 0 26 | 27 | for course in a: 28 | course_cnt = 0 29 | for lec in a[course]: 30 | course_cnt += 1 31 | print("For course", course, "->", course_cnt) 32 | 33 | lst_alpha = [] 34 | 35 | for course in tqdm(list(a.keys())): 36 | d[course] = {} 37 | for lec in a[course]: 38 | cnt += 1 39 | clusters = b[course][lec]['num_segments'] 40 | vid_emb = a[course][lec]['vid_embd'] 41 | txt_emb = a[course][lec]['text_embd'] 42 | vidtext_emb = np.hstack((vid_emb, txt_emb)) 43 | d[course][lec] = {} 44 | 45 | _, _, vid_clusters = FINCH(vid_emb, req_clust=clusters, tw_finch=True, alpha = 1) 46 | alpha = 1 47 | while not check_clusters(vid_clusters): 48 | _, _, vid_clusters = FINCH(vid_emb, req_clust=clusters, tw_finch=True, alpha = alpha) 49 | alpha += 0.1 50 | if alpha > 5: 51 | lst_alpha.append(alpha) 52 | 53 | if not check_clusters(vid_clusters): 54 | cnt_2 += 1 55 | 56 | _, _, txt_clusters = FINCH(txt_emb, req_clust=clusters, tw_finch=True, alpha = 1) 57 | loop_counter = 0 58 | alpha = 1 59 | while not check_clusters(txt_clusters): 60 | _, _, txt_clusters = FINCH(txt_emb, req_clust=clusters, tw_finch=True, alpha = alpha) 61 | alpha += 0.1 62 | loop_counter += 1 63 | if alpha > 5: 64 | lst_alpha.append(alpha) 65 | 66 | if not 
check_clusters(txt_clusters): 67 | cnt_3 += 1 68 | 69 | _, _, vidtxt_clusters = FINCH(vidtext_emb, req_clust=clusters, tw_finch=True, alpha = 1) 70 | 71 | alpha = 1 72 | while not check_clusters(vidtxt_clusters): 73 | _, _, vidtxt_clusters = FINCH(vidtext_emb, req_clust=clusters, tw_finch=True, alpha = alpha) 74 | alpha += 0.1 75 | if alpha > 5: 76 | lst_alpha.append(alpha) 77 | 78 | if not check_clusters(vidtxt_clusters): 79 | cnt_4 += 1 80 | 81 | vid_clusters = vid_clusters.reshape(-1) 82 | txt_clusters = txt_clusters.reshape(-1) 83 | vidtxt_clusters = vidtxt_clusters.reshape(-1) 84 | d[course][lec]['vid_clusters'] = vid_clusters 85 | d[course][lec]['txt_clusters'] = txt_clusters 86 | d[course][lec]['vidtxt_clusters'] = vidtxt_clusters 87 | 88 | print(cnt, len(lst_alpha), cnt_2, cnt_3, cnt_4) 89 | print(lst_alpha) 90 | 91 | clusters = d.copy() 92 | lecs_excluded = {} 93 | 94 | for course in clusters: 95 | for lec in clusters[course]: 96 | for t in clusters[course][lec]: 97 | lst = [] 98 | for i in clusters[course][lec][t]: 99 | lst.append(i) 100 | prev_cluster = lst[0] 101 | for i, c in enumerate(lst): 102 | if (c == prev_cluster) or (c == prev_cluster + 1): 103 | prev_cluster = c 104 | else: 105 | if course not in lecs_excluded: 106 | lecs_excluded[course] = [] 107 | if lec not in lecs_excluded[course]: 108 | lecs_excluded[course].append(lec) 109 | break 110 | 111 | for course in lecs_excluded: 112 | for lec in lecs_excluded[course]: 113 | print(course, '->', lec) 114 | 115 | from os.path import join 116 | 117 | # Path where you want to save the predicted clusters 118 | with open(join('/ssd_scratch/cvit/darshan_2/clusters/2d3dOCR_ss_test50ft50.pkl'), 'wb') as handle: 119 | pkl.dump(d, handle, protocol=pkl.HIGHEST_PROTOCOL) -------------------------------------------------------------------------------- /code/helpers/Split_Videos/README.md: -------------------------------------------------------------------------------- 1 | # Split the videos using the following steps 2 | 3 | **Step 1** 4 | Execute the `driver.py` program. To specify the path of the DataSubset, use the optional `--base_dir` argument; the default path is `/ssd_scratch/cvit/AVLectures/DataSubset`. To specify the minimum and maximum duration of the splits, use the `--min_time` and `--max_time` arguments respectively (defaults: `min_time=7` seconds and `max_time=15` seconds). After this program runs, the following is created inside `base_dir`: 5 | 1. Inside each course directory, a folder called `splits_vid` is created, which contains the splits of all the videos (see the example command below).
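For example, to run Step 1 with the default values: `python driver.py --base_dir /ssd_scratch/cvit/AVLectures/DataSubset --min_time 7 --max_time 15`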
6 | 7 | 8 | -------------------------------------------------------------------------------- /code/helpers/Split_Videos/driver.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import subprocess 3 | import os 4 | 5 | from os.path import join 6 | from glob import glob 7 | 8 | import make_splits as ms 9 | import parse_subtitles as ps 10 | 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument("-b", "--base_dir", type=str, required=False, help="Path of DataSubset", default = '/ssd_scratch/cvit/AVLectures/DataSubset') 13 | parser.add_argument("-x", "--min_time", type=int, required=False, help="Minimum time (in seconds)", default = 7) 14 | parser.add_argument("-y", "--max_time", type=int, required=False, help="Maximum time (in seconds)", default = 15) 15 | 16 | args = parser.parse_args() 17 | 18 | base_dir = args.base_dir 19 | min_time = args.min_time 20 | max_time = args.max_time 21 | 22 | print("Base Directory: ", base_dir) 23 | print("Min time = ", min_time, "seconds") 24 | print("Max time = ", max_time, "seconds") 25 | 26 | dir_list = [] 27 | 28 | for dir in glob(join(base_dir, '*')): 29 | if 'mit' in dir: 30 | dir_list.append(dir) 31 | 32 | dir_list.sort() 33 | print(dir_list) 34 | 35 | for dir in dir_list: 36 | 37 | print("Inside Directory - ", dir) 38 | 39 | base_dir = dir 40 | 41 | # step 1 42 | 43 | p = ps.ParseSubtitle(base_dir = base_dir, min_time = min_time, max_time = max_time) 44 | 45 | srt_files = [] 46 | 47 | for fl in glob(join(base_dir, 'subtitles', '*')): 48 | if fl.endswith('.srt'): 49 | srt_files.append(fl) 50 | 51 | for fl in srt_files: 52 | print(fl) 53 | p.parse(fl) 54 | 55 | # step 2 56 | 57 | p.combine() 58 | 59 | # step 3 60 | 61 | m = ms.SplitVideo(base_dir = base_dir) 62 | m._split() 63 | 64 | print("Done Successfully") 65 | -------------------------------------------------------------------------------- /code/helpers/Split_Videos/make_splits.py: -------------------------------------------------------------------------------- 1 | from multiprocessing.connection import wait 2 | import subprocess, traceback 3 | import time 4 | import os 5 | from os.path import join 6 | import multiprocessing as mp 7 | from concurrent.futures import ThreadPoolExecutor, as_completed, ProcessPoolExecutor 8 | from tqdm import tqdm 9 | from glob import glob 10 | import threading 11 | 12 | import signal 13 | 14 | class SplitVideo(): 15 | 16 | # c = 0 17 | 18 | def __init__(self, base_dir): 19 | 20 | self.base_dir = base_dir 21 | self.delimiter = "@#@" 22 | 23 | def process_cmd(self, cmd): 24 | try: 25 | subprocess.call(cmd, shell=True) 26 | except KeyboardInterrupt: 27 | exit(0) 28 | except: 29 | traceback.print_exc() 30 | 31 | def _split(self): 32 | 33 | os.makedirs(join(self.base_dir, 'splits_vid'), exist_ok=True) 34 | 35 | CMD_ffmpeg = [] 36 | with open(join(self.base_dir, 'combined.txt'), 'r') as f: 37 | lines = f.readlines() 38 | for l in lines: 39 | name, _ , start, end = l.strip().split(self.delimiter) 40 | 41 | start = start.replace(',', '.') 42 | end = end.replace(',', '.') 43 | 44 | 45 | video_file = "" 46 | videofile_name = join(self.base_dir, "splits_vid", name) 47 | 48 | name = '-'.join(name.split('-')[:-1]) 49 | 50 | video_file = join(self.base_dir, 'videos', name + '.mp4') 51 | 52 | cmd1 = 'ffmpeg -hide_banner -loglevel error -ss {} -to {} -i {} -strict -2 {} -y'.format( 53 | start, end, video_file, videofile_name 54 | ) 55 | 56 | CMD_ffmpeg.append(cmd1) 57 | 58 | p = ThreadPoolExecutor(20) 59 | 60 | futures 
= [p.submit(self.process_cmd, j) for j in CMD_ffmpeg] 61 | _ = [r.result() for r in tqdm(as_completed(futures), total=len(futures))] -------------------------------------------------------------------------------- /code/helpers/Split_Videos/parse_subtitles.py: -------------------------------------------------------------------------------- 1 | import os 2 | from os.path import join 3 | import argparse 4 | import ast 5 | from glob import glob 6 | 7 | class ParseSubtitle(): 8 | 9 | def __init__(self, base_dir=None, min_time=7, max_time=15): 10 | 11 | self.base_dir = os.curdir if base_dir is None else base_dir 12 | self.min_time = min_time 13 | self.max_time = max_time 14 | self.delimiter = "@#@" 15 | 16 | def getTime(self, t): 17 | h, m, sms = t.split(":") 18 | if ',' in sms: # Example t = '00:00:03,980' 19 | s, ms = sms.split(",") 20 | elif '.' in sms: # Example t = '00:00:03.980' 21 | s, ms = sms.split(".") 22 | else: # Example t = '00:00:03' 23 | s = sms 24 | ms = '0' 25 | tm = 3600 * int(h) + 60 * int(m) + int(s) + int(ms.ljust(3, '0'))/1000 26 | return tm 27 | 28 | def toFFMPEGtime(self, t): 29 | ss, ms = divmod(t*1000, 1000) 30 | mm, ss = divmod(ss, 60) 31 | hh, mm = divmod(mm, 60) 32 | 33 | return "{:02d}:{:02d}:{:02d}.{:03d}".format(int(hh), int(mm), int(ss), int(ms)) 34 | 35 | def parse(self, filename): 36 | 37 | filename_split = filename.split('/') 38 | base_dir, filename = "/".join(filename_split[:-1]), filename_split[-1] 39 | 40 | name = filename.replace('.srt', '') 41 | 42 | outfile = join(base_dir, name + "_parsed.txt") 43 | 44 | sub_file = open(outfile, 'w') 45 | self.lines = [] 46 | with open(str(join(base_dir, filename)), 'r') as f: 47 | lines = f.readlines() 48 | 49 | row, st, en = "", "", "" 50 | start, num = 0, 0 51 | st_prev = "" 52 | en_prev = "" 53 | 54 | for line in lines: 55 | l = line.strip() 56 | l = l.replace(self.delimiter, '') # newly added 57 | if "-->" in l: 58 | start = 1 59 | row = "" 60 | tm = l.split(" ") 61 | st, en = tm[0].strip(), tm[2].strip() 62 | elif l != "": 63 | row += l + " " 64 | else: 65 | row += self.delimiter + st + self.delimiter + en 66 | if start: 67 | row = "{}-{:05d}.mp4{}{}\n".format(name, num, self.delimiter, row) 68 | self.lines.append(row) 69 | sub_file.write(row) 70 | num += 1 71 | start = 0 72 | st_prev = st 73 | en_prev = en 74 | 75 | if((st != st_prev or en == en_prev) and (st == en_prev)): 76 | row += self.delimiter + st + self.delimiter + en 77 | if start: 78 | row = "{}-{:05d}.mp4{}{}\n".format(name, num, self.delimiter, row) 79 | self.lines.append(row) 80 | sub_file.write(row) 81 | num += 1 82 | start = 0 83 | 84 | sub_file.close() 85 | self.merge(name, base_dir) 86 | 87 | def merge(self, yid, base_dir): 88 | 89 | outfile = join(base_dir, yid + "_merged.txt") 90 | sub_file = open(outfile, 'w') 91 | 92 | s = '00:00:00.000' 93 | make_start = 1 94 | gtLessmintime = False 95 | ll = [] 96 | tm = [] 97 | cnt = 0 98 | for line in self.lines: 99 | 100 | #row = line.strip().split('|') 101 | row = line.strip().split(self.delimiter) 102 | 103 | vid_id ,start, end = row[0], row[2], row[3] 104 | 105 | vid_id = "-".join(vid_id.split('-')[:-1]) 106 | 107 | if make_start: 108 | s = start 109 | 110 | gt = self.getTime(end) - self.getTime(s) 111 | 112 | if self.min_time <= gt <= self.max_time: 113 | gtLessmintime = False 114 | ll.append(row[1]) 115 | tm.append([row[1], row[2], row[3]]) 116 | sen = " ".join(ll) 117 | 118 | sub_file.write('{1}-{2:05d}.mp4{0}{3}{0}{4}{0}{5}\n'.format(self.delimiter, vid_id, cnt, sen, tm[0][1], tm[-1][2])) 119 | 
cnt += 1 120 | make_start = 1 121 | ll = [] 122 | tm = [] 123 | elif gt < self.min_time: 124 | gtLessmintime = True 125 | 126 | make_start = 0 127 | ll.append(row[1]) 128 | tm.append([row[1], row[2], row[3]]) 129 | 130 | else: # if gt > self.max_time 131 | gtLessmintime = False 132 | ll.append(row[1]) 133 | tm.append([row[1], row[2], row[3]]) 134 | sen = " ".join(ll) 135 | 136 | sub_file.write('{1}-{2:05d}.mp4{0}{3}{0}{4}{0}{5}\n'.format(self.delimiter, vid_id, cnt, sen, tm[0][1], tm[-1][2])) 137 | cnt += 1 138 | make_start = 1 139 | ll = [] 140 | tm = [] 141 | 142 | if gtLessmintime: 143 | sen = " ".join(ll) 144 | sub_file.write('{1}-{2:05d}.mp4{0}{3}{0}{4}{0}{5}\n'.format(self.delimiter, vid_id, cnt, sen, tm[0][1], tm[-1][2])) 145 | 146 | sub_file.close() 147 | 148 | def combine(self): 149 | 150 | combined_file = open(join(self.base_dir, 'combined.txt'), 'w') 151 | for fl in glob(join(self.base_dir, 'subtitles', '*_merged.txt')): 152 | 153 | with open(fl, 'r') as f: 154 | lines = f.readlines() 155 | for line in lines: 156 | combined_file.write(line) 157 | 158 | combined_file.close() -------------------------------------------------------------------------------- /code/helpers/Trim_Intro/concat_subs.py: -------------------------------------------------------------------------------- 1 | import subprocess, traceback 2 | import os 3 | import argparse 4 | 5 | import cv2 6 | 7 | import pickle as pkl 8 | 9 | from os.path import join 10 | from glob import glob 11 | from pathlib import Path 12 | from tqdm import tqdm 13 | 14 | def get_length(filename): 15 | result = subprocess.run(["ffprobe", "-v", "error", "-show_entries", 16 | "format=duration", "-of", 17 | "default=noprint_wrappers=1:nokey=1", filename], 18 | stdout=subprocess.PIPE, 19 | stderr=subprocess.STDOUT) 20 | return float(result.stdout) 21 | 22 | def get_length_cv2(vid_path): 23 | video = cv2.VideoCapture(vid_path) 24 | frame_count = video.get(cv2.CAP_PROP_FRAME_COUNT) 25 | fps = video.get(cv2.CAP_PROP_FPS) 26 | return round(frame_count / fps, 2) 27 | 28 | def getTime(t): 29 | h, m, sms = t.split(":") 30 | if ',' in sms: # Example t = '00:00:03,980' 31 | s, ms = sms.split(",") 32 | elif '.' 
in sms: # Example t = '00:00:03.980' 33 | s, ms = sms.split(".") 34 | else: # Example t = '00:00:03' 35 | s = sms 36 | ms = '0' 37 | tm = 3600 * int(h) + 60 * int(m) + int(s) + int(ms.ljust(3, '0'))/1000 38 | return tm 39 | 40 | def toFFMPEGtime(t): 41 | ss, ms = divmod(t*1000, 1000) 42 | mm, ss = divmod(ss, 60) 43 | hh, mm = divmod(mm, 60) 44 | 45 | return "{:02d}:{:02d}:{:02d},{:03d}".format(int(hh), int(mm), int(ss), int(ms)) 46 | 47 | 48 | def concat_subtitles(course, out_sub_filename, sub_files): 49 | num_sub_files = len(sub_files) 50 | offset = 0 51 | cnt = 1 52 | 53 | for sub in sub_files: 54 | with open(join(course, 'subtitles', sub), 'r') as orig_sub, open(join(course, 'concatenated_subtitles', out_sub_filename), 'a') as concat_sub: 55 | lines = orig_sub.readlines() 56 | endHere = False 57 | last_arrow_idx = None 58 | for i, line in reversed(list(enumerate(lines))): 59 | l = line.strip() 60 | if '-->' in l: 61 | last_arrow_idx = i 62 | break 63 | 64 | for i, line in enumerate(lines): 65 | l = line.strip() 66 | 67 | if i < len(lines) - 1: 68 | next_line = lines[i + 1].strip() 69 | 70 | if '-->' in next_line: 71 | concat_sub.write('{}\n'.format(str(cnt))) 72 | cnt += 1 73 | endHere = False 74 | continue 75 | 76 | if '-->' in l: 77 | ts = l.split(' ') 78 | st, en = ts[0].strip(), ts[2].strip() 79 | st_shifted = toFFMPEGtime(getTime(st) + offset) 80 | en_shifted = toFFMPEGtime(getTime(en) + offset) 81 | 82 | if i == last_arrow_idx: 83 | last_arrow_time = toFFMPEGtime(get_length_cv2(join(course, 'videos', sub.replace('.srt', '.mp4'))) + offset) 84 | concat_sub.write('{} --> {}\n'.format(st_shifted, last_arrow_time)) 85 | else: 86 | concat_sub.write('{} --> {}\n'.format(st_shifted, en_shifted)) 87 | endHere = True 88 | else: 89 | concat_sub.write('{}'.format(line)) 90 | endHere = False 91 | 92 | if endHere: 93 | concat_sub.write('\n') 94 | 95 | offset += get_length_cv2(join(course, 'videos', sub.replace('.srt', '.mp4'))) 96 | 97 | sub_files = ['MIT8_01F16_L19v01_360p.srt', 'MIT8_01F16_L19v02_360p.srt', 'MIT8_01F16_L19v03_360p.srt', 98 | 'MIT8_01F16_L19v04_360p.srt', 'MIT8_01F16_L19v05_360p.srt', 'MIT8_01F16_L19v06_360p.srt', 99 | 'MIT8_01F16_L19v07_360p.srt'] 100 | 101 | concat_subtitles('/ssd_scratch/cvit/darshan/dataset_MITOCW_v1/mit032', 'L19.srt', sub_files) -------------------------------------------------------------------------------- /code/helpers/Trim_Intro/concat_video.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH -A $USER 3 | 4 | module load ffmpeg/4.4.1 5 | 6 | echo "Concat started" 7 | 8 | python concat_videos.py 9 | 10 | echo "DONE successfully" 11 | -------------------------------------------------------------------------------- /code/helpers/Trim_Intro/concat_videos.py: -------------------------------------------------------------------------------- 1 | import subprocess, traceback 2 | import os 3 | import argparse 4 | import pickle as pkl 5 | 6 | from os.path import join 7 | from glob import glob 8 | from pathlib import Path 9 | from tqdm import tqdm 10 | import cv2 11 | 12 | from collections import OrderedDict 13 | from concurrent.futures import ThreadPoolExecutor, as_completed 14 | 15 | base_dir = '/ssd_scratch/cvit/darshan/dataset_MITOCW_v1' 16 | 17 | num_workers = 20 18 | 19 | delimiter = '@@' 20 | 21 | def get_length(filename): 22 | result = subprocess.run(["ffprobe", "-v", "error", "-show_entries", 23 | "format=duration", "-of", 24 | "default=noprint_wrappers=1:nokey=1", filename], 25 | 
stdout=subprocess.PIPE, 26 | stderr=subprocess.STDOUT) 27 | return float(result.stdout) 28 | 29 | def get_length_cv2(vid_path): 30 | video = cv2.VideoCapture(vid_path) 31 | frame_count = video.get(cv2.CAP_PROP_FRAME_COUNT) 32 | fps = video.get(cv2.CAP_PROP_FPS) 33 | return round(frame_count / fps, 3) 34 | 35 | def getTime(t): 36 | h, m, sms = t.split(":") 37 | if ',' in sms: # Example t = '00:00:03,980' 38 | s, ms = sms.split(",") 39 | elif '.' in sms: # Example t = '00:00:03.980' 40 | s, ms = sms.split(".") 41 | else: # Example t = '00:00:03' 42 | s = sms 43 | ms = '0' 44 | tm = 3600 * int(h) + 60 * int(m) + int(s) + int(ms.ljust(3, '0'))/1000 45 | return tm 46 | 47 | def toFFMPEGtime(t): 48 | ss, ms = divmod(t*1000, 1000) 49 | mm, ss = divmod(ss, 60) 50 | hh, mm = divmod(mm, 60) 51 | 52 | return "{:02d}:{:02d}:{:02d},{:03d}".format(int(hh), int(mm), int(ss), int(ms)) 53 | 54 | def process_cmd(cmd): 55 | try: 56 | subprocess.call(cmd, shell=True) 57 | except KeyboardInterrupt: 58 | exit(0) 59 | except: 60 | traceback.print_exc() 61 | 62 | def concat_subtitles(course, out_sub_filename, sub_files): 63 | num_sub_files = len(sub_files) 64 | offset = 0 65 | cnt = 1 66 | 67 | for sub in sub_files: 68 | with open(join(course, 'subtitles', sub), 'r') as orig_sub, open(join(course, 'concatenated_subtitles', out_sub_filename), 'a') as concat_sub: 69 | lines = orig_sub.readlines() 70 | endHere = False 71 | last_arrow_idx = None 72 | for i, line in reversed(list(enumerate(lines))): 73 | l = line.strip() 74 | if '-->' in l: 75 | last_arrow_idx = i 76 | break 77 | 78 | for i, line in enumerate(lines): 79 | l = line.strip() 80 | 81 | if i < len(lines) - 1: 82 | next_line = lines[i + 1].strip() 83 | 84 | if '-->' in next_line: 85 | concat_sub.write('{}\n'.format(str(cnt))) 86 | cnt += 1 87 | endHere = False 88 | continue 89 | 90 | if '-->' in l: 91 | ts = l.split(' ') 92 | st, en = ts[0].strip(), ts[2].strip() 93 | st_shifted = toFFMPEGtime(getTime(st) + offset) 94 | en_shifted = toFFMPEGtime(getTime(en) + offset) 95 | 96 | if i == last_arrow_idx: 97 | last_arrow_time = toFFMPEGtime(get_length_cv2(join(course, 'videos', sub.replace('.srt', '.mp4'))) + offset) 98 | concat_sub.write('{} --> {}\n'.format(st_shifted, last_arrow_time)) 99 | else: 100 | concat_sub.write('{} --> {}\n'.format(st_shifted, en_shifted)) 101 | endHere = True 102 | else: 103 | concat_sub.write('{}'.format(line)) 104 | endHere = False 105 | 106 | if endHere: 107 | concat_sub.write('\n') 108 | 109 | offset += get_length_cv2(join(course, 'videos', sub.replace('.srt', '.mp4'))) 110 | 111 | course_list = [] 112 | 113 | for course in glob(join(base_dir, '*')): 114 | if 'mit' in course: 115 | course_list.append(course) 116 | 117 | course_list.sort() 118 | 119 | for course in course_list: 120 | Path(join(course, 'concatenated_videos')).mkdir(parents=True, exist_ok=True) 121 | Path(join(course, 'concatenated_subtitles')).mkdir(parents=True, exist_ok=True) 122 | 123 | course_name = course.split('/')[-1] 124 | print("Inside course - ", course_name) 125 | 126 | 127 | cmd_list = [] 128 | 129 | with open(join(Path.home(), 'Segmentation', 'segments', 'lecname', course_name + '.txt')) as f: 130 | lines = f.readlines() 131 | 132 | course_stats = OrderedDict() 133 | 134 | for line in lines: 135 | l = line.strip() 136 | segment_name = l.split(delimiter)[0] 137 | lec_name = l.split(delimiter)[-1] 138 | 139 | if lec_name not in course_stats: 140 | course_stats[lec_name] = [] 141 | course_stats[lec_name].append(segment_name) 142 | 143 | for lec_name in 
course_stats: 144 | 145 | num_segments = len(course_stats[lec_name]) 146 | 147 | # Usual concat command 148 | concat_cmd = "ffmpeg -hide_banner -loglevel error" 149 | for i in range(num_segments): 150 | concat_cmd += " -i {}".format(join(course, 'videos', course_stats[lec_name][i])) 151 | concat_cmd += ' -filter_complex "' 152 | for i in range(num_segments): 153 | concat_cmd += "[{0}:v] [{0}:a] ".format(i) 154 | concat_cmd += 'concat=n={}:v=1:a=1 [v] [a]"'.format(num_segments) 155 | concat_cmd += ' -map "[v]" -map "[a]" {}'.format(join(course, 'concatenated_videos', lec_name + '.mp4')) 156 | 157 | 158 | sub_list = [] 159 | for i in range(num_segments): 160 | sub_list.append(course_stats[lec_name][i].replace('.mp4', '.srt')) 161 | 162 | concat_subtitles(course, lec_name + '.srt', sub_list) 163 | 164 | cmd_list.append(concat_cmd) 165 | 166 | p = ThreadPoolExecutor(num_workers) 167 | 168 | futures = [p.submit(process_cmd, j) for j in cmd_list] 169 | _ = [r.result() for r in tqdm(as_completed(futures), total=len(futures))] 170 | 171 | 172 | print("DONE SUCCESSFULLY") -------------------------------------------------------------------------------- /code/helpers/Trim_Intro/cut_intro.py: -------------------------------------------------------------------------------- 1 | import subprocess, traceback 2 | import os 3 | import argparse 4 | 5 | import pickle as pkl 6 | 7 | from os.path import join 8 | from glob import glob 9 | from pathlib import Path 10 | from tqdm import tqdm 11 | 12 | from concurrent.futures import ThreadPoolExecutor, as_completed 13 | 14 | base_dir = '/ssd_scratch/cvit/darshan/dataset_MITOCW_v1' 15 | 16 | num_workers = 30 17 | 18 | def getTime(t): 19 | h, m, sms = t.split(":") 20 | if ',' in sms: # Example t = '00:00:03,980' 21 | s, ms = sms.split(",") 22 | elif '.' 
in sms: # Example t = '00:00:03.980' 23 | s, ms = sms.split(".") 24 | else: # Example t = '00:00:03' 25 | s = sms 26 | ms = '0' 27 | tm = 3600 * int(h) + 60 * int(m) + int(s) + int(ms.ljust(3, '0'))/1000 28 | return tm 29 | 30 | def toFFMPEGtime(t): 31 | ss, ms = divmod(t*1000, 1000) 32 | mm, ss = divmod(ss, 60) 33 | hh, mm = divmod(mm, 60) 34 | 35 | return "{:02d}:{:02d}:{:02d},{:03d}".format(int(hh), int(mm), int(ss), int(ms)) 36 | 37 | def trim_subtitles(course, sub_file, start_ts, end_ts): 38 | 39 | with open(join(course, 'subtitles', sub_file), 'r') as orig_sub, open(join(course, 'trimmed_subtitles', sub_file), 'w') as trimmed_sub: 40 | start_copy = False 41 | end_copy = False 42 | count = 1 43 | 44 | lines = orig_sub.readlines() 45 | for i, line in enumerate(lines): 46 | l = line.strip() 47 | 48 | if '-->' in l and not start_copy: 49 | ts = l.split(' ') 50 | st, en = ts[0].strip(), ts[2].strip() 51 | if (getTime(st) - start_ts < 0) and (getTime(en) - start_ts <= 0): 52 | start_copy = False 53 | else: 54 | start_copy = True 55 | trimmed_sub.write('{}\n'.format(str(count))) 56 | count += 1 57 | 58 | if getTime(st) != start_ts: 59 | trimmed_sub.write('{} --> {}\n'.format('00:00:00,000', toFFMPEGtime(abs(getTime(en) - start_ts)))) 60 | continue 61 | 62 | 63 | if start_copy and not end_copy: 64 | 65 | if i < len(lines) - 1: 66 | next_line = lines[i + 1].strip() 67 | if '-->' in next_line: 68 | next_line_st = next_line.split(' ')[0] 69 | if getTime(next_line_st) >= end_ts: 70 | end_copy = True 71 | return None 72 | trimmed_sub.write('{}\n'.format(str(count))) 73 | count += 1 74 | continue 75 | 76 | if '-->' in l: 77 | ts = l.split(' ') 78 | st, en = ts[0].strip(), ts[2].strip() 79 | 80 | st_shifted = toFFMPEGtime(getTime(st) - start_ts) 81 | en_shifted = toFFMPEGtime(getTime(en) - start_ts) 82 | trimmed_sub.write('{} --> {}\n'.format(st_shifted, en_shifted)) 83 | else: 84 | trimmed_sub.write(line) 85 | 86 | 87 | def process_cmd(cmd): 88 | try: 89 | subprocess.call(cmd, shell=True) 90 | except KeyboardInterrupt: 91 | exit(0) 92 | except: 93 | traceback.print_exc() 94 | 95 | course_list = [] 96 | 97 | for course in glob(join(base_dir, '*')): 98 | if 'mit' in course: 99 | course_list.append(course) 100 | 101 | course_list.sort() 102 | 103 | for course in course_list: 104 | print(course) 105 | 106 | for course in course_list: 107 | 108 | Path(join(course, 'trimmed_videos')).mkdir(parents=True, exist_ok=True) 109 | Path(join(course, 'trimmed_subtitles')).mkdir(parents=True, exist_ok=True) 110 | 111 | course_name = course.split('/')[-1] 112 | print("Inside course - ", course_name) 113 | 114 | 115 | cmd_list = [] 116 | 117 | pkl_file = pkl.load(open(join(Path.home(), 'Segmentation', 'segments', 'stats', course_name + '.pkl'), 'rb')) 118 | 119 | for lec_name in pkl_file: 120 | vid_name = lec_name + '.mp4' 121 | sub_file = lec_name + '.srt' 122 | start_ts = pkl_file[lec_name]['st'] 123 | end_ts = pkl_file[lec_name]['en'] 124 | 125 | trim_cmd = 'ffmpeg -hide_banner -loglevel error -i {} -ss {} -to {} -strict -2 {} -y'.format( 126 | join(course, 'videos', vid_name), start_ts, end_ts, join(course, 'trimmed_videos', vid_name) 127 | ) 128 | 129 | trim_subtitles(course, sub_file, int(start_ts), int(end_ts)) 130 | 131 | cmd_list.append(trim_cmd) 132 | 133 | p = ThreadPoolExecutor(num_workers) 134 | 135 | futures = [p.submit(process_cmd, j) for j in cmd_list] 136 | _ = [r.result() for r in tqdm(as_completed(futures), total=len(futures))] 137 | 138 | 139 | print("DONE SUCCESSFULLY") 
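# Note on the input format (an assumption inferred from the code above, not
# documented in the repository): each per-course stats pickle loaded from
# ~/Segmentation/segments/stats/<course_name>.pkl is expected to map
# lecture_name -> {'st': start_time_in_seconds, 'en': end_time_in_seconds};
# these values are passed to ffmpeg via -ss/-to and, cast to int, to trim_subtitles().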
-------------------------------------------------------------------------------- /code/helpers/Trim_Intro/cut_intro.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH -A $USER 3 | 4 | module load ffmpeg/4.4.1 5 | 6 | echo "Trimming started" 7 | 8 | python cut_intro.py 9 | 10 | echo "DONE successfully" 11 | -------------------------------------------------------------------------------- /code/helpers/Trim_Intro/cut_srt.py: -------------------------------------------------------------------------------- 1 | import subprocess, traceback 2 | import os 3 | import argparse 4 | 5 | from os.path import join 6 | from glob import glob 7 | from pathlib import Path 8 | from tqdm import tqdm 9 | 10 | from concurrent.futures import ThreadPoolExecutor, as_completed 11 | 12 | base_dir = '/ssd_scratch/cvit/darshan/dataset_MITOCW_v1' 13 | 14 | def getTime(t): 15 | h, m, sms = t.split(":") 16 | if ',' in sms: # Example t = '00:00:03,980' 17 | s, ms = sms.split(",") 18 | elif '.' in sms: # Example t = '00:00:03.980' 19 | s, ms = sms.split(".") 20 | else: # Example t = '00:00:03' 21 | s = sms 22 | ms = '0' 23 | tm = 3600 * int(h) + 60 * int(m) + int(s) + int(ms.ljust(3, '0'))/1000 24 | return tm 25 | 26 | def toFFMPEGtime(t): 27 | ss, ms = divmod(t*1000, 1000) 28 | mm, ss = divmod(ss, 60) 29 | hh, mm = divmod(mm, 60) 30 | 31 | return "{:02d}:{:02d}:{:02d},{:03d}".format(int(hh), int(mm), int(ss), int(ms)) 32 | 33 | def trim_subtitles(course, sub_file, start_ts, end_ts): 34 | 35 | with open(join(course, 'subtitles', sub_file), 'r') as orig_sub, open(join(course, 'trimmed_subtitles', sub_file), 'w') as trimmed_sub: 36 | start_copy = False 37 | end_copy = False 38 | count = 1 39 | 40 | lines = orig_sub.readlines() 41 | for i, line in enumerate(lines): 42 | l = line.strip() 43 | 44 | if '-->' in l and not start_copy: 45 | ts = l.split(' ') 46 | st, en = ts[0].strip(), ts[2].strip() 47 | if (getTime(st) - start_ts < 0) and (getTime(en) - start_ts <= 0): 48 | start_copy = False 49 | else: 50 | start_copy = True 51 | trimmed_sub.write('{}\n'.format(str(count))) 52 | count += 1 53 | 54 | if getTime(st) != start_ts: 55 | trimmed_sub.write('{} --> {}\n'.format('00:00:00,000', toFFMPEGtime(abs(getTime(en) - start_ts)))) 56 | continue 57 | 58 | 59 | if start_copy and not end_copy: 60 | 61 | if i < len(lines) - 1: 62 | next_line = lines[i + 1].strip() 63 | if '-->' in next_line: 64 | next_line_st = next_line.split(' ')[0] 65 | if getTime(next_line_st) >= end_ts: 66 | end_copy = True 67 | return None 68 | trimmed_sub.write('{}\n'.format(str(count))) 69 | count += 1 70 | continue 71 | 72 | if '-->' in l: 73 | ts = l.split(' ') 74 | st, en = ts[0].strip(), ts[2].strip() 75 | 76 | st_shifted = toFFMPEGtime(getTime(st) - start_ts) 77 | en_shifted = toFFMPEGtime(getTime(en) - start_ts) 78 | trimmed_sub.write('{} --> {}\n'.format(st_shifted, en_shifted)) 79 | else: 80 | trimmed_sub.write(line) 81 | 82 | 83 | 84 | course_list = [] 85 | 86 | for course in glob(join(base_dir, '*')): 87 | if 'mit' in course: 88 | course_list.append(course) 89 | 90 | course_list.sort() 91 | 92 | for course in course_list: 93 | print(course) 94 | 95 | trim_subtitles('/ssd_scratch/cvit/darshan/dataset_MITOCW_v1/mit001', 'ocw-18.01-f07-lec01_300k.srt', 22, 120) -------------------------------------------------------------------------------- /code/lecture_aware_embds/args.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | def 
get_args(description='Text-Video'): 4 | parser = argparse.ArgumentParser(description=description) 5 | parser.add_argument( 6 | '--train_csv', 7 | type=str, 8 | default='data/v1.csv', 9 | help='train csv') 10 | parser.add_argument( 11 | '--features_path_2D', 12 | type=str, 13 | default='feature_2d', 14 | help='feature path for 2D features') 15 | parser.add_argument( 16 | '--features_path_3D', 17 | type=str, 18 | default='feature_3d', 19 | help='feature path for 3D features') 20 | parser.add_argument( 21 | '--caption_path', 22 | type=str, 23 | default='data/caption.pickle', 24 | help='caption pickle file path') 25 | parser.add_argument( 26 | '--word2vec_path', 27 | type=str, 28 | default='data/GoogleNews-vectors-negative300.bin', 29 | help='word embedding path') 30 | parser.add_argument( 31 | '--word2vec', 32 | dest='word2vec', 33 | action='store_true', 34 | help='If you want to use word2vec embeddings') 35 | parser.add_argument( 36 | '--BERT', 37 | dest='word2vec', 38 | action='store_false', 39 | help='If you want to use BERT embeddings') 40 | parser.set_defaults(word2vec=True) 41 | parser.add_argument( 42 | '--BERT_train_path', 43 | type=str, 44 | default='', 45 | help='BERT embeddings path of training data') 46 | parser.add_argument( 47 | '--BERT_val_path', 48 | type=str, 49 | default='', 50 | help='BERT embeddings path of validation data') 51 | parser.add_argument( 52 | '--pretrain_path', 53 | type=str, 54 | default='', 55 | help='pre train model path') 56 | parser.add_argument( 57 | '--checkpoint_dir', 58 | type=str, 59 | default='', 60 | help='checkpoint model folder') 61 | parser.add_argument('--num_thread_reader', type=int, default=1, 62 | help='') 63 | parser.add_argument('--embd_dim', type=int, default=2048, 64 | help='embedding dim') 65 | parser.add_argument('--lr', type=float, default=0.0001, 66 | help='initial learning rate') 67 | parser.add_argument('--epochs', type=int, default=20, 68 | help='upper epoch limit') 69 | parser.add_argument('--batch_size', type=int, default=256, 70 | help='batch size') 71 | parser.add_argument('--batch_size_val', type=int, default=3500, 72 | help='batch size eval') 73 | parser.add_argument('--lr_decay', type=float, default=0.9, 74 | help='Learning rate exp epoch decay') 75 | parser.add_argument('--n_display', type=int, default=10, 76 | help='Information display frequence') 77 | parser.add_argument('--feature_dim', type=int, default=4096, 78 | help='video feature dimension') 79 | parser.add_argument('--we_dim', type=int, default=300, 80 | help='word embedding dimension') 81 | parser.add_argument('--ocr_dim', type=int, default=2048, 82 | help='OCR text embedding dimension') 83 | parser.add_argument('--seed', type=int, default=1, 84 | help='random seed') 85 | parser.add_argument('--verbose', type=int, default=1, 86 | help='') 87 | parser.add_argument('--max_words', type=int, default=20, 88 | help='') 89 | parser.add_argument('--min_words', type=int, default=0, 90 | help='') 91 | parser.add_argument('--feature_framerate', type=int, default=1, 92 | help='') 93 | parser.add_argument('--min_time', type=float, default=5.0, 94 | help='Gather small clips') 95 | parser.add_argument('--margin', type=float, default=0.1, 96 | help='margin for loss') 97 | parser.add_argument('--hard_negative_rate', type=float, default=0.5, 98 | help='rate of intra negative sample') 99 | parser.add_argument('--negative_weighting', type=int, default=1, 100 | help='Weight the loss for intra negative') 101 | parser.add_argument('--n_pair', type=int, default=1, 102 | help='Num of 
pair to output from data loader') 103 | parser.add_argument('--avlectures', type=int, default=0, 104 | help='Train on AVLectures data') 105 | parser.add_argument('--eval_avlectures', type=int, default=0, 106 | help='Evaluate on AVLectures data') 107 | parser.add_argument('--sentence_dim', type=int, default=-1, 108 | help='sentence dimension') 109 | parser.add_argument('--save_every', type=int, default=1, 110 | help='intervals at which the checkpoints to be saved') 111 | parser.add_argument('--only_2d', type=int, default=0, 112 | help='1, if you want to use only 2D features for training and inference. 0 otherwise') 113 | parser.add_argument('--only_3d', type=int, default=0, 114 | help='1, if you want to use only 3D features for training and inference. 0 otherwise') 115 | parser.add_argument('--only_ocr', type=int, default=0, 116 | help='1, if you want to use only ocr features for training and inference. 0 otherwise') 117 | parser.add_argument('--ocr', type=int, default=0, 118 | help='1, if you want to use OCR features for training and inference. 0 otherwise') 119 | parser.add_argument( 120 | '--avlectures_train_path', 121 | type=str, 122 | default='data/avlectures_train.pkl', 123 | help='') 124 | parser.add_argument( 125 | '--avlectures_helper_path', 126 | type=str, 127 | default='data/avlectures_helper.pkl', 128 | help='') 129 | parser.add_argument( 130 | '--avlectures_val_path', 131 | type=str, 132 | default='data/avlectures_val.pkl', 133 | help='') 134 | args = parser.parse_args() 135 | return args 136 | -------------------------------------------------------------------------------- /code/lecture_aware_embds/eval.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import unicode_literals 4 | from __future__ import print_function 5 | 6 | import torch as th 7 | from torch.utils.data import DataLoader 8 | from args import get_args 9 | from model import Net 10 | from metrics import compute_metrics, print_computed_metrics 11 | from gensim.models.keyedvectors import KeyedVectors 12 | import pickle 13 | import glob 14 | from lsmdc_dataloader import LSMDC_DataLoader 15 | from msrvtt_dataloader import MSRVTT_DataLoader 16 | from youcook_dataloader import Youcook_DataLoader 17 | from avlectures_dataloader import AVLectures_DataLoader 18 | 19 | args = get_args() 20 | if args.verbose: 21 | print(args) 22 | 23 | assert args.pretrain_path != '', 'Need to specify pretrain_path argument' 24 | 25 | if args.word2vec: 26 | print('Loading word vectors: {}'.format(args.word2vec_path)) 27 | we = KeyedVectors.load_word2vec_format(args.word2vec_path, binary=True) 28 | print('done') 29 | else: 30 | we = None 31 | 32 | if args.eval_avlectures: 33 | dataset_avlectures = AVLectures_DataLoader( 34 | data=args.avlectures_val_path, 35 | helper_pkl = args.avlectures_helper_path, 36 | we=we, 37 | max_words=args.max_words, 38 | we_dim=args.we_dim, 39 | word2vec=args.word2vec, 40 | ocr=args.ocr, 41 | only_2d=args.only_2d, 42 | only_3d=args.only_3d 43 | ) 44 | dataloader_avlectures = DataLoader( 45 | dataset_avlectures, 46 | batch_size=args.batch_size_val, 47 | num_workers=args.num_thread_reader, 48 | shuffle=False, 49 | ) 50 | 51 | net = Net( 52 | video_dim=args.feature_dim, 53 | embd_dim=args.embd_dim, 54 | we_dim=args.we_dim, 55 | max_words=args.max_words, 56 | word2vec=args.word2vec, 57 | ocr=args.ocr, 58 | ocr_dim=args.ocr_dim, 59 | only_ocr=args.only_ocr 60 | ) 61 | net.eval() 62 | 
net.cuda() 63 | 64 | if args.verbose: 65 | print('Starting evaluation loop ...') 66 | 67 | def Eval_retrieval(model, eval_dataloader, dataset_name): 68 | model.eval() 69 | print('Evaluating Text-Video retrieval on {} data'.format(dataset_name)) 70 | with th.no_grad(): 71 | for i_batch, data in enumerate(eval_dataloader): 72 | text = data['text'].cuda() 73 | video = data['video'].cuda() 74 | vid = data['video_id'] 75 | ocr_embd = None 76 | if args.ocr: 77 | ocr_embd = data['ocr_embd'].cuda() 78 | m = model(video, text, ocr_embd) 79 | m = m.cpu().detach().numpy() 80 | metrics = compute_metrics(m) 81 | print_computed_metrics(metrics) 82 | 83 | all_checkpoints = glob.glob(args.pretrain_path) 84 | 85 | for c in all_checkpoints: 86 | print('Eval checkpoint: {}'.format(c)) 87 | print('Loading checkpoint: {}'.format(c)) 88 | net.load_checkpoint(c) 89 | if args.eval_avlectures: 90 | Eval_retrieval(net, dataloader_avlectures, 'AVLectures') 91 | -------------------------------------------------------------------------------- /code/lecture_aware_embds/extract_feats.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import unicode_literals 4 | from __future__ import print_function 5 | from operator import le 6 | 7 | import torch as th 8 | import numpy as np 9 | from torch.utils.data import DataLoader 10 | from args import get_args 11 | from model_ef import Net 12 | # from model import Net 13 | from metrics import compute_metrics, print_computed_metrics 14 | from gensim.models.keyedvectors import KeyedVectors 15 | import pickle 16 | import glob 17 | from lsmdc_dataloader import LSMDC_DataLoader 18 | from msrvtt_dataloader import MSRVTT_DataLoader 19 | from youcook_dataloader import Youcook_DataLoader 20 | from avlectures_dataloader import AVLectures_DataLoader 21 | 22 | 23 | args = get_args() 24 | if args.verbose: 25 | print(args) 26 | 27 | assert args.pretrain_path != '', 'Need to specify pretrain_path argument' 28 | 29 | if args.word2vec: 30 | print('Loading word vectors: {}'.format(args.word2vec_path)) 31 | we = KeyedVectors.load_word2vec_format(args.word2vec_path, binary=True) 32 | print('done') 33 | else: 34 | we = args.BERT_val_path 35 | 36 | 37 | if args.eval_avlectures: 38 | dataset_val = AVLectures_DataLoader( 39 | data=args.avlectures_val_path, 40 | helper_pkl = args.avlectures_helper_path, 41 | we=we, 42 | max_words=args.max_words, 43 | we_dim=args.we_dim, 44 | word2vec=args.word2vec, 45 | ocr=args.ocr, 46 | only_2d=args.only_2d, 47 | only_3d=args.only_3d 48 | ) 49 | dataloader_val = DataLoader( 50 | dataset_val, 51 | batch_size=args.batch_size_val, 52 | num_workers=args.num_thread_reader, 53 | shuffle=False, 54 | ) 55 | 56 | 57 | net = Net( 58 | video_dim=args.feature_dim, 59 | embd_dim=args.embd_dim, 60 | we_dim=args.we_dim, 61 | max_words=args.max_words, 62 | word2vec=args.word2vec, 63 | ocr=args.ocr, 64 | ocr_dim=args.ocr_dim, 65 | only_ocr=args.only_ocr 66 | ) 67 | net.eval() 68 | net.cuda() 69 | 70 | if args.verbose: 71 | print('Starting evaluation loop ...') 72 | 73 | pkl_data = {} 74 | 75 | def Eval_retrieval(model, eval_dataloader, dataset_name): 76 | model.eval() 77 | print('Evaluating Text-Video retrieval on {} data'.format(dataset_name)) 78 | with th.no_grad(): 79 | for i_batch, data in enumerate(eval_dataloader): 80 | text = data['text'].cuda() 81 | video = data['video'].cuda() 82 | ocr_embd = None 83 | if args.ocr: 84 | ocr_embd = 
data['ocr_embd'].cuda() 85 | vid = data['video_id'] 86 | course_name = data['course_name'][0] 87 | st = data['st'] 88 | et = data['et'] 89 | vid_duration = data['vid_duration'] 90 | st = st.item() 91 | et = et.item() 92 | vid_duration = vid_duration.item() 93 | m, vid_embd, text_embd = model(video, text, ocr_embd) 94 | 95 | # for trained embds 96 | text_embd = text_embd.cpu().detach().numpy() 97 | vid_embd = vid_embd.cpu().detach().numpy() 98 | m = m.cpu().detach().numpy() 99 | 100 | lecture_name = "-".join(vid[0].split('-')[:-1]) 101 | split_number = int(vid[0].split('-')[-1]) 102 | 103 | if course_name not in pkl_data: 104 | pkl_data[course_name] = {} 105 | 106 | if lecture_name not in pkl_data[course_name]: 107 | pkl_data[course_name][lecture_name] = {"vid_embd": [], "text_embd": [], "stet": [], "vid_duration": []} 108 | 109 | pkl_data[course_name][lecture_name]["vid_embd"].append((split_number, vid_embd)) 110 | pkl_data[course_name][lecture_name]["text_embd"].append((split_number, text_embd)) 111 | pkl_data[course_name][lecture_name]["stet"].append((split_number, st, et)) 112 | pkl_data[course_name][lecture_name]["vid_duration"].append((split_number, vid_duration)) 113 | 114 | all_checkpoints = glob.glob(args.pretrain_path) 115 | 116 | for c in all_checkpoints: 117 | print('Eval checkpoint: {}'.format(c)) 118 | print('Loading checkpoint: {}'.format(c)) 119 | net.load_checkpoint(c) 120 | if args.eval_avlectures: 121 | Eval_retrieval(net, dataloader_val, 'AVLectures') 122 | 123 | pkl_data_new = {} 124 | 125 | for c in pkl_data.keys(): 126 | 127 | if c not in pkl_data_new: 128 | pkl_data_new[c] = {} 129 | 130 | for k in pkl_data[c].keys(): 131 | 132 | vid_embd_sorted = pkl_data[c][k]['vid_embd'] 133 | vid_embd_sorted = sorted(vid_embd_sorted, key=lambda x: x[0]) 134 | 135 | text_embd_sorted = pkl_data[c][k]['text_embd'] 136 | text_embd_sorted = sorted(text_embd_sorted, key=lambda x: x[0]) 137 | 138 | vid_duration_sorted = pkl_data[c][k]['vid_duration'] 139 | vid_duration_sorted = sorted(vid_duration_sorted, key=lambda x: x[0]) 140 | 141 | stet_sorted = pkl_data[c][k]['stet'] 142 | stet_sorted = sorted(stet_sorted, key=lambda x: x[0]) 143 | 144 | vid_embd = vid_embd_sorted[0][1] 145 | text_embd = text_embd_sorted[0][1] 146 | vid_duration = [vid_duration_sorted[0][1]] 147 | stet = [(stet_sorted[0][0], stet_sorted[0][1], stet_sorted[0][2])] 148 | 149 | for i in range(1, len(vid_embd_sorted)): 150 | prev_vid_embd = vid_embd 151 | prev_text_embd = text_embd 152 | 153 | vid_embd = np.concatenate((prev_vid_embd, vid_embd_sorted[i][1])) 154 | text_embd = np.concatenate((prev_text_embd, text_embd_sorted[i][1])) 155 | 156 | vid_duration.append(vid_duration_sorted[i][1]) 157 | 158 | stet.append((stet_sorted[i][0], stet_sorted[i][1], stet_sorted[i][2])) 159 | 160 | pkl_data_new[c][k] = {"vid_embd": vid_embd, "text_embd": text_embd, "vid_duration": vid_duration, "stet": stet} 161 | 162 | # Path where you want to save the extracted features 163 | with open('/ssd_scratch/cvit/darshan_2/seg_embds/2d3dOCR_ss_test50ft50.pkl', 'wb') as handle: 164 | pickle.dump(pkl_data_new, handle, protocol=pickle.HIGHEST_PROTOCOL) 165 | 166 | print("DONE SUCCESSFULLY") -------------------------------------------------------------------------------- /code/lecture_aware_embds/loss.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import unicode_literals 4 | from __future__ import
print_function 5 | 6 | import torch.nn.functional as F 7 | import torch as th 8 | import numpy as np 9 | 10 | class MaxMarginRankingLoss(th.nn.Module): 11 | def __init__(self, 12 | margin=1.0, 13 | negative_weighting=False, 14 | batch_size=1, 15 | n_pair=1, 16 | hard_negative_rate=0.5, 17 | ): 18 | super(MaxMarginRankingLoss, self).__init__() 19 | self.margin = margin 20 | self.n_pair = n_pair 21 | self.batch_size = batch_size 22 | easy_negative_rate = 1 - hard_negative_rate 23 | self.easy_negative_rate = easy_negative_rate 24 | self.negative_weighting = negative_weighting 25 | if n_pair > 1: 26 | alpha = easy_negative_rate / ((batch_size - 1) * (1 - easy_negative_rate)) 27 | mm_mask = (1 - alpha) * np.eye(self.batch_size) + alpha 28 | mm_mask = np.kron(mm_mask, np.ones((n_pair, n_pair))) 29 | mm_mask = th.tensor(mm_mask) * (batch_size * (1 - easy_negative_rate)) 30 | self.mm_mask = mm_mask.float().cuda() 31 | 32 | 33 | def forward(self, x): 34 | d = th.diag(x) 35 | max_margin = F.relu(self.margin + x - d.view(-1, 1)) + \ 36 | F.relu(self.margin + x - d.view(1, -1)) 37 | if self.negative_weighting and self.n_pair > 1: 38 | max_margin = max_margin * self.mm_mask 39 | return max_margin.mean() 40 | -------------------------------------------------------------------------------- /code/lecture_aware_embds/loss_ce.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import unicode_literals 4 | from __future__ import print_function 5 | 6 | import torch.nn.functional as F 7 | import torch as th 8 | import numpy as np 9 | import torch.nn as nn 10 | 11 | class CE_loss(th.nn.Module): 12 | def __init__(self): 13 | super(CE_loss, self).__init__() 14 | 15 | def forward(self, S, margin=0.001): 16 | 17 | target = th.LongTensor(list(range(S.size(0)))).to(S.device) 18 | ce_loss = nn.CrossEntropyLoss() 19 | loss = ce_loss(S, target) 20 | return loss 21 | -------------------------------------------------------------------------------- /code/lecture_aware_embds/loss_milnce.py: -------------------------------------------------------------------------------- 1 | import torch as th 2 | 3 | 4 | class MILNCELoss(th.nn.Module): 5 | def __init__(self): 6 | super(MILNCELoss, self).__init__() 7 | 8 | def forward(self, video_embd, text_embd): 9 | x = th.matmul(video_embd, text_embd.t()) 10 | x = x.view(video_embd.shape[0], video_embd.shape[0], -1) 11 | nominator = x * th.eye(x.shape[0])[:,:,None].cuda() 12 | nominator = nominator.sum(dim=1) 13 | nominator = th.logsumexp(nominator, dim=1) 14 | denominator = th.cat((x, x.permute(1,0,2)), dim=1).view(x.shape[0], -1) 15 | denominator = th.logsumexp(denominator, dim=1) 16 | return th.mean(denominator - nominator) -------------------------------------------------------------------------------- /code/lecture_aware_embds/loss_mms.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import unicode_literals 4 | from __future__ import print_function 5 | 6 | import torch.nn.functional as F 7 | import torch as th 8 | import numpy as np 9 | 10 | class MMS_loss(th.nn.Module): 11 | def __init__(self): 12 | super(MMS_loss, self).__init__() 13 | 14 | def forward(self, S, margin=0.001): 15 | deltas = margin * th.eye(S.size(0)).to(S.device) 16 | S = S - deltas 17 | 18 | target = th.LongTensor(list(range(S.size(0)))).to(S.device) 19 
| I2C_loss = F.nll_loss(F.log_softmax(S, dim=1), target) 20 | C2I_loss = F.nll_loss(F.log_softmax(S.t(), dim=1), target) 21 | loss = I2C_loss + C2I_loss 22 | return loss -------------------------------------------------------------------------------- /code/lecture_aware_embds/metrics.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import unicode_literals 4 | from __future__ import print_function 5 | 6 | import numpy as np 7 | 8 | 9 | def compute_metrics(x): 10 | sx = np.sort(-x, axis=1) 11 | d = np.diag(-x) 12 | d = d[:, np.newaxis] 13 | ind = sx - d 14 | ind = np.where(ind == 0) 15 | ind = ind[1] 16 | metrics = {} 17 | metrics['R1'] = float(np.sum(ind == 0)) / len(ind) 18 | metrics['R5'] = float(np.sum(ind < 5)) / len(ind) 19 | metrics['R10'] = float(np.sum(ind < 10)) / len(ind) 20 | metrics['MR'] = np.median(ind) + 1 21 | return metrics 22 | 23 | 24 | def print_computed_metrics(metrics): 25 | r1 = metrics['R1'] 26 | r5 = metrics['R5'] 27 | r10 = metrics['R10'] 28 | mr = metrics['MR'] 29 | print('R@1: {:.4f} - R@5: {:.4f} - R@10: {:.4f} - Median R: {}'.format(r1, r5, r10, mr)) 30 | -------------------------------------------------------------------------------- /code/lecture_aware_embds/model.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import unicode_literals 4 | from __future__ import print_function 5 | 6 | import torch.nn as nn 7 | import torch as th 8 | import torch.nn.functional as F 9 | import re 10 | 11 | class Net(nn.Module): 12 | def __init__( 13 | self, 14 | embd_dim=1024, 15 | video_dim=2048, 16 | n_pair=1, 17 | we_dim=300, 18 | max_words=30, 19 | sentence_dim=-1, 20 | we=None, 21 | word2vec=True, 22 | ocr=0, 23 | ocr_dim=2048, 24 | only_ocr=0 25 | ): 26 | super(Net, self).__init__() 27 | self.ocr = ocr 28 | 29 | if sentence_dim <= 0: 30 | self.text_pooling = Sentence_Maxpool(we_dim, embd_dim, word2vec) 31 | if ocr: 32 | self.ocr_pooling = Sentence_Maxpool(we_dim, ocr_dim, word2vec) 33 | else: 34 | self.text_pooling = Sentence_Maxpool(we_dim, sentence_dim) 35 | if ocr: 36 | self.ocr_pooling = Sentence_Maxpool(we_dim, sentence_dim) 37 | 38 | self.GU_text = Gated_Embedding_Unit( 39 | self.text_pooling.out_dim, embd_dim, gating=True) 40 | 41 | if ocr: 42 | self.GU_ocr = Gated_Embedding_Unit( 43 | self.ocr_pooling.out_dim, ocr_dim, gating=True) 44 | 45 | self.GU_video = Gated_Embedding_Unit( 46 | video_dim, embd_dim, gating=True) 47 | 48 | self.n_pair = n_pair 49 | self.embd_dim = embd_dim 50 | self.we = we 51 | self.we_dim = we_dim 52 | self.word2vec = word2vec 53 | self.only_ocr = only_ocr 54 | 55 | 56 | def save_checkpoint(self, path): 57 | th.save(self.state_dict(), path) 58 | 59 | def load_checkpoint(self, path, cpu=False): 60 | if cpu: 61 | self.load_state_dict(th.load(path, 62 | map_location=lambda storage, loc: storage)) 63 | else: 64 | self.load_state_dict(th.load(path)) 65 | 66 | def forward(self, video, text, ocr): 67 | if ocr != None: 68 | if self.only_ocr: 69 | video = self.ocr_pooling(ocr) 70 | else: 71 | video = th.cat((video, self.ocr_pooling(ocr)), dim = 1) 72 | video = self.GU_video(video) 73 | text = self.GU_text(self.text_pooling(text)) 74 | return th.matmul(text, video.t()), video, text 75 | 76 | 77 | class Gated_Embedding_Unit(nn.Module): 78 | def __init__(self, input_dimension, 
output_dimension, gating=True): 79 | super(Gated_Embedding_Unit, self).__init__() 80 | self.fc = nn.Linear(input_dimension, output_dimension) 81 | self.cg = Context_Gating(output_dimension) 82 | self.gating = gating 83 | 84 | def forward(self, x): 85 | x = self.fc(x) 86 | if self.gating: 87 | x = self.cg(x) 88 | x = F.normalize(x) 89 | return x 90 | 91 | class Sentence_Maxpool(nn.Module): 92 | def __init__(self, word_dimension, output_dim, word2vec=True, relu=True): 93 | super(Sentence_Maxpool, self).__init__() 94 | self.fc = nn.Linear(word_dimension, output_dim) 95 | self.out_dim = output_dim 96 | self.word2vec = word2vec 97 | self.relu = relu 98 | 99 | def forward(self, x): 100 | x = self.fc(x) 101 | if self.relu: 102 | x = F.relu(x) 103 | 104 | if self.word2vec: 105 | return th.max(x, dim=1)[0] # if word2vec 106 | else: 107 | return x # if not word2vec 108 | 109 | class Context_Gating(nn.Module): 110 | def __init__(self, dimension, add_batch_norm=False): 111 | super(Context_Gating, self).__init__() 112 | self.fc = nn.Linear(dimension, dimension) 113 | self.add_batch_norm = add_batch_norm 114 | self.batch_norm = nn.BatchNorm1d(dimension) 115 | 116 | def forward(self, x): 117 | x1 = self.fc(x) 118 | if self.add_batch_norm: 119 | x1 = self.batch_norm(x1) 120 | x = th.cat((x, x1), 1) 121 | return F.glu(x, 1) 122 | -------------------------------------------------------------------------------- /code/lecture_aware_embds/model_ef.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import unicode_literals 4 | from __future__ import print_function 5 | 6 | import torch.nn as nn 7 | import torch as th 8 | import torch.nn.functional as F 9 | import re 10 | 11 | class Net(nn.Module): 12 | def __init__( 13 | self, 14 | embd_dim=1024, 15 | video_dim=2048, 16 | n_pair=1, 17 | we_dim=300, 18 | max_words=30, 19 | sentence_dim=-1, 20 | we=None, 21 | word2vec=True, 22 | ocr=0, 23 | ocr_dim=2048, 24 | only_ocr=0 25 | ): 26 | super(Net, self).__init__() 27 | self.ocr = ocr 28 | 29 | if sentence_dim <= 0: 30 | self.text_pooling = Sentence_Maxpool(we_dim, embd_dim, word2vec) 31 | if ocr: 32 | self.ocr_pooling = Sentence_Maxpool(we_dim, ocr_dim, word2vec) 33 | else: 34 | self.text_pooling = Sentence_Maxpool(we_dim, sentence_dim) 35 | if ocr: 36 | self.ocr_pooling = Sentence_Maxpool(we_dim, sentence_dim) 37 | 38 | self.GU_text = Gated_Embedding_Unit( 39 | self.text_pooling.out_dim, embd_dim, gating=True) 40 | 41 | if ocr: 42 | self.GU_ocr = Gated_Embedding_Unit( 43 | self.ocr_pooling.out_dim, ocr_dim, gating=True) 44 | 45 | self.GU_video = Gated_Embedding_Unit( 46 | video_dim, embd_dim, gating=True) 47 | self.n_pair = n_pair 48 | self.embd_dim = embd_dim 49 | self.we = we 50 | self.we_dim = we_dim 51 | self.word2vec = word2vec 52 | self.only_ocr = only_ocr 53 | 54 | 55 | def save_checkpoint(self, path): 56 | th.save(self.state_dict(), path) 57 | 58 | def load_checkpoint(self, path, cpu=False): 59 | if cpu: 60 | self.load_state_dict(th.load(path, 61 | map_location=lambda storage, loc: storage)) 62 | else: 63 | self.load_state_dict(th.load(path)) 64 | 65 | def forward(self, video, text, ocr): 66 | if ocr != None: 67 | if self.only_ocr: 68 | video = self.ocr_pooling(ocr) 69 | else: 70 | video = th.cat((video, self.ocr_pooling(ocr)), dim = 1) 71 | video = self.GU_video(video) 72 | text = self.GU_text(self.text_pooling(text)) 73 | return (th.matmul(text, video.t()), video, text) 74 | 75 
| 76 | 77 | class Gated_Embedding_Unit(nn.Module): 78 | def __init__(self, input_dimension, output_dimension, gating=True): 79 | super(Gated_Embedding_Unit, self).__init__() 80 | self.fc = nn.Linear(input_dimension, output_dimension) 81 | self.cg = Context_Gating(output_dimension) 82 | self.gating = gating 83 | 84 | def forward(self, x): 85 | x = self.fc(x) 86 | if self.gating: 87 | x = self.cg(x) 88 | x = F.normalize(x) 89 | return x 90 | 91 | class Sentence_Maxpool(nn.Module): 92 | def __init__(self, word_dimension, output_dim, word2vec=True, relu=True): 93 | super(Sentence_Maxpool, self).__init__() 94 | self.fc = nn.Linear(word_dimension, output_dim) 95 | self.out_dim = output_dim 96 | self.word2vec = word2vec 97 | self.relu = relu 98 | 99 | def forward(self, x): 100 | x = self.fc(x) 101 | if self.relu: 102 | x = F.relu(x) 103 | 104 | if self.word2vec: 105 | return th.max(x, dim=1)[0] # if word2vec 106 | else: 107 | return x # if not word2vec 108 | 109 | class Context_Gating(nn.Module): 110 | def __init__(self, dimension, add_batch_norm=False): 111 | super(Context_Gating, self).__init__() 112 | self.fc = nn.Linear(dimension, dimension) 113 | self.add_batch_norm = add_batch_norm 114 | self.batch_norm = nn.BatchNorm1d(dimension) 115 | 116 | def forward(self, x): 117 | x1 = self.fc(x) 118 | if self.add_batch_norm: 119 | x1 = self.batch_norm(x1) 120 | x = th.cat((x, x1), 1) 121 | return F.glu(x, 1) 122 | -------------------------------------------------------------------------------- /code/lecture_aware_embds/stop_words.py: -------------------------------------------------------------------------------- 1 | # This list of English stop words is taken from the "Glasgow Information 2 | # Retrieval Group". The original list can be found at 3 | # http://ir.dcs.gla.ac.uk/resources/linguistic_utils/stop_words 4 | ENGLISH_STOP_WORDS = frozenset([ 5 | "a", "about", "above", "across", "actually", "after", "afterwards", "again", "against", 6 | "all", "almost", "alone", "along", "already", "also", "although", "always", 7 | "am", "among", "amongst", "amoungst", "amount", "an", "and", "another", 8 | "any", "anyhow", "anyone", "anything", "anyway", "anywhere", "are", 9 | "around", "as", "at", "back", "be", "became", "because", "become", 10 | "becomes", "becoming", "been", "before", "beforehand", "behind", "being", 11 | "below", "beside", "besides", "between", "beyond", "bill", "both", 12 | "bottom", "but", "by", "call", "can", "cannot", "cant", "can't", "co", "con", 13 | "could", "couldnt", "cry", "de", "describe", "detail", "do", "done", "don't", 14 | "down", "due", "during", "each", "easy", "eg", "eight", "either", "eleven", "else", 15 | "elsewhere", "empty", "enough", "etc", "even", "ever", "every", "everyone", 16 | "everything", "everywhere", "except", "few", "fifteen", "fifty", 17 | "find", "fire", "first", "five", "for", "former", "formerly", "forty", 18 | "found", "four", "from", "further", "give", 19 | "had", "has", "hasnt", "have", "he", "hence", "her", "here", "hereafter", 20 | "hereby", "herein", "hereupon", "hers", "herself", "him", "himself", "his", 21 | "how", "however", "hundred", "i", "ie", "if", "i'm", "i'll", "i've", "in", "inc", "indeed", 22 | "interest", "is", "it", "it'll", "its", "it's", "itself", "just", "keep", "last", "latter", 23 | "latterly", "least", "less", "like", "ltd", "made", "many", "may", "me", 24 | "meanwhile", "might", "mill", "mine", "more", "moreover", "most", "mostly", 25 | "much", "must", "my", "myself", "name", "namely", "neither", 26 | "never", "nevertheless", 
"next", "nine", "no", "nobody", "none", "noone", 27 | "nor", "not", "nothing", "now", "nowhere", "of", "off", "often", "ok", "okay", "on", 28 | "once", "one", "only", "onto", "or", "other", "others", "otherwise", "our", 29 | "ours", "ourselves", "out", "over", "own", "part", "per", "perhaps", 30 | "please", "put", "rather", "re", "really", "same", "see", "seem", "seemed", 31 | "seeming", "seems", "serious", "several", "she", "should", "show", "side", 32 | "since", "sincere", "six", "sixty", "so", "some", "somehow", "someone", 33 | "something", "sometime", "sometimes", "somewhere", "still", "such", 34 | "take", "ten", "than", "thank", "thanks", "that", "that's", "the", "their", "them", 35 | "themselves", "then", "thence", "there", "thereafter", "thereby", 36 | "therefore", "therein", "thereupon", "these", "they", 37 | "third", "this", "those", "though", "three", "through", "throughout", 38 | "thru", "thus", "to", "together", "too", "top", "toward", "towards", 39 | "twelve", "twenty", "two", "un", "until", "up", "upon", "us", 40 | "very", "via", "view", "viewing", "viewer", "was", "we", "we'll", "well", "welcome", 41 | "were", "what", "whatever", "when", "whence", "whenever", "where", "whereafter", 42 | "whereas", "whereby", "wherein", "whereupon", "wherever", "whether", "which", "while", "whither", 43 | "who", "whoever", "whole", "whom", "whose", "why", "will", "with", 44 | "within", "without", "would", "wont", "won't", "yet", "you", "your", "yours", "you've", "you'll", "yourself", 45 | "yourselves", "youtube", "going", "want", "right", "you're", "we're", "know", "gonna", "need", "bit", 46 | "look", "yeah", "guys", "sure", "let's", "video", "oh", "let", "today","they're", "did", "looks", 47 | "different", "great" , "different", "say", "um", "probably", "kind", "doesn't", "does", "maybe", "hey", 48 | "we've", "better", "hope", "there's", "try"]) 49 | -------------------------------------------------------------------------------- /code/lecture_aware_embds/train.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import unicode_literals 4 | from __future__ import print_function 5 | 6 | import torch as th 7 | from torch.utils.data import DataLoader 8 | import numpy as np 9 | import torch.optim as optim 10 | from args import get_args 11 | import random 12 | import os 13 | from avlectures_dataloader import AVLectures_DataLoader 14 | from model import Net 15 | from metrics import compute_metrics, print_computed_metrics 16 | from loss import MaxMarginRankingLoss 17 | from loss_mms import MMS_loss 18 | from loss_ce import CE_loss 19 | from loss_milnce import MILNCELoss 20 | from gensim.models.keyedvectors import KeyedVectors 21 | import pickle 22 | 23 | 24 | args = get_args() 25 | if args.verbose: 26 | print(args) 27 | 28 | # predefining random initial seeds 29 | th.manual_seed(args.seed) 30 | np.random.seed(args.seed) 31 | random.seed(args.seed) 32 | 33 | if args.checkpoint_dir != '' and not(os.path.isdir(args.checkpoint_dir)): 34 | os.mkdir(args.checkpoint_dir) 35 | 36 | if not(args.avlectures): 37 | print('Loading captions: {}'.format(args.caption_path)) 38 | caption = pickle.load(open(args.caption_path, 'rb')) 39 | print('done') 40 | 41 | if args.word2vec: 42 | 43 | print('Loading word vectors: {}'.format(args.word2vec_path)) 44 | we = KeyedVectors.load_word2vec_format(args.word2vec_path, binary=True) 45 | 46 | if args.avlectures: 47 | we_train = we 48 | we_val = 
we 49 | 50 | print('done') 51 | 52 | else: 53 | we_train = None 54 | we_val = None 55 | 56 | if args.avlectures: 57 | dataset = AVLectures_DataLoader( 58 | data=args.avlectures_train_path, 59 | helper_pkl = args.avlectures_helper_path, 60 | we=we_train, 61 | max_words=args.max_words, 62 | we_dim=args.we_dim, 63 | word2vec=args.word2vec, 64 | ocr=args.ocr, 65 | n_pair=args.n_pair, 66 | only_2d=args.only_2d, 67 | only_3d=args.only_3d 68 | ) 69 | dataset_size = len(dataset) 70 | dataloader = DataLoader( 71 | dataset, 72 | batch_size=args.batch_size, 73 | num_workers=args.num_thread_reader, 74 | shuffle=True, 75 | batch_sampler=None, 76 | drop_last=True, 77 | ) 78 | if args.eval_avlectures: 79 | dataset_val = AVLectures_DataLoader( 80 | data=args.avlectures_val_path, 81 | helper_pkl = args.avlectures_helper_path, 82 | we=we_val, 83 | max_words=args.max_words, 84 | we_dim=args.we_dim, 85 | word2vec=args.word2vec, 86 | ocr=args.ocr, 87 | only_2d=args.only_2d, 88 | only_3d=args.only_3d 89 | ) 90 | dataloader_val = DataLoader( 91 | dataset_val, 92 | batch_size=args.batch_size_val, 93 | num_workers=args.num_thread_reader, 94 | shuffle=False, 95 | ) 96 | 97 | net = Net( 98 | video_dim=args.feature_dim, 99 | embd_dim=args.embd_dim, 100 | we_dim=args.we_dim, 101 | n_pair=args.n_pair, 102 | max_words=args.max_words, 103 | sentence_dim=args.sentence_dim, 104 | word2vec=args.word2vec, 105 | ocr=args.ocr, 106 | ocr_dim=args.ocr_dim, 107 | only_ocr=args.only_ocr 108 | ) 109 | net.train() 110 | # Optimizers + Loss 111 | 112 | loss_op = MaxMarginRankingLoss( 113 | margin=args.margin, 114 | negative_weighting=args.negative_weighting, 115 | batch_size=args.batch_size, 116 | n_pair=args.n_pair, 117 | hard_negative_rate=args.hard_negative_rate, 118 | ) 119 | 120 | # loss_op = MMS_loss() 121 | # loss_op = CE_loss() 122 | # loss_op = MILNCELoss() 123 | 124 | net.cuda() 125 | loss_op.cuda() 126 | 127 | 128 | if args.pretrain_path != '': 129 | net.load_checkpoint(args.pretrain_path) 130 | 131 | optimizer = optim.Adam(net.parameters(), lr=args.lr) 132 | 133 | if args.verbose: 134 | print('Starting training loop ...') 135 | 136 | def TrainOneBatch(model, opt, data, loss_fun): 137 | text = data['text'].cuda() 138 | video = data['video'].cuda() 139 | ocr_embd = None 140 | if args.ocr: 141 | ocr_embd = data['ocr_embd'].cuda() 142 | video = video.view(-1, video.shape[-1]) 143 | if args.word2vec: 144 | text = text.view(-1, text.shape[-2], text.shape[-1]) # original 145 | if args.ocr: 146 | ocr_embd = ocr_embd.view(-1, ocr_embd.shape[-2], ocr_embd.shape[-1]) 147 | else: 148 | if args.n_pair > 1: 149 | text = text.view(-1, text.shape[-2], text.shape[-1]) # original 150 | text = text.squeeze() 151 | if args.ocr: 152 | ocr_embd = ocr_embd.view(-1, ocr_embd.shape[-2], ocr_embd.shape[-1]) 153 | ocr_embd = ocr_embd.squeeze() 154 | opt.zero_grad() 155 | with th.set_grad_enabled(True): 156 | sim_matrix, v, t = model(video, text, ocr_embd) 157 | loss = loss_fun(sim_matrix) 158 | loss.backward() 159 | opt.step() 160 | return loss.item() 161 | 162 | def Eval_retrieval(model, eval_dataloader, dataset_name): 163 | model.eval() 164 | print('Evaluating Text-Video retrieval on {} data'.format(dataset_name)) 165 | with th.no_grad(): 166 | for i_batch, data in enumerate(eval_dataloader): 167 | text = data['text'].cuda() 168 | video = data['video'].cuda() 169 | ocr_embd = None 170 | if args.ocr: 171 | ocr_embd = data['ocr_embd'].cuda() 172 | m = model(video, text, ocr_embd) 173 | m = m.cpu().detach().numpy() 174 | metrics = compute_metrics(m) 
175 | print_computed_metrics(metrics) 176 | 177 | for epoch in range(args.epochs): 178 | running_loss = 0.0 179 | if args.eval_avlectures: 180 | Eval_retrieval(net, dataloader_val, 'AVLectures') 181 | if args.verbose: 182 | print('Epoch: %d' % epoch) 183 | for i_batch, sample_batch in enumerate(dataloader): 184 | batch_loss = TrainOneBatch(net, optimizer, sample_batch, loss_op) # orig 185 | running_loss += batch_loss 186 | if (i_batch + 1) % args.n_display == 0 and args.verbose: 187 | print('Epoch %d, Epoch status: %.4f, Training loss: %.4f' % 188 | (epoch + 1, args.batch_size * float(i_batch) / dataset_size, 189 | running_loss / args.n_display)) 190 | running_loss = 0.0 191 | for param_group in optimizer.param_groups: 192 | param_group['lr'] *= args.lr_decay 193 | if args.checkpoint_dir != '': 194 | if epoch + 1 == args.epochs or (epoch + 1) % args.save_every == 0: 195 | path = os.path.join(args.checkpoint_dir, 'e{}.pth'.format(epoch + 1)) 196 | net.save_checkpoint(path) 197 | 198 | if args.eval_avlectures: 199 | Eval_retrieval(net, dataloader_val, 'AVLectures') 200 | -------------------------------------------------------------------------------- /code/lecture_aware_embds/video_feature_extractor/README.md: -------------------------------------------------------------------------------- 1 | # Steps to extract features 2 | 3 | **Step 1:** 4 | 5 | Execute the `create_feature_csv.py` program. To specify the path of the Datasubset use the `--base_dir` optional argument. The default path of the DataSubset is `/ssd_scratch/cvit/AVLectures/DataSubset`. After executing this program the following will be created inside `base_dir`. 6 | 7 | a. *input_2d.csv* 8 | 9 | b. *input_3d.csv* 10 | 11 | c. Also empty directories called *features, features/2d/, features/3d/* will be created. 12 | 13 | **Step 2:** 14 | 15 | Once we have the 2d, 3d CSV files and empty directories to store 2d & 3d features, our next task is to extract the 2d and 3d features from the videos using the `extract.py` program. 16 | First extract the 2d features using the following command: 17 | ``` 18 | python extract.py --csv=input_2d.csv --type=2d --batch_size=64 --num_decoding_thread=4 19 | ``` 20 | Then download the 3D ResNext-101 model as follows (for 3d feature extraction): 21 | 22 | ``` 23 | mkdir model 24 | $ cd model 25 | $ wget https://www.rocq.inria.fr/cluster-willow/amiech/howto100m/models/resnext101.pth 26 | ``` 27 | Now extract the 3d features using the following command: 28 | ``` 29 | $ python extract.py --csv=input_3d.csv --type=3d --batch_size=64 --num_decoding_thread=4 30 | ``` 31 | 32 | **Step 3:** 33 | 34 | Now it is time to create the pickle file of our data. To do this execute the `create_pickle.py` program. To specify the path of the Datasubset use the `--base_dir` optional argument. The default path of the DataSubset is `/ssd_scratch/cvit/AVLectures/DataSubset`. After executing this program a pickle file called `avl.pkl` will be created inside the `base_dir`. 35 | 36 | # Fast and Easy to use video feature extractor 37 | 38 | This repo aims at providing an easy to use and efficient code for extracting 39 | video features using deep CNN (2D or 3D). 40 | 41 | It has been originally designed to extract video features for the large scale video dataset HowTo100M (https://www.di.ens.fr/willow/research/howto100m/) in an efficient manner. 42 | 43 | 44 | Most of the time, extracting CNN features from video is cumbersome. 
45 | In fact, this usually requires dumping video frames to disk, loading the dumped frames one 46 | by one, pre-processing them and using a CNN to extract features on chunks of video. 47 | This process is inefficient because dumping frames to disk is 48 | slow and can use a lot of inodes when working with a large dataset of videos. 49 | 50 | To avoid having to do that, this repo provides a simple Python script for the task: just provide a list of raw videos and the script will take care of on-the-fly video decoding (with ffmpeg) and feature extraction using state-of-the-art models. While being fast, it also happens to be very convenient. 51 | 52 | This script is also optimized for multi-process GPU feature extraction. 53 | 54 | 55 | # Requirements 56 | - Python 3 57 | - PyTorch (>= 1.0) 58 | - ffmpeg-python (https://github.com/kkroening/ffmpeg-python) 59 | 60 | # How To Use? 61 | 62 | First of all, you need to generate a csv containing the list of videos you 63 | want to process. For instance, if you have video1.mp4 and video2.webm to process, 64 | you will need to generate a csv of this form: 65 | 66 | ``` 67 | video_path,feature_path 68 | absolute_path_video1.mp4,absolute_path_of_video1_features.npy 69 | absolute_path_video2.webm,absolute_path_of_video2_features.npy 70 | ``` 71 | 72 | And then simply run: 73 | 74 | ```sh 75 | python extract.py --csv=input.csv --type=2d --batch_size=64 --num_decoding_thread=4 76 | ``` 77 | This command will extract 2D video features for video1.mp4 (resp. video2.webm) to path_of_video1_features.npy (resp. path_of_video2_features.npy) in 78 | the form of a numpy array. 79 | To get features from the 3D model instead, just change the --type argument from 2d to 3d. 80 | The parameter --num_decoding_thread sets how many parallel CPU threads are used for decoding the videos. 81 | 82 | Please note that the script is intended to be run on ONE single GPU only. 83 | If multiple GPUs are available, please make sure that only one free GPU is made visible 84 | to the script, for example via the CUDA_VISIBLE_DEVICES environment variable (see the example at the end of this README). 85 | 86 | # Can I use multiple GPUs to speed up feature extraction? 87 | 88 | Yes! Just run the same script with the same input csv on another GPU (which can be on a different machine, provided that the disk the features are written to is shared between the machines). The script will start a new feature extraction process that only processes the videos that have not been processed yet, without overlapping with the extraction process already running. 89 | 90 | # What models are implemented? 91 | So far, only one 2D and one 3D model can be used. 92 | 93 | - The 2D model is ResNet-152 from the PyTorch model zoo, pretrained on ImageNet. The 2D features are extracted at 1 feature per second at a resolution of 224. 94 | - The 3D model is a ResNexT-101 16-frame model (https://github.com/kenshohara/3D-ResNets-PyTorch) pretrained on Kinetics. The 3D features are extracted at 1.5 features per second at a resolution of 112. 95 | 96 | # Downloading pretrained models 97 | This will download the pretrained 3D ResNext-101 model we used from https://github.com/kenshohara/3D-ResNets-PyTorch: 98 | 99 | ```sh 100 | mkdir model 101 | cd model 102 | wget https://www.rocq.inria.fr/cluster-willow/amiech/howto100m/models/resnext101.pth 103 | ``` 104 | 105 | 106 | 107 | # Acknowledgements 108 | This code re-uses code from https://github.com/kenshohara/3D-ResNets-PyTorch 109 | for the 3D CNN.
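# Example: pinning the extraction to one GPU

A minimal sketch of the single-GPU / multi-GPU usage described above; the csv name and GPU indices are placeholders for whatever csv you generated and whichever GPUs are free.

```sh
# Terminal 1: make only GPU 0 visible to the script
CUDA_VISIBLE_DEVICES=0 python extract.py --csv=input_2d.csv --type=2d --batch_size=64 --num_decoding_thread=4

# Terminal 2 (optional): a second process on GPU 1 with the same csv;
# it only picks up videos the first process has not written yet.
CUDA_VISIBLE_DEVICES=1 python extract.py --csv=input_2d.csv --type=2d --batch_size=64 --num_decoding_thread=4
```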
110 | -------------------------------------------------------------------------------- /code/lecture_aware_embds/video_feature_extractor/create_feature_csv.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import os 3 | import argparse 4 | 5 | from pathlib import Path 6 | from os.path import join 7 | from glob import glob 8 | 9 | import cv2 10 | 11 | # The default path of DataSubset is '/ssd_scratch/cvit/AVLectures/DataSubset' (also referred to as base_dir) 12 | # If you want to change the default path, then you can do it using the optional '--base_dir' argument 13 | # After executing this code, two new CSV files would be created inside the base_dir 14 | # 1. input_2d.csv 2. input_3d.csv 15 | # Also empty directories called "features", "features/2d/", "features/3d/" will be created inside base_dir. 16 | 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument("-b", "--base_dir", type=str, required=False, help="Path of DataSubset", default = '/ssd_scratch/cvit/AVLectures/DataSubset') 19 | args = parser.parse_args() 20 | 21 | base_dir = args.base_dir 22 | print("Base Directory:") 23 | print(base_dir) 24 | 25 | delimiter = "@#@" 26 | 27 | # Create empty directories called "features/2d/" and "features/3d/" 28 | Path(join(base_dir, "features", "2d")).mkdir(parents=True, exist_ok=True) 29 | Path(join(base_dir, "features", "3d")).mkdir(parents=True, exist_ok=True) 30 | 31 | fields = ['video_path', 'feature_path'] 32 | 33 | filename_2d = join(base_dir, 'input_2d.csv') # for extracting 2d features 34 | filename_3d = join(base_dir, 'input_3d.csv') # for extracting 3d features 35 | 36 | rows = [] 37 | 38 | folder_list = [] 39 | 40 | for fl in glob(join(base_dir, '*')): 41 | if 'mit' in fl: 42 | folder_list.append(fl) 43 | 44 | folder_list.sort() 45 | print(folder_list) 46 | 47 | def check_vid(vid_path): 48 | try: 49 | cap = cv2.VideoCapture(vid_path) 50 | return cap.isOpened() 51 | except: 52 | return False 53 | 54 | with open(filename_2d, 'w') as csvfile_2d, open(filename_3d, 'w') as csvfile_3d: 55 | csvwriter_2d = csv.writer(csvfile_2d) 56 | csvwriter_2d.writerow(fields) 57 | 58 | csvwriter_3d = csv.writer(csvfile_3d) 59 | csvwriter_3d.writerow(fields) 60 | 61 | for folder in folder_list: 62 | rows_2d = [] 63 | rows_3d = [] 64 | count = 0 65 | print("Inside - ", folder) 66 | with open(join(folder, 'combined.txt'), 'r') as text_file: 67 | lines = text_file.readlines() 68 | 69 | for line in lines: 70 | vid_name = line.split(delimiter)[0] 71 | vid_path = join(folder, 'splits_vid', vid_name) 72 | # if not check_vid(vid_path): 73 | # print("This video {0} is not split properly".format(vid_name)) 74 | # continue 75 | feature_name = vid_name.replace('.mp4', '.npy') 76 | feature_path_2d = join(base_dir, 'features', '2d', feature_name) 77 | feature_path_3d = join(base_dir, 'features', '3d', feature_name) 78 | rows_2d.append([vid_path, feature_path_2d]) 79 | rows_3d.append([vid_path, feature_path_3d]) 80 | count += 1 81 | 82 | print("Count = ", count) 83 | csvwriter_2d.writerows(rows_2d) 84 | csvwriter_3d.writerows(rows_3d) -------------------------------------------------------------------------------- /code/lecture_aware_embds/video_feature_extractor/create_feature_csv_indi.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import os 3 | import argparse 4 | 5 | from pathlib import Path 6 | from os.path import join 7 | from glob import glob 8 | 9 | # The default path of DataSubset is 
'/ssd_scratch/cvit/AVLectures/DataSubset' (also referred to as base_dir) 10 | # If you want to change the default path, then you can do it using the optional '--base_dir' argument 11 | # After executing this code, two new CSV files would be created inside the base_dir 12 | # 1. input_2d.csv 2. input_3d.csv 13 | # Also empty directories called "features", "features/2d/", "features/3d/" will be created inside base_dir. 14 | 15 | parser = argparse.ArgumentParser() 16 | parser.add_argument("-b", "--base_dir", type=str, required=False, help="Path of DataSubset", default = '/ssd_scratch/cvit/AVLectures/DataSubset') 17 | args = parser.parse_args() 18 | 19 | base_dir = args.base_dir 20 | print("Base Directory:") 21 | print(base_dir) 22 | 23 | # Create empty directories called "features/2d/" and "features/3d/" 24 | Path(join(base_dir, "features_m011", "2d")).mkdir(parents=True, exist_ok=True) 25 | Path(join(base_dir, "features_m011", "3d")).mkdir(parents=True, exist_ok=True) 26 | 27 | fields = ['video_path', 'feature_path'] 28 | 29 | filename_2d = join(base_dir, 'input_2d_m011.csv') # for extracting 2d features 30 | filename_3d = join(base_dir, 'input_3d_m011.csv') # for extracting 3d features 31 | 32 | rows = [] 33 | 34 | folder_list = [] 35 | 36 | for fl in glob(join(base_dir, '*')): 37 | if ('mit011'in fl): 38 | folder_list.append(fl) 39 | 40 | folder_list.sort() 41 | print(folder_list) 42 | 43 | with open(filename_2d, 'w') as csvfile_2d, open(filename_3d, 'w') as csvfile_3d: 44 | csvwriter_2d = csv.writer(csvfile_2d) 45 | csvwriter_2d.writerow(fields) 46 | 47 | csvwriter_3d = csv.writer(csvfile_3d) 48 | csvwriter_3d.writerow(fields) 49 | 50 | for folder in folder_list: 51 | rows_2d = [] 52 | rows_3d = [] 53 | count = 0 54 | print("Inside - ", folder) 55 | with open(join(folder, 'combined.txt'), 'r') as text_file: 56 | lines = text_file.readlines() 57 | 58 | for line in lines: 59 | count += 1 60 | vid_name = line.split('|')[0] 61 | vid_path = join(folder, 'splits_vid', vid_name) 62 | feature_name = vid_name.replace('.mp4', '.npy') 63 | feature_path_2d = join(base_dir, 'features_m011', '2d', feature_name) 64 | feature_path_3d = join(base_dir, 'features_m011', '3d', feature_name) 65 | rows_2d.append([vid_path, feature_path_2d]) 66 | rows_3d.append([vid_path, feature_path_3d]) 67 | 68 | print("Count = ", count) 69 | csvwriter_2d.writerows(rows_2d) 70 | csvwriter_3d.writerows(rows_3d) 71 | -------------------------------------------------------------------------------- /code/lecture_aware_embds/video_feature_extractor/create_feature_csv_seg.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import os 3 | import argparse 4 | 5 | from pathlib import Path 6 | from os.path import join 7 | from glob import glob 8 | 9 | # The default path of DataSubset is '/ssd_scratch/cvit/AVLectures/DataSubset' (also referred to as base_dir) 10 | # If you want to change the default path, then you can do it using the optional '--base_dir' argument 11 | # After executing this code, two new CSV files would be created inside the base_dir 12 | # 1. input_2d.csv 2. input_3d.csv 13 | # Also empty directories called "features", "features/2d/", "features/3d/" will be created inside base_dir. 
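# Expected combined.txt line format (assumed from the parsing below; only the first
# '|'-separated field is used by this script, any remaining fields are ignored here):
#   <segment_video_name>.mp4|<subtitle text>|...
# e.g. (hypothetical): MIT6_042JF10_lec17_300k-00012.mp4|So an induction proof works like this...|...
# The .mp4 name is mapped to matching .npy feature paths under features/2d and features/3d.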
14 | 15 | parser = argparse.ArgumentParser() 16 | parser.add_argument("-b", "--base_dir", type=str, required=False, help="Path of DataSubset", default = '/ssd_scratch/cvit/AVLectures/DataSubset') 17 | args = parser.parse_args() 18 | 19 | base_dir = args.base_dir 20 | print("Base Directory:") 21 | print(base_dir) 22 | 23 | # Create empty directories called "features/2d/" and "features/3d/" 24 | Path(join(base_dir, "features", "2d")).mkdir(parents=True, exist_ok=True) 25 | Path(join(base_dir, "features", "3d")).mkdir(parents=True, exist_ok=True) 26 | 27 | fields = ['video_path', 'feature_path'] 28 | 29 | filename_2d = join(base_dir, 'input_2d.csv') # for extracting 2d features 30 | filename_3d = join(base_dir, 'input_3d.csv') # for extracting 3d features 31 | 32 | rows = [] 33 | 34 | folder_list = [] 35 | 36 | for fl in glob(join(base_dir, '*')): 37 | if 'mit' in fl: 38 | folder_list.append(fl) 39 | 40 | folder_list.sort() 41 | print(folder_list) 42 | 43 | with open(filename_2d, 'w') as csvfile_2d, open(filename_3d, 'w') as csvfile_3d: 44 | csvwriter_2d = csv.writer(csvfile_2d) 45 | csvwriter_2d.writerow(fields) 46 | 47 | csvwriter_3d = csv.writer(csvfile_3d) 48 | csvwriter_3d.writerow(fields) 49 | 50 | for folder in folder_list: 51 | rows_2d = [] 52 | rows_3d = [] 53 | count = 0 54 | print("Inside - ", folder) 55 | with open(join(folder, 'combined.txt'), 'r') as text_file: 56 | lines = text_file.readlines() 57 | 58 | for line in lines: 59 | count += 1 60 | vid_name = line.split('|')[0] 61 | vid_path = join(folder, vid_name) 62 | feature_name = vid_name.replace('.mp4', '.npy') 63 | feature_path_2d = join(base_dir, 'features', '2d', feature_name) 64 | feature_path_3d = join(base_dir, 'features', '3d', feature_name) 65 | rows_2d.append([vid_path, feature_path_2d]) 66 | rows_3d.append([vid_path, feature_path_3d]) 67 | 68 | print("Count = ", count) 69 | csvwriter_2d.writerows(rows_2d) 70 | csvwriter_3d.writerows(rows_3d) -------------------------------------------------------------------------------- /code/lecture_aware_embds/video_feature_extractor/create_pickle.py: -------------------------------------------------------------------------------- 1 | import pickle as pkl 2 | import os.path 3 | import os 4 | import argparse 5 | 6 | from os.path import join, isfile 7 | from glob import glob 8 | 9 | import numpy as np 10 | 11 | # The default path of DataSubset is '/ssd_scratch/cvit/AVLectures/DataSubset' (also referred to as base_dir) 12 | # base_dir should also contain a directory inside it with the name "features" which inturn contains two directories called "2d" and "3d" which contains 2d and 3d video features respectively. 13 | # If you want to change the default path, then you can do it using the optional '--base_dir' argument 14 | # After executing this code, a pickle file called 'avl.pkl' would be created inside the base_dir. 
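# Example invocation (the path is illustrative; any base_dir with the layout described above works):
#   python create_pickle.py --base_dir /ssd_scratch/cvit/AVLectures/DataSubset
# Note that, as written below, the output pickle is saved as 'm1_m2_10s15s_2d3d.pkl'.
# A quick way to sanity-check the result, which is a list of per-clip dicts with keys
# '2d', '3d', 'caption' and 'id':
#   import pickle
#   data = pickle.load(open('/ssd_scratch/cvit/AVLectures/DataSubset/m1_m2_10s15s_2d3d.pkl', 'rb'))
#   print(len(data), data[0]['id'], data[0]['2d'].shape)  # '2d'/'3d' are max-pooled 2048-d vectors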
15 | 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument("-b", "--base_dir", type=str, required=False, help="Path of DataSubset", default = '/ssd_scratch/cvit/AVLectures/DataSubset') 18 | args = parser.parse_args() 19 | 20 | base_dir = args.base_dir 21 | print("Base Directory:") 22 | print(base_dir) 23 | 24 | data = [] 25 | 26 | folder_list = [] 27 | 28 | for fl in glob(join(base_dir, '*')): 29 | if 'mit' in fl: 30 | folder_list.append(fl) 31 | 32 | folder_list.sort() 33 | print(folder_list) 34 | 35 | count_match = 0 36 | total_count = 0 37 | 38 | for folder in folder_list: 39 | count = 0 40 | 41 | print("Inside - ", folder) 42 | 43 | with open(join(folder, 'combined.txt'), 'r') as text_file: 44 | 45 | lines = text_file.readlines() 46 | 47 | for line in lines: 48 | count += 1 49 | features = {} 50 | vid_name = line.split('|')[0] 51 | subtitle = line.split('|')[1] 52 | subtitle = " ".join(subtitle.split()) 53 | features_name = vid_name.replace('.mp4', '.npy') 54 | 55 | if isfile(join(base_dir, 'features', '2d', features_name)) and isfile(join(base_dir, 'features', '3d', features_name)): 56 | 57 | two_d = np.load(join(base_dir, 'features', '2d', features_name)) 58 | three_d = np.load(join(base_dir, 'features', '3d', features_name)) 59 | 60 | # if two_d.shape == (0, 2048) or three_d.shape == (0, 2048) or len(subtitle) == 0: 61 | # print("True") 62 | # continue 63 | 64 | if two_d.shape == (0, 2048) or three_d.shape == (0, 2048): 65 | print("True") 66 | continue 67 | 68 | # two_d = two_d.mean(axis = 0) 69 | # three_d = three_d.mean(axis = 0) 70 | 71 | two_d = two_d.max(axis = 0) 72 | three_d = three_d.max(axis = 0) 73 | 74 | features['2d'] = two_d 75 | features['3d'] = three_d 76 | features['caption'] = subtitle 77 | features['id'] = vid_name.replace('.mp4', '') 78 | 79 | data.append(features) 80 | count_match += 1 81 | 82 | print("Count = ", count) 83 | total_count += count 84 | 85 | print("Count match = ", count_match) 86 | print("Total count = ", total_count) 87 | 88 | with open(join(base_dir, 'm1_m2_10s15s_2d3d.pkl'), 'wb') as handle: 89 | pkl.dump(data, handle, protocol=pkl.HIGHEST_PROTOCOL) 90 | -------------------------------------------------------------------------------- /code/lecture_aware_embds/video_feature_extractor/create_pickle_indi.py: -------------------------------------------------------------------------------- 1 | import pickle as pkl 2 | import os.path 3 | import os 4 | import argparse 5 | 6 | from os.path import join, isfile 7 | from glob import glob 8 | 9 | import numpy as np 10 | 11 | # The default path of DataSubset is '/ssd_scratch/cvit/AVLectures/DataSubset' (also referred to as base_dir) 12 | # base_dir should also contain a directory inside it with the name "features" which inturn contains two directories called "2d" and "3d" which contains 2d and 3d video features respectively. 13 | # If you want to change the default path, then you can do it using the optional '--base_dir' argument 14 | # After executing this code, a pickle file called 'avl.pkl' would be created inside the base_dir. 
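# Feature lookup order used below: each segment's 2d/3d .npy features are first looked up
# under 'features_m011/{2d,3d}' and, if not found there, under 'features/{2d,3d}';
# segments missing from both locations are skipped.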
15 | 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument("-b", "--base_dir", type=str, required=False, help="Path of DataSubset", default = '/ssd_scratch/cvit/AVLectures/DataSubset') 18 | args = parser.parse_args() 19 | 20 | base_dir = args.base_dir 21 | print("Base Directory:") 22 | print(base_dir) 23 | 24 | data = [] 25 | 26 | folder_list = [] 27 | 28 | for fl in glob(join(base_dir, '*')): 29 | # if ('mit011' in fl) or ('mit012' in fl): 30 | # continue 31 | # if 'mit' in fl: 32 | # folder_list.append(fl) 33 | 34 | if ('mit006' in fl) or ('mit011' in fl): 35 | folder_list.append(fl) 36 | 37 | folder_list.sort() 38 | print(folder_list) 39 | 40 | count_match = 0 41 | total_count = 0 42 | 43 | for folder in folder_list: 44 | count = 0 45 | 46 | print("Inside - ", folder) 47 | 48 | with open(join(folder, 'combined.txt'), 'r') as text_file: 49 | 50 | lines = text_file.readlines() 51 | 52 | for line in lines: 53 | count += 1 54 | features = {} 55 | vid_name = line.split('|')[0] 56 | subtitle = line.split('|')[1] 57 | subtitle = " ".join(subtitle.split()) 58 | 59 | features_name = vid_name.replace('.mp4', '.npy') 60 | 61 | if isfile(join(base_dir, 'features_m011', '2d', features_name)) and isfile(join(base_dir, 'features_m011', '3d', features_name)): 62 | 63 | two_d = np.load(join(base_dir, 'features_m011', '2d', features_name)) 64 | three_d = np.load(join(base_dir, 'features_m011', '3d', features_name)) 65 | 66 | if two_d.shape == (0, 2048) or three_d.shape == (0, 2048) or len(subtitle) == 0: 67 | print("True") 68 | continue 69 | 70 | # two_d = two_d.mean(axis = 0) 71 | # three_d = three_d.mean(axis = 0) 72 | 73 | two_d = two_d.max(axis = 0) 74 | three_d = three_d.max(axis = 0) 75 | 76 | features['2d'] = two_d 77 | features['3d'] = three_d 78 | features['caption'] = subtitle 79 | features['id'] = vid_name.replace('.mp4', '') 80 | 81 | data.append(features) 82 | count_match += 1 83 | 84 | elif isfile(join(base_dir, 'features', '2d', features_name)) and isfile(join(base_dir, 'features', '3d', features_name)): 85 | 86 | two_d = np.load(join(base_dir, 'features', '2d', features_name)) 87 | three_d = np.load(join(base_dir, 'features', '3d', features_name)) 88 | 89 | if two_d.shape == (0, 2048) or three_d.shape == (0, 2048) or len(subtitle) == 0: 90 | print("True") 91 | continue 92 | 93 | # two_d = two_d.mean(axis = 0) 94 | # three_d = three_d.mean(axis = 0) 95 | 96 | two_d = two_d.max(axis = 0) 97 | three_d = three_d.max(axis = 0) 98 | 99 | features['2d'] = two_d 100 | features['3d'] = three_d 101 | features['caption'] = subtitle 102 | features['id'] = vid_name.replace('.mp4', '') 103 | 104 | data.append(features) 105 | count_match += 1 106 | 107 | print("Count = ", count) 108 | total_count += count 109 | 110 | print("Count match = ", count_match) 111 | print("Total count = ", total_count) 112 | 113 | with open(join(base_dir, 'm6m11_pysd.pkl'), 'wb') as handle: 114 | pkl.dump(data, handle, protocol=pkl.HIGHEST_PROTOCOL) 115 | -------------------------------------------------------------------------------- /code/lecture_aware_embds/video_feature_extractor/create_pickle_ocr.py: -------------------------------------------------------------------------------- 1 | import pickle as pkl 2 | import os.path 3 | import os 4 | import argparse 5 | import json 6 | 7 | from os.path import join, isfile 8 | from glob import glob 9 | 10 | import numpy as np 11 | import subprocess 12 | 13 | # The default path of DataSubset is '/ssd_scratch/cvit/AVLectures/DataSubset' (also referred to as base_dir) 14 | # 
base_dir should also contain a directory inside it with the name "features" which inturn contains two directories called "2d" and "3d" which contains 2d and 3d video features respectively. 15 | # If you want to change the default path, then you can do it using the optional '--base_dir' argument 16 | # After executing this code, a pickle file called 'avl.pkl' would be created inside the base_dir. 17 | 18 | parser = argparse.ArgumentParser() 19 | parser.add_argument("-b", "--base_dir", type=str, required=False, help="Path of DataSubset", default = '/ssd_scratch/cvit/AVLectures/DataSubset') 20 | args = parser.parse_args() 21 | 22 | base_dir = args.base_dir 23 | print("Base Directory:") 24 | print(base_dir) 25 | 26 | delimiter = "@#@" 27 | 28 | ocr_dir = '/ssd_scratch/cvit/darshan/OCR_dataset_MITOCW_v1' 29 | 30 | seg_stats_dir = '/home2/darshan.singh/Segmentation/stats' 31 | 32 | def get_length(filename): 33 | result = subprocess.run(["ffprobe", "-v", "error", "-show_entries", 34 | "format=duration", "-of", 35 | "default=noprint_wrappers=1:nokey=1", filename], 36 | stdout=subprocess.PIPE, 37 | stderr=subprocess.STDOUT) 38 | return float(result.stdout) 39 | 40 | def toFFMPEGtime(t): 41 | ss, ms = divmod(t*1000, 1000) 42 | mm, ss = divmod(ss, 60) 43 | hh, mm = divmod(mm, 60) 44 | 45 | return "{:02d}:{:02d}:{:02d},{:03d}".format(int(hh), int(mm), int(ss), int(ms)) 46 | 47 | def getTime(t): 48 | h, m, sms = t.split(":") 49 | if ',' in sms: # Example t = '00:00:03,980' 50 | s, ms = sms.split(",") 51 | elif '.' in sms: # Example t = '00:00:03.980' 52 | s, ms = sms.split(".") 53 | else: # Example t = '00:00:03' 54 | s = sms 55 | ms = 0 56 | tm = 3600 * int(h) + 60 * int(m) + int(s) + int(ms)/1000 57 | return tm 58 | 59 | def getOCR(course_name, vid_name, st, et): 60 | lec_name = "-".join(vid_name.split('-')[:-1]) 61 | split_num = int(vid_name.split('-')[-1].replace('.mp4', '')) 62 | 63 | # json_data = glob(join(ocr_dir, course_name, lec_name, '*.json'))[0] 64 | # json_data = open(json_data, 'r') 65 | # json_data = json.load(json_data) 66 | # fps = round(json_data['frame_metadata']['True FPS'], 2) 67 | 68 | ocr_text = "" 69 | ocr_frame_num = None 70 | ocr_frame_ts = None 71 | ocr_lec_name = lec_name 72 | 73 | for fl in glob(join(ocr_dir, course_name, lec_name, '*.json')): 74 | json_data_fl_ref = open(fl, 'r') 75 | json_data_fl = json.load(json_data_fl_ref) 76 | json_data_fl_ref.close() 77 | fps = json_data_fl['frame_metadata']['True FPS'] 78 | frame_num = json_data_fl['frame_metadata']['Frame number'] 79 | 80 | if frame_num == 1: 81 | continue 82 | 83 | frame_ts = round(frame_num / fps) 84 | 85 | if st <= frame_ts and frame_ts <= et: 86 | if 'fullTextAnnotation' in json_data_fl: 87 | ocr_text = json_data_fl['fullTextAnnotation']['text'] 88 | ocr_frame_num = frame_num 89 | ocr_frame_ts = frame_ts 90 | 91 | return ocr_text, ocr_frame_num, ocr_frame_ts, ocr_lec_name 92 | 93 | data = [] 94 | 95 | folder_list = [] 96 | 97 | for fl in glob(join(base_dir, '*')): 98 | if 'mit' in fl: 99 | folder_list.append(fl) 100 | 101 | folder_list.sort() 102 | print(folder_list) 103 | 104 | count_match = 0 105 | total_count = 0 106 | 107 | for folder in folder_list: 108 | count = 0 109 | course_name = folder.split('/')[-1] 110 | 111 | print("Inside - ", course_name) 112 | 113 | with open(join(folder, 'combined.txt'), 'r') as text_file: 114 | 115 | lines = text_file.readlines() 116 | 117 | for line in lines: 118 | count += 1 119 | features = {} 120 | vid_name = line.split(delimiter)[0] 121 | subtitle = 
line.split(delimiter)[1] 122 | subtitle = " ".join(subtitle.split()) 123 | st = getTime(line.split(delimiter)[2]) 124 | et = getTime(line.split(delimiter)[3]) 125 | features_name = vid_name.replace('.mp4', '.npy') 126 | 127 | if isfile(join(base_dir, 'features', '2d', features_name)) and isfile(join(base_dir, 'features', '3d', features_name)): 128 | 129 | two_d = np.load(join(base_dir, 'features', '2d', features_name)) 130 | three_d = np.load(join(base_dir, 'features', '3d', features_name)) 131 | 132 | # if two_d.shape == (0, 2048) or three_d.shape == (0, 2048) or len(subtitle) == 0: 133 | # print("True") 134 | # continue 135 | 136 | if two_d.shape == (0, 2048) or three_d.shape == (0, 2048): 137 | print("True") 138 | continue 139 | 140 | # two_d = two_d.mean(axis = 0) 141 | # three_d = three_d.mean(axis = 0) 142 | 143 | two_d = two_d.max(axis = 0) 144 | three_d = three_d.max(axis = 0) 145 | 146 | features['2d'] = two_d 147 | features['3d'] = three_d 148 | features['caption'] = subtitle 149 | features['id'] = vid_name.replace('.mp4', '') 150 | features['vid_duration_ffprobe'] = get_length(join(folder, 'splits_vid', vid_name)) 151 | features['vid_duration'] = et - st 152 | features['st'] = st 153 | features['et'] = et 154 | 155 | # retrieving OCR data 156 | lec_name = "-".join(vid_name.split('-')[:-1]) 157 | lec_num = int(lec_name.split('-')[-1].replace('_300k', '').replace('lec', '')) 158 | 159 | seg_stats = pkl.load(open(join(seg_stats_dir, course_name + '.pkl'), 'rb')) 160 | 161 | if lec_num in seg_stats: 162 | offset = int(seg_stats[lec_num]['st']) 163 | else: 164 | offset = 0 165 | 166 | ocr_data = getOCR(course_name, vid_name, float(st) + offset, float(et) + offset) 167 | 168 | features['ocr_text'] = " ".join(ocr_data[0].split()) 169 | features['ocr_frame_num'] = ocr_data[1] 170 | features['ocr_frame_ts'] = ocr_data[2] 171 | features['ocr_lec_name'] = ocr_data[3] 172 | features['offset'] = offset 173 | 174 | data.append(features) 175 | count_match += 1 176 | 177 | print("Count = ", count) 178 | total_count += count 179 | 180 | print("Count match = ", count_match) 181 | print("Total count = ", total_count) 182 | 183 | with open(join(base_dir, 'm1_m2_20s25s_2d3d.pkl'), 'wb') as handle: 184 | pkl.dump(data, handle, protocol=pkl.HIGHEST_PROTOCOL) 185 | -------------------------------------------------------------------------------- /code/lecture_aware_embds/video_feature_extractor/create_pickle_prevnext.py: -------------------------------------------------------------------------------- 1 | import pickle as pkl 2 | import os.path 3 | import os 4 | import argparse 5 | 6 | from os.path import join, isfile 7 | from glob import glob 8 | 9 | import numpy as np 10 | 11 | # The default path of DataSubset is '/ssd_scratch/cvit/AVLectures/DataSubset' (also referred to as base_dir) 12 | # base_dir should also contain a directory inside it with the name "features" which inturn contains two directories called "2d" and "3d" which contains 2d and 3d video features respectively. 13 | # If you want to change the default path, then you can do it using the optional '--base_dir' argument 14 | # After executing this code, a pickle file called 'avl.pkl' would be created inside the base_dir. 
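# How the caption is assembled below (illustration; the file name is the hypothetical one also
# used in the comments further down): for split k of a lecture, the stored 'caption' is
#   subtitle(k-1) + " " + subtitle(k) + " " + subtitle(k+1)
# where the previous/next subtitles are included only if they belong to the same lecture and
# have consecutive split ids. E.g. for MIT6_042JF10_lec17_300k-00005.mp4 the caption combines
# splits 00004, 00005 and 00006, while the 2d/3d features remain those of split 00005 alone.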
15 | 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument("-b", "--base_dir", type=str, required=False, help="Path of DataSubset", default = '/ssd_scratch/cvit/AVLectures/DataSubset') 18 | args = parser.parse_args() 19 | 20 | base_dir = args.base_dir 21 | print("Base Directory:") 22 | print(base_dir) 23 | 24 | delimiter = "@#@" 25 | 26 | data = [] 27 | 28 | folder_list = [] 29 | 30 | def getTime(t): 31 | h, m, sms = t.split(":") 32 | if ',' in sms: # Example t = '00:00:03,980' 33 | s, ms = sms.split(",") 34 | elif '.' in sms: # Example t = '00:00:03.980' 35 | s, ms = sms.split(".") 36 | else: # Example t = '00:00:03' 37 | s = sms 38 | ms = '0' 39 | tm = 3600 * int(h) + 60 * int(m) + int(s) + int(ms.ljust(3, '0'))/1000 40 | return tm 41 | 42 | for fl in glob(join(base_dir, '*')): 43 | # if ('mit011' in fl) or ('mit012' in fl): 44 | # continue 45 | if 'mit' in fl: 46 | folder_list.append(fl) 47 | 48 | folder_list.sort() 49 | print(folder_list) 50 | 51 | count_match = 0 52 | total_count = 0 53 | 54 | for folder in folder_list: 55 | count = 0 56 | 57 | course_name = folder.split('/')[-1] 58 | print("Inside course -", course_name) 59 | 60 | # for glob(join(folder, 'subtitles')) 61 | 62 | with open(join(folder, 'combined.txt'), 'r') as text_file: 63 | 64 | lines = text_file.readlines() 65 | lines = sorted(lines, key = lambda line: (line.split(delimiter))[0]) 66 | 67 | for i in range(len(lines)): 68 | l = lines[i].strip() 69 | count += 1 70 | features = {} 71 | 72 | present_vid = lines[i].split(delimiter)[0] # example : "MIT6_042JF10_lec17_300k-00000.mp4" 73 | present_vid_name = '-'.join((present_vid.split('-'))[:-1]) # example : "MIT6_042JF10_lec17_300k" 74 | present_vid_id = (present_vid.split('-'))[-1] # example : "00000.mp4" 75 | present_vid_id = int(present_vid_id.replace('.mp4', '')) # example : 0 76 | present_vid_subtitle = lines[i].split(delimiter)[1] 77 | present_vid_subtitle = " ".join(present_vid_subtitle.split()) 78 | 79 | prev_vid = "" 80 | next_vid = "" 81 | 82 | subtitle = "" 83 | 84 | if i > 0: 85 | prev_vid = lines[i - 1].split(delimiter)[0] 86 | if i < len(lines) - 1: 87 | next_vid = lines[i + 1].split(delimiter)[0] 88 | 89 | 90 | if prev_vid != '': 91 | 92 | prev_vid_name = '-'.join((prev_vid.split('-'))[:-1]) 93 | prev_vid_id = (prev_vid.split('-'))[-1] 94 | prev_vid_id = int(prev_vid_id.replace('.mp4', '')) 95 | prev_vid_subtitle = lines[i - 1].split(delimiter)[1] 96 | prev_vid_subtitle = " ".join(prev_vid_subtitle.split()) 97 | 98 | if present_vid_name == prev_vid_name and prev_vid_id == present_vid_id - 1: 99 | subtitle = prev_vid_subtitle + " " 100 | 101 | subtitle = subtitle + present_vid_subtitle 102 | 103 | if next_vid != '': 104 | 105 | next_vid_name = '-'.join((next_vid.split('-'))[:-1]) 106 | next_vid_id = (next_vid.split('-'))[-1] 107 | next_vid_id = int(next_vid_id.replace('.mp4', '')) 108 | next_vid_subtitle = lines[i + 1].split(delimiter)[1] 109 | next_vid_subtitle = " ".join(next_vid_subtitle.split()) 110 | 111 | if present_vid_name == next_vid_name and next_vid_id == present_vid_id + 1: 112 | subtitle = subtitle + " " + next_vid_subtitle 113 | 114 | features_name = present_vid.replace('.mp4', '.npy') 115 | st = getTime(l.split(delimiter)[2]) 116 | et = getTime(l.split(delimiter)[3]) 117 | 118 | if isfile(join(base_dir, 'features', '2d', features_name)) and isfile(join(base_dir, 'features', '3d', features_name)): 119 | 120 | two_d = np.load(join(base_dir, 'features', '2d', features_name)) 121 | three_d = np.load(join(base_dir, 'features', '3d', 
features_name)) 122 | 123 | if two_d.shape == (0, 2048) or three_d.shape == (0, 2048): 124 | print("True") 125 | continue 126 | 127 | # two_d = two_d.mean(axis = 0) 128 | # three_d = three_d.mean(axis = 0) 129 | 130 | two_d = two_d.max(axis = 0) 131 | three_d = three_d.max(axis = 0) 132 | 133 | features['2d'] = two_d 134 | features['3d'] = three_d 135 | features['caption'] = subtitle 136 | features['id'] = present_vid.replace('.mp4', '') 137 | features['st'] = st 138 | features['et'] = et 139 | 140 | features['vid_duration'] = et - st 141 | 142 | features['course_name'] = course_name 143 | 144 | data.append(features) 145 | count_match += 1 146 | 147 | print("Count = ", count) 148 | total_count += count 149 | 150 | print("Count match = ", count_match) 151 | print("Total count = ", total_count) 152 | 153 | with open(join(base_dir, 'seg_10s15s_2d3dprevnext.pkl'), 'wb') as handle: 154 | pkl.dump(data, handle, protocol=pkl.HIGHEST_PROTOCOL) -------------------------------------------------------------------------------- /code/lecture_aware_embds/video_feature_extractor/create_pickle_seg2.py: -------------------------------------------------------------------------------- 1 | import pickle as pkl 2 | import os.path 3 | import os 4 | import argparse 5 | import json 6 | 7 | from tqdm import tqdm 8 | 9 | from os.path import join, isfile 10 | from glob import glob 11 | 12 | # from sentence_transformers import SentenceTransformer, util 13 | 14 | import numpy as np 15 | import subprocess 16 | 17 | # The default path of DataSubset is '/ssd_scratch/cvit/AVLectures/DataSubset' (also referred to as base_dir) 18 | # base_dir should also contain a directory inside it with the name "features" which inturn contains two directories called "2d" and "3d" which contains 2d and 3d video features respectively. 19 | # If you want to change the default path, then you can do it using the optional '--base_dir' argument 20 | # After executing this code, a pickle file called 'avl.pkl' would be created inside the base_dir. 21 | 22 | parser = argparse.ArgumentParser() 23 | parser.add_argument("-b", "--base_dir", type=str, required=False, help="Path of DataSubset", default = '/ssd_scratch/cvit/AVLectures/DataSubset') 24 | args = parser.parse_args() 25 | 26 | base_dir = args.base_dir 27 | print("Base Directory:") 28 | print(base_dir) 29 | 30 | delimiter = "@#@" 31 | 32 | ocr_dir = '/ssd_scratch/cvit/darshan/OCR' 33 | 34 | def get_length(filename): 35 | result = subprocess.run(["ffprobe", "-v", "error", "-show_entries", 36 | "format=duration", "-of", 37 | "default=noprint_wrappers=1:nokey=1", filename], 38 | stdout=subprocess.PIPE, 39 | stderr=subprocess.STDOUT) 40 | return float(result.stdout) 41 | 42 | def toFFMPEGtime(t): 43 | ss, ms = divmod(t*1000, 1000) 44 | mm, ss = divmod(ss, 60) 45 | hh, mm = divmod(mm, 60) 46 | 47 | return "{:02d}:{:02d}:{:02d},{:03d}".format(int(hh), int(mm), int(ss), int(ms)) 48 | 49 | def getTime(t): 50 | h, m, sms = t.split(":") 51 | if ',' in sms: # Example t = '00:00:03,980' 52 | s, ms = sms.split(",") 53 | elif '.' 
in sms: # Example t = '00:00:03.980' 54 | s, ms = sms.split(".") 55 | else: # Example t = '00:00:03' 56 | s = sms 57 | ms = '0' 58 | tm = 3600 * int(h) + 60 * int(m) + int(s) + int(ms.ljust(3, '0'))/1000 59 | return tm 60 | 61 | def getOCR(course_name, vid_name, st, et): 62 | lec_name = "-".join(vid_name.split('-')[:-1]) 63 | split_num = int(vid_name.split('-')[-1].replace('.mp4', '')) 64 | 65 | # json_data = glob(join(ocr_dir, course_name, lec_name, '*.json'))[0] 66 | # json_data = open(json_data, 'r') 67 | # json_data = json.load(json_data) 68 | # fps = round(json_data['frame_metadata']['True FPS'], 2) 69 | 70 | ocr_text = "" 71 | ocr_frame_num = None 72 | ocr_frame_ts = None 73 | ocr_lec_name = lec_name 74 | 75 | for fl in glob(join(ocr_dir, course_name, lec_name, '*.json')): 76 | json_data_fl_ref = open(fl, 'r') 77 | json_data_fl = json.load(json_data_fl_ref) 78 | json_data_fl_ref.close() 79 | fps = json_data_fl['frame_metadata']['True FPS'] 80 | frame_num = json_data_fl['frame_metadata']['Frame number'] 81 | 82 | if frame_num == 1: 83 | continue 84 | 85 | frame_ts = round(frame_num / fps, 3) 86 | 87 | if st <= frame_ts and frame_ts <= et: 88 | if 'fullTextAnnotation' in json_data_fl: 89 | ocr_text = json_data_fl['fullTextAnnotation']['text'] 90 | ocr_frame_num = frame_num 91 | ocr_frame_ts = frame_ts 92 | break 93 | 94 | return ocr_text, ocr_frame_num, ocr_frame_ts, ocr_lec_name 95 | 96 | data = [] 97 | 98 | folder_list = [] 99 | 100 | for fl in glob(join(base_dir, '*')): 101 | if 'mit' in fl: 102 | folder_list.append(fl) 103 | 104 | folder_list.sort() 105 | print(folder_list) 106 | 107 | # model_qa = SentenceTransformer('multi-qa-mpnet-base-dot-v1') 108 | # model_ss = SentenceTransformer('all-mpnet-base-v2') 109 | 110 | count_match = 0 111 | total_count = 0 112 | 113 | for folder in folder_list: 114 | count = 0 115 | 116 | # print("Inside - ", folder) 117 | course_name = folder.split('/')[-1] 118 | print("Inside course -", course_name) 119 | 120 | with open(join(folder, 'combined.txt'), 'r') as text_file: 121 | 122 | lines = text_file.readlines() 123 | 124 | line_lst = [] 125 | 126 | # for line in lines: 127 | # l = line.strip() 128 | # line_lst.append(l) 129 | 130 | for line in tqdm(lines): 131 | l = line.strip() 132 | count += 1 133 | features = {} 134 | vid_name = l.split(delimiter)[0] 135 | subtitle = l.split(delimiter)[1] 136 | subtitle = " ".join(subtitle.split()) 137 | st = getTime(l.split(delimiter)[2]) 138 | et = getTime(l.split(delimiter)[3]) 139 | features_name = vid_name.replace('.mp4', '.npy') 140 | 141 | # if isfile(join(base_dir, 'features', '2d', features_name)): 142 | if isfile(join(base_dir, 'features', '2d', features_name)) and isfile(join(base_dir, 'features', '3d', features_name)): 143 | 144 | two_d = np.load(join(base_dir, 'features', '2d', features_name)) 145 | three_d = np.load(join(base_dir, 'features', '3d', features_name)) 146 | 147 | #if two_d.shape == (0, 2048): 148 | if two_d.shape == (0, 2048) or three_d.shape == (0, 2048): 149 | print("True") 150 | continue 151 | 152 | # if mean-pooling 153 | # two_d = two_d.mean(axis = 0) 154 | # three_d = three_d.mean(axis = 0) 155 | 156 | # if max-pooling 157 | two_d = two_d.max(axis = 0) 158 | three_d = three_d.max(axis = 0) 159 | 160 | features['2d'] = two_d 161 | features['3d'] = three_d 162 | features['caption'] = subtitle 163 | features['id'] = vid_name.replace('.mp4', '') 164 | # features['vid_duration'] = get_length(join(folder, 'splits_vid', vid_name)) 165 | features['vid_duration'] = et - st 166 | 
features['st'] = st 167 | features['et'] = et 168 | features['course_name'] = course_name 169 | 170 | # features['emb_qa'] = model_qa.encode(subtitle) 171 | # features['emb_ss'] = model_ss.encode(subtitle) 172 | 173 | # retrieving OCR data 174 | lec_name = "-".join(vid_name.split('-')[:-1]) 175 | 176 | ocr_data = getOCR(course_name, vid_name, float(st), float(et)) 177 | 178 | features['ocr_text'] = " ".join(ocr_data[0].split()) 179 | features['ocr_frame_num'] = ocr_data[1] 180 | features['ocr_frame_ts'] = ocr_data[2] 181 | features['ocr_lec_name'] = ocr_data[3] 182 | 183 | data.append(features) 184 | count_match += 1 185 | 186 | print("Count = ", count) 187 | total_count += count 188 | 189 | print("Count match = ", count_match) 190 | print("Total count = ", total_count) 191 | 192 | with open(join(base_dir, 'v1_1.pkl'), 'wb') as handle: 193 | pkl.dump(data, handle, protocol=pkl.HIGHEST_PROTOCOL) 194 | -------------------------------------------------------------------------------- /code/lecture_aware_embds/video_feature_extractor/create_pickle_seg2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH -A darshan.singh 3 | #SBATCH -n 10 4 | #SBATCH --gres=gpu:1 5 | #SBATCH --mem-per-cpu=2G 6 | #SBATCH --time=INFINITE 7 | #SBATCH --mail-user=darshans012@gmail.com 8 | #SBATCH --mail-type=ALL 9 | 10 | module load ffmpeg/4.4.1 11 | 12 | echo "Pickling started" 13 | 14 | python create_pickle_seg2.py --base_dir='/ssd_scratch/cvit/darshan/segmentation_dataset_v1_10s15s' 15 | 16 | echo "Pickling ended successfully" 17 | -------------------------------------------------------------------------------- /code/lecture_aware_embds/video_feature_extractor/create_pickle_seg2_55.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH -A darshan.singh 3 | #SBATCH -n 10 4 | #SBATCH --gres=gpu:1 5 | #SBATCH --mem-per-cpu=2G 6 | #SBATCH --time=INFINITE 7 | #SBATCH --mail-user=darshans012@gmail.com 8 | #SBATCH --mail-type=ALL 9 | 10 | module load ffmpeg/4.4.1 11 | 12 | echo "Pickling started" 13 | 14 | python create_pickle_seg2.py --base_dir='/ssd_scratch/cvit/darshan/segmentation_dataset_v1_10s15s' 15 | 16 | echo "Pickling ended successfully" 17 | -------------------------------------------------------------------------------- /code/lecture_aware_embds/video_feature_extractor/create_pickle_seg2_mp.py: -------------------------------------------------------------------------------- 1 | import pickle as pkl 2 | import os.path 3 | import os 4 | import argparse 5 | import json 6 | 7 | from concurrent.futures import ThreadPoolExecutor, as_completed, ProcessPoolExecutor 8 | 9 | from tqdm import tqdm 10 | 11 | from os.path import join, isfile 12 | from glob import glob 13 | 14 | # from sentence_transformers import SentenceTransformer, util 15 | 16 | import numpy as np 17 | import subprocess 18 | 19 | # The default path of DataSubset is '/ssd_scratch/cvit/AVLectures/DataSubset' (also referred to as base_dir) 20 | # base_dir should also contain a directory inside it with the name "features" which inturn contains two directories called "2d" and "3d" which contains 2d and 3d video features respectively. 21 | # If you want to change the default path, then you can do it using the optional '--base_dir' argument 22 | # After executing this code, a pickle file called 'avl.pkl' would be created inside the base_dir. 
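# Note (added for clarity): this multiprocessing variant does not rebuild the
# features from scratch. It loads the 'v1_1.pkl' produced by
# create_pickle_seg2.py (hard-coded path below) and enriches each record with
# OCR fields in parallel via ProcessPoolExecutor. Records whose course has no
# OCR directory come back from do_job() as None and are only filtered out
# later (see the `if f is not None` check in merge_and_bert.py). Also note
# that the --file_name argument is overwritten by the hard-coded "20_25" a few
# lines below, so the output file is always 'v1_2d3dOCR_20_25.pkl'.
#
# Illustrative helper (not part of the original script) to drop the None
# entries right after the parallel gather, if desired:
def _drop_missing_ocr(records):
    kept = [r for r in records if r is not None]
    print('kept {} of {} records with OCR directories'.format(len(kept), len(records)))
    return kept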
23 | 24 | parser = argparse.ArgumentParser() 25 | parser.add_argument("-b", "--base_dir", type=str, required=False, help="Path of DataSubset", default = '/ssd_scratch/cvit/AVLectures/DataSubset') 26 | parser.add_argument("-f", "--file_name", type=str, required=False, help="", default = 'm') 27 | args = parser.parse_args() 28 | 29 | base_dir = args.base_dir 30 | print("Base Directory:") 31 | print(base_dir) 32 | 33 | f_name = args.file_name 34 | f_name = "20_25" 35 | print(f_name) 36 | 37 | delimiter = "@#@" 38 | 39 | ocr_dir = '/ssd_scratch/cvit/darshan/OCR/dataset_MITOCW_v1' 40 | 41 | base_pkl = pkl.load(open('/ssd_scratch/cvit/darshan/dataset_MITOCW_v1_20s25s/v1_1.pkl', 'rb')) 42 | 43 | def get_length(filename): 44 | result = subprocess.run(["ffprobe", "-v", "error", "-show_entries", 45 | "format=duration", "-of", 46 | "default=noprint_wrappers=1:nokey=1", filename], 47 | stdout=subprocess.PIPE, 48 | stderr=subprocess.STDOUT) 49 | return float(result.stdout) 50 | 51 | def toFFMPEGtime(t): 52 | ss, ms = divmod(t*1000, 1000) 53 | mm, ss = divmod(ss, 60) 54 | hh, mm = divmod(mm, 60) 55 | 56 | return "{:02d}:{:02d}:{:02d},{:03d}".format(int(hh), int(mm), int(ss), int(ms)) 57 | 58 | def getTime(t): 59 | h, m, sms = t.split(":") 60 | if ',' in sms: # Example t = '00:00:03,980' 61 | s, ms = sms.split(",") 62 | elif '.' in sms: # Example t = '00:00:03.980' 63 | s, ms = sms.split(".") 64 | else: # Example t = '00:00:03' 65 | s = sms 66 | ms = '0' 67 | tm = 3600 * int(h) + 60 * int(m) + int(s) + int(ms.ljust(3, '0'))/1000 68 | return tm 69 | 70 | def getOCR(course_name, vid_name, st, et): 71 | lec_name = "-".join(vid_name.split('-')[:-1]) 72 | split_num = int(vid_name.split('-')[-1].replace('.mp4', '')) 73 | 74 | # json_data = glob(join(ocr_dir, course_name, lec_name, '*.json'))[0] 75 | # json_data = open(json_data, 'r') 76 | # json_data = json.load(json_data) 77 | # fps = round(json_data['frame_metadata']['True FPS'], 2) 78 | 79 | ocr_text = "" 80 | ocr_frame_num = None 81 | ocr_frame_ts = None 82 | ocr_lec_name = lec_name 83 | 84 | for fl in glob(join(ocr_dir, course_name, lec_name, '*.json')): 85 | json_data_fl_ref = open(fl, 'r') 86 | json_data_fl = json.load(json_data_fl_ref) 87 | json_data_fl_ref.close() 88 | fps = json_data_fl['frame_metadata']['True FPS'] 89 | frame_num = json_data_fl['frame_metadata']['Frame number'] 90 | 91 | if frame_num == 1: 92 | continue 93 | 94 | frame_ts = round(frame_num / fps, 3) 95 | 96 | if st <= frame_ts and frame_ts <= et: 97 | if 'fullTextAnnotation' in json_data_fl: 98 | ocr_text = json_data_fl['fullTextAnnotation']['text'] 99 | ocr_frame_num = frame_num 100 | ocr_frame_ts = frame_ts 101 | break 102 | 103 | return ocr_text, ocr_frame_num, ocr_frame_ts, ocr_lec_name 104 | 105 | 106 | def do_job(l): 107 | 108 | course_name = l['course_name'] 109 | 110 | if not os.path.isdir(join(ocr_dir, course_name)): 111 | return 112 | 113 | st = l['st'] 114 | et = l['et'] 115 | vid_name = l['id'] + '.mp4' 116 | 117 | lec_name = "-".join(vid_name.split('-')[:-1]) 118 | 119 | ocr_data = getOCR(course_name, vid_name, float(st), float(et)) 120 | 121 | l['ocr_text'] = " ".join(ocr_data[0].split()) 122 | l['ocr_frame_num'] = ocr_data[1] 123 | l['ocr_frame_ts'] = ocr_data[2] 124 | l['ocr_lec_name'] = ocr_data[3] 125 | 126 | return l 127 | 128 | 129 | count_match = 0 130 | total_count = 0 131 | # p = ThreadPoolExecutor(20) 132 | p = ProcessPoolExecutor(19) 133 | 134 | futures = [p.submit(do_job, li) for li in base_pkl] 135 | x = [r.result() for r in tqdm(as_completed(futures), 
total=len(futures))] 136 | 137 | # print(d_ocr[0]) 138 | # print(x) 139 | 140 | with open(join(base_dir, 'v1_2d3dOCR_{}.pkl'.format(f_name)), 'wb') as handle: 141 | pkl.dump(x, handle, protocol=pkl.HIGHEST_PROTOCOL) 142 | -------------------------------------------------------------------------------- /code/lecture_aware_embds/video_feature_extractor/create_pickle_seg2_mp_55.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH -A darshan.singh 3 | #SBATCH -n 20 4 | #SBATCH --gres=gpu:1 5 | #SBATCH --mem-per-cpu=2G 6 | #SBATCH --time=INFINITE 7 | #SBATCH --mail-user=darshans012@gmail.com 8 | #SBATCH --mail-type=ALL 9 | 10 | 11 | echo "Pickling started" 12 | 13 | python create_pickle_seg2_mp.py --base_dir='/ssd_scratch/cvit/darshan/dataset_MITOCW_v1' --file_name='m050_m156' 14 | 15 | echo "Pickling ended successfully" 16 | -------------------------------------------------------------------------------- /code/lecture_aware_embds/video_feature_extractor/create_pickle_seg2_mp_92.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH -A darshan.singh 3 | #SBATCH -n 20 4 | #SBATCH --gres=gpu:2 5 | #SBATCH --mem-per-cpu=2G 6 | #SBATCH --time=INFINITE 7 | #SBATCH --mail-user=darshans012@gmail.com 8 | #SBATCH --mail-type=ALL 9 | 10 | module load ffmpeg/4.4.1 11 | 12 | echo "Pickling started" 13 | 14 | python create_pickle_seg2_mp.py --base_dir='/ssd_scratch/cvit/darshan/dataset_MITOCW_v1' --file_name='m080_mit103' 15 | 16 | echo "Pickling ended successfully" 17 | -------------------------------------------------------------------------------- /code/lecture_aware_embds/video_feature_extractor/create_pickle_segmentation.py: -------------------------------------------------------------------------------- 1 | import pickle as pkl 2 | import os.path 3 | import os 4 | import argparse 5 | 6 | from os.path import join, isfile 7 | from glob import glob 8 | 9 | import numpy as np 10 | 11 | # The default path of DataSubset is '/ssd_scratch/cvit/AVLectures/DataSubset' (also referred to as base_dir) 12 | # base_dir should also contain a directory inside it with the name "features" which inturn contains two directories called "2d" and "3d" which contains 2d and 3d video features respectively. 13 | # If you want to change the default path, then you can do it using the optional '--base_dir' argument 14 | # After executing this code, a pickle file called 'avl.pkl' would be created inside the base_dir. 
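# Note (added for clarity): despite the header above, this variant writes
# 'seg.pkl' (see the bottom of this script), splits combined.txt on '|' rather
# than '@#@', and keeps only the 2d features: every row of a clip's (n, 2048)
# 2d matrix becomes its own entry of the form
#   {'2d': <2048-d row>, 'caption': 'frame <k>', 'id': 'frame <k+1>'}
# Because row_num is incremented between setting 'caption' and 'id', the two
# frame indices differ by one for every entry; downstream code should rely on
# only one of them.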
15 | 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument("-b", "--base_dir", type=str, required=False, help="Path of DataSubset", default = '/ssd_scratch/cvit/AVLectures/DataSubset') 18 | args = parser.parse_args() 19 | 20 | base_dir = args.base_dir 21 | print("Base Directory:") 22 | print(base_dir) 23 | 24 | data = [] 25 | 26 | folder_list = [] 27 | 28 | for fl in glob(join(base_dir, '*')): 29 | if 'mit' in fl: 30 | folder_list.append(fl) 31 | 32 | folder_list.sort() 33 | print(folder_list) 34 | 35 | count_match = 0 36 | total_count = 0 37 | row_num = 0 38 | 39 | for folder in folder_list: 40 | count = 0 41 | 42 | print("Inside - ", folder) 43 | 44 | with open(join(folder, 'combined.txt'), 'r') as text_file: 45 | 46 | lines = text_file.readlines() 47 | 48 | for line in lines: 49 | count += 1 50 | 51 | vid_name = line.split('|')[0] 52 | subtitle = line.split('|')[1] 53 | features_name = vid_name.replace('.mp4', '.npy') 54 | 55 | #if isfile(join(base_dir, 'features', '2d', features_name)) and isfile(join(base_dir, 'features', '3d', features_name)): 56 | 57 | two_d = np.load(join(base_dir, 'features', '2d', features_name)) 58 | # three_d = np.load(join(base_dir, 'features', '3d', features_name)) 59 | 60 | if two_d.shape == (0, 2048): 61 | print("True") 62 | continue 63 | 64 | # two_d = two_d.mean(axis = 0) 65 | # three_d = three_d.mean(axis = 0) 66 | 67 | # two_d = two_d.max(axis = 0) 68 | # three_d = three_d.max(axis = 0) 69 | 70 | for row in two_d: 71 | features = {} 72 | features['2d'] = row 73 | features['caption'] = 'frame ' + str(row_num) 74 | row_num += 1 75 | features['id'] = 'frame ' + str(row_num) 76 | data.append(features) 77 | count_match += 1 78 | 79 | 80 | # features['2d'] = two_d 81 | # features['3d'] = three_d 82 | # features['caption'] = subtitle 83 | # features['id'] = vid_name.replace('.mp4', '') 84 | 85 | # data.append(features) 86 | # count_match += 1 87 | 88 | print("Count = ", count) 89 | total_count += count 90 | 91 | print("Count match = ", count_match) 92 | print("Total count = ", total_count) 93 | 94 | with open(join(base_dir, 'seg.pkl'), 'wb') as handle: 95 | pkl.dump(data, handle, protocol=pkl.HIGHEST_PROTOCOL) 96 | -------------------------------------------------------------------------------- /code/lecture_aware_embds/video_feature_extractor/create_pkl_tst.py: -------------------------------------------------------------------------------- 1 | import pickle as pkl 2 | import os.path 3 | import os 4 | import argparse 5 | 6 | from os.path import join, isfile 7 | from glob import glob 8 | 9 | import numpy as np 10 | 11 | # The default path of DataSubset is '/ssd_scratch/cvit/AVLectures/DataSubset' (also referred to as base_dir) 12 | # base_dir should also contain a directory inside it with the name "features" which inturn contains two directories called "2d" and "3d" which contains 2d and 3d video features respectively. 13 | # If you want to change the default path, then you can do it using the optional '--base_dir' argument 14 | # After executing this code, a pickle file called 'avl.pkl' would be created inside the base_dir. 
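# Note (added for clarity): this test variant departs from the header above in
# several ways: it only processes folders matching 'mit006', it splits
# combined.txt on '|' instead of '@#@', it stores no feature vectors, and the
# single dict it pickles maps each segment id to its cleaned subtitle string,
# written as 'm6_pysd_subs.pkl' rather than 'avl.pkl'. Illustrative usage of
# the resulting file (the segment id shown is hypothetical):
#   subs = pkl.load(open(join(base_dir, 'm6_pysd_subs.pkl'), 'rb'))
#   print(subs['MIT6_042JF10_lec17_300k-00000'])   # -> one subtitle string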
15 | 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument("-b", "--base_dir", type=str, required=False, help="Path of DataSubset", default = '/ssd_scratch/cvit/AVLectures/DataSubset') 18 | args = parser.parse_args() 19 | 20 | base_dir = args.base_dir 21 | print("Base Directory:") 22 | print(base_dir) 23 | 24 | data = [] 25 | 26 | folder_list = [] 27 | 28 | for fl in glob(join(base_dir, '*')): 29 | # if ('mit011' in fl) or ('mit012' in fl): 30 | # continue 31 | # if 'mit' in fl: 32 | # folder_list.append(fl) 33 | 34 | if ('mit006' in fl): 35 | folder_list.append(fl) 36 | 37 | folder_list.sort() 38 | print(folder_list) 39 | 40 | count_match = 0 41 | total_count = 0 42 | 43 | features = {} 44 | 45 | for folder in folder_list: 46 | count = 0 47 | 48 | print("Inside - ", folder) 49 | 50 | with open(join(folder, 'combined.txt'), 'r') as text_file: 51 | 52 | lines = text_file.readlines() 53 | 54 | for line in lines: 55 | count += 1 56 | 57 | vid_name = line.split('|')[0] 58 | subtitle = line.split('|')[1] 59 | subtitle = " ".join(subtitle.split()) 60 | 61 | features_name = vid_name.replace('.mp4', '.npy') 62 | 63 | if isfile(join(base_dir, 'features', '2d', features_name)) and isfile(join(base_dir, 'features', '3d', features_name)): 64 | 65 | two_d = np.load(join(base_dir, 'features', '2d', features_name)) 66 | three_d = np.load(join(base_dir, 'features', '3d', features_name)) 67 | 68 | if two_d.shape == (0, 2048) or three_d.shape == (0, 2048) or len(subtitle) == 0: 69 | print("True") 70 | continue 71 | 72 | # two_d = two_d.mean(axis = 0) 73 | # three_d = three_d.mean(axis = 0) 74 | 75 | two_d = two_d.max(axis = 0) 76 | three_d = three_d.max(axis = 0) 77 | 78 | # features['2d'] = two_d 79 | # features['3d'] = three_d 80 | # features['caption'] = subtitle 81 | features[features_name.replace('.npy', '')] = subtitle 82 | 83 | # data.append(features) 84 | count_match += 1 85 | 86 | print("Count = ", count) 87 | total_count += count 88 | 89 | print("Count match = ", count_match) 90 | print("Total count = ", total_count) 91 | 92 | with open(join(base_dir, 'm6_pysd_subs.pkl'), 'wb') as handle: 93 | pkl.dump(features, handle, protocol=pkl.HIGHEST_PROTOCOL) 94 | -------------------------------------------------------------------------------- /code/lecture_aware_embds/video_feature_extractor/extract.py: -------------------------------------------------------------------------------- 1 | import torch as th 2 | import math 3 | import numpy as np 4 | from video_loader import VideoLoader 5 | from torch.utils.data import DataLoader 6 | import argparse 7 | from model import get_model 8 | from preprocessing import Preprocessing 9 | from random_sequence_shuffler import RandomSequenceSampler 10 | import torch.nn.functional as F 11 | import os 12 | 13 | parser = argparse.ArgumentParser(description='Easy video feature extractor') 14 | 15 | parser.add_argument( 16 | '--csv', 17 | type=str, 18 | help='input csv with video input path') 19 | parser.add_argument('--batch_size', type=int, default=64, 20 | help='batch size') 21 | parser.add_argument('--type', type=str, default='2d', 22 | help='CNN type') 23 | parser.add_argument('--half_precision', type=int, default=1, 24 | help='output half precision float') 25 | parser.add_argument('--num_decoding_thread', type=int, default=4, 26 | help='Num parallel thread for video decoding') 27 | parser.add_argument('--l2_normalize', type=int, default=1, 28 | help='l2 normalize feature') 29 | parser.add_argument('--resnext101_model_path', type=str, 
default='model/resnext101.pth', 30 | help='Resnext model path') 31 | args = parser.parse_args() 32 | 33 | dataset = VideoLoader( 34 | args.csv, 35 | framerate=1 if args.type == '2d' else 24, 36 | size=224 if args.type == '2d' else 112, 37 | centercrop=(args.type == '3d'), 38 | ) 39 | n_dataset = len(dataset) 40 | sampler = RandomSequenceSampler(n_dataset, 10) 41 | loader = DataLoader( 42 | dataset, 43 | batch_size=1, 44 | shuffle=False, 45 | num_workers=args.num_decoding_thread, 46 | sampler=sampler if n_dataset > 10 else None, 47 | ) 48 | preprocess = Preprocessing(args.type) 49 | model = get_model(args) 50 | 51 | with th.no_grad(): 52 | for k, data in enumerate(loader): 53 | input_file = data['input'][0] 54 | output_file = data['output'][0] 55 | base_dir = "/".join(output_file.split('/')[:-1]) 56 | os.makedirs(base_dir, exist_ok=True) 57 | if len(data['video'].shape) > 3: 58 | print('Computing features of video {}/{}: {}'.format( 59 | k + 1, n_dataset, input_file)) 60 | video = data['video'].squeeze() 61 | if len(video.shape) == 4: 62 | video = preprocess(video) 63 | n_chunk = len(video) 64 | features = th.cuda.FloatTensor(n_chunk, 2048).fill_(0) 65 | n_iter = int(math.ceil(n_chunk / float(args.batch_size))) 66 | for i in range(n_iter): 67 | min_ind = i * args.batch_size 68 | max_ind = (i + 1) * args.batch_size 69 | video_batch = video[min_ind:max_ind].cuda() 70 | batch_features = model(video_batch) 71 | if args.l2_normalize: 72 | batch_features = F.normalize(batch_features, dim=1) 73 | features[min_ind:max_ind] = batch_features 74 | features = features.cpu().numpy() 75 | if args.half_precision: 76 | features = features.astype('float16') 77 | np.save(output_file, features) 78 | else: 79 | print('Video {} already processed.'.format(input_file)) 80 | -------------------------------------------------------------------------------- /code/lecture_aware_embds/video_feature_extractor/extract_features_2d_indi.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH -A darshan.singh 3 | #SBATCH -n 10 4 | #SBATCH --gres=gpu:1 5 | #SBATCH --mem-per-cpu=2G 6 | #SBATCH --time=04-00:00:00 7 | #SBATCH --mail-user=darshans012@gmail.com 8 | #SBATCH --mail-type=ALL 9 | 10 | echo "Started" 11 | 12 | python extract.py --csv=/ssd_scratch/cvit/AVL/data_subset_50s_60s/input_2d_m011_m012.csv --type=2d --batch_size=64 --num_decoding_thread=8 13 | 14 | echo "Done successfully" 15 | -------------------------------------------------------------------------------- /code/lecture_aware_embds/video_feature_extractor/extract_features_3d_indi.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH -A darshan.singh 3 | #SBATCH -n 10 4 | #SBATCH --gres=gpu:1 5 | #SBATCH --mem-per-cpu=2G 6 | #SBATCH --time=04-00:00:00 7 | #SBATCH --mail-user=darshans012@gmail.com 8 | #SBATCH --mail-type=ALL 9 | 10 | echo "Started" 11 | 12 | python extract.py --csv=/ssd_scratch/cvit/AVL/data_subset_50s_60s/input_3d_m011_m012.csv --type=3d --batch_size=64 --num_decoding_thread=8 13 | 14 | echo "Done successfully" 15 | -------------------------------------------------------------------------------- /code/lecture_aware_embds/video_feature_extractor/helper_pkl.py: -------------------------------------------------------------------------------- 1 | import pickle as pkl 2 | import os.path 3 | import os 4 | import argparse 5 | import json 6 | 7 | from tqdm import tqdm 8 | 9 | from os.path import join, isfile 10 | from glob import glob 11 
| 12 | import numpy as np 13 | import subprocess 14 | 15 | base_dir = '/ssd_scratch/cvit/darshan/dataset_MITOCW_v1_20s25s' 16 | 17 | delimiter = "@#@" 18 | 19 | def toFFMPEGtime(t): 20 | ss, ms = divmod(t*1000, 1000) 21 | mm, ss = divmod(ss, 60) 22 | hh, mm = divmod(mm, 60) 23 | 24 | return "{:02d}:{:02d}:{:02d},{:03d}".format(int(hh), int(mm), int(ss), int(ms)) 25 | 26 | def getTime(t): 27 | h, m, sms = t.split(":") 28 | if ',' in sms: # Example t = '00:00:03,980' 29 | s, ms = sms.split(",") 30 | elif '.' in sms: # Example t = '00:00:03.980' 31 | s, ms = sms.split(".") 32 | else: # Example t = '00:00:03' 33 | s = sms 34 | ms = '0' 35 | tm = 3600 * int(h) + 60 * int(m) + int(s) + int(ms.ljust(3, '0'))/1000 36 | return tm 37 | 38 | 39 | base_pkl = pkl.load(open(join(base_dir, 'dataset_v1_20s25s_2d3dOCRBERT.pkl'), 'rb')) 40 | 41 | 42 | d = {} 43 | 44 | for f in tqdm(base_pkl): 45 | vid_name = "-".join(f['id'].split('-')[:-1]) 46 | if vid_name not in d: 47 | d[vid_name] = [] 48 | d[vid_name].append(f) 49 | 50 | 51 | with open(join(base_dir, 'dataset_v1_helper_20s25s.pkl'), 'wb') as handle: 52 | pkl.dump(d, handle, protocol=pkl.HIGHEST_PROTOCOL) 53 | -------------------------------------------------------------------------------- /code/lecture_aware_embds/video_feature_extractor/lec_list.py: -------------------------------------------------------------------------------- 1 | import pickle as pkl 2 | import os.path 3 | import os 4 | import argparse 5 | import json 6 | 7 | from tqdm import tqdm 8 | 9 | from os.path import join, isfile 10 | from glob import glob 11 | 12 | import numpy as np 13 | import subprocess 14 | 15 | base_dir = '/ssd_scratch/cvit/darshan/dataset_MITOCW_v1_20s25s' 16 | 17 | delimiter = "@#@" 18 | 19 | def toFFMPEGtime(t): 20 | ss, ms = divmod(t*1000, 1000) 21 | mm, ss = divmod(ss, 60) 22 | hh, mm = divmod(mm, 60) 23 | 24 | return "{:02d}:{:02d}:{:02d},{:03d}".format(int(hh), int(mm), int(ss), int(ms)) 25 | 26 | def getTime(t): 27 | h, m, sms = t.split(":") 28 | if ',' in sms: # Example t = '00:00:03,980' 29 | s, ms = sms.split(",") 30 | elif '.' 
in sms: # Example t = '00:00:03.980' 31 | s, ms = sms.split(".") 32 | else: # Example t = '00:00:03' 33 | s = sms 34 | ms = '0' 35 | tm = 3600 * int(h) + 60 * int(m) + int(s) + int(ms.ljust(3, '0'))/1000 36 | return tm 37 | 38 | 39 | base_pkl = pkl.load(open(join(base_dir, 'dataset_v1_20s25s_2d3dOCRBERT.pkl'), 'rb')) 40 | 41 | 42 | d = [] 43 | 44 | for f in tqdm(base_pkl): 45 | vid_name = "-".join(f['id'].split('-')[:-1]) 46 | if vid_name not in d: 47 | d.append(vid_name) 48 | 49 | with open(join(base_dir, 'dataset_v1_leclist_20s25s.pkl'), 'wb') as handle: 50 | pkl.dump(d, handle, protocol=pkl.HIGHEST_PROTOCOL) 51 | 52 | print(len(d)) 53 | -------------------------------------------------------------------------------- /code/lecture_aware_embds/video_feature_extractor/merge_and_bert.py: -------------------------------------------------------------------------------- 1 | import pickle as pkl 2 | import os.path 3 | import os 4 | import argparse 5 | import json 6 | 7 | from tqdm import tqdm 8 | 9 | from os.path import join, isfile 10 | from glob import glob 11 | 12 | from sentence_transformers import SentenceTransformer, util 13 | 14 | import numpy as np 15 | import subprocess 16 | import copy 17 | 18 | base_dir = '/ssd_scratch/cvit/darshan/dataset_MITOCW_v1_20s25s' 19 | 20 | print("Loading BERT models") 21 | 22 | # model_qa = SentenceTransformer('multi-qa-mpnet-base-dot-v1') 23 | model_ss = SentenceTransformer('all-mpnet-base-v2') 24 | 25 | print("Done model loading") 26 | 27 | p_list = ['/ssd_scratch/cvit/darshan/dataset_MITOCW_v1_20s25s/v1_2d3dOCR_20_25_1.pkl', 28 | '/ssd_scratch/cvit/darshan/dataset_MITOCW_v1_20s25s/v1_2d3dOCR_20_25_2.pkl'] 29 | 30 | # for p in glob(join(base_dir, '*')): 31 | # p_list.append(p) 32 | 33 | p_list.sort() 34 | 35 | # p_list = [join(base_dir, 'v1_2d3dOCR_4_8.pkl')] 36 | 37 | print(p_list) 38 | 39 | data = [] 40 | 41 | for pkl_fl in p_list: 42 | b = pkl.load(open(pkl_fl, 'rb')) 43 | 44 | for f in tqdm(b): 45 | if f is not None: 46 | features = f.copy() 47 | 48 | caption = features['caption'] 49 | 50 | # features['emb_qa'] = model_qa.encode(caption) 51 | features['emb_ss'] = model_ss.encode(caption) 52 | 53 | ocr_text = f['ocr_text'] 54 | # features['ocr_emb_qa'] = model_qa.encode(ocr_text) 55 | features['ocr_emb_ss'] = model_ss.encode(ocr_text) 56 | 57 | data.append(features) 58 | 59 | 60 | print(len(data)) 61 | 62 | with open(join(base_dir, 'dataset_v1_20s25s_2d3dOCRBERT.pkl'), 'wb') as handle: 63 | pkl.dump(data, handle, protocol=pkl.HIGHEST_PROTOCOL) 64 | -------------------------------------------------------------------------------- /code/lecture_aware_embds/video_feature_extractor/merge_and_bert.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH -A darshan.singh 3 | #SBATCH -n 20 4 | #SBATCH --gres=gpu:2 5 | #SBATCH --mem-per-cpu=2G 6 | #SBATCH --time=INFINITE 7 | #SBATCH --mail-user=darshans012@gmail.com 8 | #SBATCH --mail-type=ALL 9 | 10 | echo "started" 11 | 12 | python merge_and_bert.py 13 | 14 | echo "Finished successfully" 15 | -------------------------------------------------------------------------------- /code/lecture_aware_embds/video_feature_extractor/merge_and_bert_mp.py: -------------------------------------------------------------------------------- 1 | import pickle as pkl 2 | import os.path 3 | import os 4 | import argparse 5 | import json 6 | 7 | from tqdm import tqdm 8 | 9 | from os.path import join, isfile 10 | from glob import glob 11 | 12 | from sentence_transformers import 
SentenceTransformer, util 13 | 14 | from concurrent.futures import ThreadPoolExecutor, as_completed, ProcessPoolExecutor 15 | 16 | import numpy as np 17 | import subprocess 18 | import copy 19 | 20 | base_dir = '/ssd_scratch/cvit/darshan/dataset_MITOCW_v1_4s8s' 21 | 22 | print("Loading BERT models") 23 | 24 | # model_qa = SentenceTransformer('multi-qa-mpnet-base-dot-v1') 25 | model_ss = SentenceTransformer('all-mpnet-base-v2') 26 | 27 | print("Done model loading") 28 | 29 | p_list = [] 30 | 31 | for p in glob(join(base_dir, '*')): 32 | p_list.append(p) 33 | 34 | p_list.sort() 35 | 36 | # p_list = [join(base_dir, 'v1_2d3dOCR_4_8.pkl')] 37 | 38 | print(p_list) 39 | 40 | data = [] 41 | 42 | def do_job(f): 43 | features = f.copy() 44 | 45 | caption = features['caption'] 46 | 47 | # features['emb_qa'] = model_qa.encode(caption) 48 | features['emb_ss'] = model_ss.encode(caption) 49 | 50 | ocr_text = f['ocr_text'] 51 | # features['ocr_emb_qa'] = model_qa.encode(ocr_text) 52 | features['ocr_emb_ss'] = model_ss.encode(ocr_text) 53 | 54 | return features 55 | 56 | for pkl_fl in p_list: 57 | b = pkl.load(open(pkl_fl, 'rb')) 58 | 59 | p = ProcessPoolExecutor(30) 60 | 61 | futures = [p.submit(do_job, li) for li in b] 62 | x = [r.result() for r in tqdm(as_completed(futures), total=len(futures))] 63 | data.extend(x) 64 | 65 | # for f in tqdm(b): 66 | # features = f.copy() 67 | 68 | # caption = features['caption'] 69 | 70 | # # features['emb_qa'] = model_qa.encode(caption) 71 | # features['emb_ss'] = model_ss.encode(caption) 72 | 73 | # ocr_text = f['ocr_text'] 74 | # # features['ocr_emb_qa'] = model_qa.encode(ocr_text) 75 | # features['ocr_emb_ss'] = model_ss.encode(ocr_text) 76 | 77 | # data.append(features) 78 | 79 | 80 | print(len(data)) 81 | 82 | with open(join(base_dir, 'datasetv1_4s8s_2d3dOCRBERT.pkl'), 'wb') as handle: 83 | pkl.dump(data, handle, protocol=pkl.HIGHEST_PROTOCOL) -------------------------------------------------------------------------------- /code/lecture_aware_embds/video_feature_extractor/model.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import torch as th 3 | import torchvision.models as models 4 | from videocnn.models import resnext 5 | from torch import nn 6 | 7 | 8 | class GlobalAvgPool(nn.Module): 9 | def __init__(self): 10 | super(GlobalAvgPool, self).__init__() 11 | 12 | def forward(self, x): 13 | return th.mean(x, dim=[-2, -1]) 14 | 15 | 16 | def get_model(args): 17 | assert args.type in ['2d', '3d'] 18 | if args.type == '2d': 19 | print('Loading 2D-ResNet-152 ...') 20 | model = models.resnet152(pretrained=True) 21 | model = nn.Sequential(*list(model.children())[:-2], GlobalAvgPool()) 22 | model = model.cuda() 23 | else: 24 | print('Loading 3D-ResneXt-101 ...') 25 | model = resnext.resnet101( 26 | num_classes=400, 27 | shortcut_type='B', 28 | cardinality=32, 29 | sample_size=112, 30 | sample_duration=16, 31 | last_fc=False) 32 | model = model.cuda() 33 | model_data = th.load(args.resnext101_model_path) 34 | model.load_state_dict(model_data) 35 | 36 | model.eval() 37 | print('loaded') 38 | return model 39 | -------------------------------------------------------------------------------- /code/lecture_aware_embds/video_feature_extractor/ocr_bert_pickle.py: -------------------------------------------------------------------------------- 1 | import pickle as pkl 2 | import os.path 3 | import os 4 | import argparse 5 | import json 6 | 7 | from tqdm import tqdm 8 | 9 | from os.path import join, isfile 10 | from glob 
import glob 11 | 12 | from sentence_transformers import SentenceTransformer, util 13 | 14 | import numpy as np 15 | import subprocess 16 | 17 | 18 | base_dir = '/ssd_scratch/cvit/darshan/segmentation_dataset_v1_10s15s' 19 | 20 | 21 | delimiter = "@#@" 22 | 23 | def toFFMPEGtime(t): 24 | ss, ms = divmod(t*1000, 1000) 25 | mm, ss = divmod(ss, 60) 26 | hh, mm = divmod(mm, 60) 27 | 28 | return "{:02d}:{:02d}:{:02d},{:03d}".format(int(hh), int(mm), int(ss), int(ms)) 29 | 30 | def getTime(t): 31 | h, m, sms = t.split(":") 32 | if ',' in sms: # Example t = '00:00:03,980' 33 | s, ms = sms.split(",") 34 | elif '.' in sms: # Example t = '00:00:03.980' 35 | s, ms = sms.split(".") 36 | else: # Example t = '00:00:03' 37 | s = sms 38 | ms = '0' 39 | tm = 3600 * int(h) + 60 * int(m) + int(s) + int(ms.ljust(3, '0'))/1000 40 | return tm 41 | 42 | 43 | model_qa = SentenceTransformer('multi-qa-mpnet-base-dot-v1') 44 | model_ss = SentenceTransformer('all-mpnet-base-v2') 45 | 46 | base_pkl = pkl.load(open(join(base_dir, 'old_seg_10s15s_2d3dBERTOCR.pkl'), 'rb')) 47 | 48 | data = [] 49 | 50 | for f in tqdm(base_pkl): 51 | features = f.copy() 52 | ocr_text = f['ocr_text'] 53 | features['ocr_emb_qa'] = model_qa.encode(ocr_text) 54 | features['ocr_emb_ss'] = model_ss.encode(ocr_text) 55 | 56 | 57 | data.append(features) 58 | 59 | 60 | with open(join(base_dir, 'seg_10s15s_2d3dOCRBERT.pkl'), 'wb') as handle: 61 | pkl.dump(data, handle, protocol=pkl.HIGHEST_PROTOCOL) -------------------------------------------------------------------------------- /code/lecture_aware_embds/video_feature_extractor/preprocessing.py: -------------------------------------------------------------------------------- 1 | import torch as th 2 | 3 | class Normalize(object): 4 | 5 | def __init__(self, mean, std): 6 | self.mean = th.FloatTensor(mean).view(1, 3, 1, 1) 7 | self.std = th.FloatTensor(std).view(1, 3, 1, 1) 8 | 9 | def __call__(self, tensor): 10 | tensor = (tensor - self.mean) / (self.std + 1e-8) 11 | return tensor 12 | 13 | class Preprocessing(object): 14 | 15 | def __init__(self, type): 16 | self.type = type 17 | if type == '2d': 18 | self.norm = Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) 19 | elif type == '3d': 20 | self.norm = Normalize(mean=[110.6, 103.2, 96.3], std=[1.0, 1.0, 1.0]) 21 | 22 | def _zero_pad(self, tensor, size): 23 | n = size - len(tensor) % size 24 | if n == size: 25 | return tensor 26 | else: 27 | z = th.zeros(n, tensor.shape[1], tensor.shape[2], tensor.shape[3]) 28 | return th.cat((tensor, z), 0) 29 | 30 | def __call__(self, tensor): 31 | if self.type == '2d': 32 | tensor = tensor / 255.0 33 | tensor = self.norm(tensor) 34 | elif self.type == '3d': 35 | tensor = self._zero_pad(tensor, 16) 36 | tensor = self.norm(tensor) 37 | tensor = tensor.view(-1, 16, 3, 112, 112) 38 | tensor = tensor.transpose(1, 2) 39 | return tensor 40 | -------------------------------------------------------------------------------- /code/lecture_aware_embds/video_feature_extractor/random_sequence_shuffler.py: -------------------------------------------------------------------------------- 1 | import torch as th 2 | from torch.utils.data.sampler import Sampler 3 | import numpy as np 4 | 5 | class RandomSequenceSampler(Sampler): 6 | 7 | def __init__(self, n_sample, seq_len): 8 | self.n_sample = n_sample 9 | self.seq_len = seq_len 10 | 11 | def _pad_ind(self, ind): 12 | zeros = np.zeros(self.seq_len - self.n_sample % self.seq_len) 13 | ind = np.concatenate((ind, zeros)) 14 | return ind 15 | 16 | def __iter__(self): 17 | 
idx = np.arange(self.n_sample) 18 | if self.n_sample % self.seq_len != 0: 19 | idx = self._pad_ind(idx) 20 | idx = np.reshape(idx, (-1, self.seq_len)) 21 | np.random.shuffle(idx) 22 | idx = np.reshape(idx, (-1)) 23 | return iter(idx.astype(int)) 24 | 25 | def __len__(self): 26 | return self.n_sample + (self.seq_len - self.n_sample % self.seq_len) 27 | -------------------------------------------------------------------------------- /code/lecture_aware_embds/video_feature_extractor/readme.txt: -------------------------------------------------------------------------------- 1 | Step 1: 2 | Execute the "create_feature_csv.py" program. To specify the path of the Datasubset use the '--base_dir' optional argument. The default path of the DataSubset is '/ssd_scratch/cvit/AVLectures/DataSubset'. After executing this program the following will be created inside base_dir. 3 | a. input_2d.csv 4 | b. input_3d.csv 5 | c. Also empty directories called "features", "features/2d/", "features/3d/" will be created. 6 | 7 | Step 2: 8 | Once we have the 2d, 3d CSV files and empty directories to store 2d & 3d features, our next task is to extract the 2d and 3d features from the videos using the "extract.py" program. 9 | First extract the 2d features using the following command: 10 | $ python extract.py --csv=input_2d.csv --type=2d --batch_size=64 --num_decoding_thread=4 11 | Then download the 3D ResNext-101 model as follows (for 3d feature extraction): 12 | $ mkdir model 13 | $ cd model 14 | $ wget https://www.rocq.inria.fr/cluster-willow/amiech/howto100m/models/resnext101.pth 15 | Now extract the 3d features using the following command: 16 | $ python extract.py --csv=input_3d.csv --type=3d --batch_size=64 --num_decoding_thread=4 17 | 18 | Step 3: 19 | Now it is time to create the pickle file of our data. To do this execute the "create_pickle.py" program. To specify the path of the Datasubset use the '--base_dir' optional argument. The default path of the DataSubset is '/ssd_scratch/cvit/AVLectures/DataSubset'. After executing this program a pickle file called 'avl.pkl' will be created inside base_dir. 
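Note (added for clarity): the CSV files produced in Step 1 are consumed by video_loader.py, which expects two columns named 'video_path' and 'feature_path', one row per video clip. An illustrative layout (the paths shown are examples only):
    video_path,feature_path
    /ssd_scratch/cvit/AVLectures/DataSubset/mit001/splits_vid/MIT6_042JF10_lec17_300k-00000.mp4,/ssd_scratch/cvit/AVLectures/DataSubset/features/2d/MIT6_042JF10_lec17_300k-00000.npy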
20 | -------------------------------------------------------------------------------- /code/lecture_aware_embds/video_feature_extractor/video_loader.py: -------------------------------------------------------------------------------- 1 | import torch as th 2 | from torch.utils.data import Dataset 3 | import pandas as pd 4 | import os 5 | import numpy as np 6 | import ffmpeg 7 | 8 | 9 | class VideoLoader(Dataset): 10 | """Pytorch video loader.""" 11 | 12 | def __init__( 13 | self, 14 | csv, 15 | framerate=1, 16 | size=112, 17 | centercrop=False, 18 | ): 19 | """ 20 | Args: 21 | """ 22 | self.csv = pd.read_csv(csv) 23 | self.centercrop = centercrop 24 | self.size = size 25 | self.framerate = framerate 26 | 27 | def __len__(self): 28 | return len(self.csv) 29 | 30 | def _get_video_dim(self, video_path): 31 | probe = ffmpeg.probe(video_path) 32 | video_stream = next((stream for stream in probe['streams'] 33 | if stream['codec_type'] == 'video'), None) 34 | width = int(video_stream['width']) 35 | height = int(video_stream['height']) 36 | return height, width 37 | 38 | def _get_output_dim(self, h, w): 39 | if isinstance(self.size, tuple) and len(self.size) == 2: 40 | return self.size 41 | elif h >= w: 42 | return int(h * self.size / w), self.size 43 | else: 44 | return self.size, int(w * self.size / h) 45 | 46 | def __getitem__(self, idx): 47 | video_path = self.csv['video_path'].values[idx] 48 | output_file = self.csv['feature_path'].values[idx] 49 | 50 | if not(os.path.isfile(output_file)) and os.path.isfile(video_path): 51 | print('Decoding video: {}'.format(video_path)) 52 | try: 53 | h, w = self._get_video_dim(video_path) 54 | except: 55 | print('ffprobe failed at: {}'.format(video_path)) 56 | return {'video': th.zeros(1), 'input': video_path, 57 | 'output': output_file} 58 | height, width = self._get_output_dim(h, w) 59 | cmd = ( 60 | ffmpeg 61 | .input(video_path) 62 | .filter('fps', fps=self.framerate) 63 | .filter('scale', width, height) 64 | ) 65 | if self.centercrop: 66 | x = int((width - self.size) / 2.0) 67 | y = int((height - self.size) / 2.0) 68 | cmd = cmd.crop(x, y, self.size, self.size) 69 | out, _ = ( 70 | cmd.output('pipe:', format='rawvideo', pix_fmt='rgb24') 71 | .run(capture_stdout=True, quiet=True) 72 | ) 73 | if self.centercrop and isinstance(self.size, int): 74 | height, width = self.size, self.size 75 | video = np.frombuffer(out, np.uint8).reshape([-1, height, width, 3]) 76 | video = th.from_numpy(video.astype('float32')) 77 | video = video.permute(0, 3, 1, 2) 78 | else: 79 | video = th.zeros(1) 80 | 81 | return {'video': video, 'input': video_path, 'output': output_file} 82 | -------------------------------------------------------------------------------- /code/lecture_aware_embds/video_feature_extractor/videocnn/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | 103 | .DS_Store -------------------------------------------------------------------------------- /code/lecture_aware_embds/video_feature_extractor/videocnn/.opts.py.swp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Darshansingh11/AVLectures/d5452d90d29961f28a89c5d1ff7bef88c3f66ca0/code/lecture_aware_embds/video_feature_extractor/videocnn/.opts.py.swp -------------------------------------------------------------------------------- /code/lecture_aware_embds/video_feature_extractor/videocnn/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Kensho Hara 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /code/lecture_aware_embds/video_feature_extractor/videocnn/README.md: -------------------------------------------------------------------------------- 1 | # Video Classification Using 3D ResNet 2 | This is a pytorch code for video (action) classification using 3D ResNet trained by [this code](https://github.com/kenshohara/3D-ResNets-PyTorch). 3 | The 3D ResNet is trained on the Kinetics dataset, which includes 400 action classes. 4 | This code uses videos as inputs and outputs class names and predicted class scores for each 16 frames in the score mode. 
5 | In the feature mode, this code outputs features of 512 dims (after global average pooling) for each 16 frames. 6 | 7 | **Torch (Lua) version of this code is available [here](https://github.com/kenshohara/video-classification-3d-cnn).** 8 | 9 | ## Requirements 10 | * [PyTorch](http://pytorch.org/) 11 | ``` 12 | conda install pytorch torchvision cuda80 -c soumith 13 | ``` 14 | * FFmpeg, FFprobe 15 | ``` 16 | wget http://johnvansickle.com/ffmpeg/releases/ffmpeg-release-64bit-static.tar.xz 17 | tar xvf ffmpeg-release-64bit-static.tar.xz 18 | cd ./ffmpeg-3.3.3-64bit-static/; sudo cp ffmpeg ffprobe /usr/local/bin; 19 | ``` 20 | * Python 3 21 | 22 | ## Preparation 23 | * Download this code. 24 | * Download the [pretrained model](https://drive.google.com/drive/folders/1zvl89AgFAApbH0At-gMuZSeQB_LpNP-M?usp=sharing). 25 | * ResNeXt-101 achieved the best performance in our experiments. (See [paper](https://arxiv.org/abs/1711.09577) in details.) 26 | 27 | ## Usage 28 | Assume input video files are located in ```./videos```. 29 | 30 | To calculate class scores for each 16 frames, use ```--mode score```. 31 | ``` 32 | python main.py --input ./input --video_root ./videos --output ./output.json --model ./resnet-34-kinetics.pth --mode score 33 | ``` 34 | To visualize the classification results, use ```generate_result_video/generate_result_video.py```. 35 | 36 | To calculate video features for each 16 frames, use ```--mode feature```. 37 | ``` 38 | python main.py --input ./input --video_root ./videos --output ./output.json --model ./resnet-34-kinetics.pth --mode feature 39 | ``` 40 | 41 | 42 | ## Citation 43 | If you use this code, please cite the following: 44 | ``` 45 | @article{hara3dcnns, 46 | author={Kensho Hara and Hirokatsu Kataoka and Yutaka Satoh}, 47 | title={Can Spatiotemporal 3D CNNs Retrace the History of 2D CNNs and ImageNet?}, 48 | journal={arXiv preprint}, 49 | volume={arXiv:1711.09577}, 50 | year={2017}, 51 | } 52 | ``` 53 | -------------------------------------------------------------------------------- /code/lecture_aware_embds/video_feature_extractor/videocnn/classify.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.autograd import Variable 3 | 4 | from dataset import Video 5 | from spatial_transforms import (Compose, Normalize, Scale, CenterCrop, ToTensor) 6 | from temporal_transforms import LoopPadding 7 | 8 | def classify_video(video_dir, video_name, class_names, model, opt): 9 | assert opt.mode in ['score', 'feature'] 10 | 11 | spatial_transform = Compose([Scale(opt.sample_size), 12 | CenterCrop(opt.sample_size), 13 | ToTensor(), 14 | Normalize(opt.mean, [1, 1, 1])]) 15 | temporal_transform = LoopPadding(opt.sample_duration) 16 | data = Video(video_dir, spatial_transform=spatial_transform, 17 | temporal_transform=temporal_transform, 18 | sample_duration=opt.sample_duration) 19 | data_loader = torch.utils.data.DataLoader(data, batch_size=opt.batch_size, 20 | shuffle=False, num_workers=opt.n_threads, pin_memory=True) 21 | 22 | video_outputs = [] 23 | video_segments = [] 24 | for i, (inputs, segments) in enumerate(data_loader): 25 | inputs = Variable(inputs, volatile=True) 26 | outputs = model(inputs) 27 | 28 | video_outputs.append(outputs.cpu().data) 29 | video_segments.append(segments) 30 | 31 | video_outputs = torch.cat(video_outputs) 32 | video_segments = torch.cat(video_segments) 33 | results = { 34 | 'video': video_name, 35 | 'clips': [] 36 | } 37 | 38 | _, max_indices = video_outputs.max(dim=1) 39 | for i in 
range(video_outputs.size(0)): 40 | clip_results = { 41 | 'segment': video_segments[i].tolist(), 42 | } 43 | 44 | if opt.mode == 'score': 45 | clip_results['label'] = class_names[max_indices[i]] 46 | clip_results['scores'] = video_outputs[i].tolist() 47 | elif opt.mode == 'feature': 48 | clip_results['features'] = video_outputs[i].tolist() 49 | 50 | results['clips'].append(clip_results) 51 | 52 | return results 53 | -------------------------------------------------------------------------------- /code/lecture_aware_embds/video_feature_extractor/videocnn/dataset.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.utils.data as data 3 | from PIL import Image 4 | import os 5 | import math 6 | import functools 7 | import copy 8 | 9 | 10 | def pil_loader(path): 11 | # open path as file to avoid ResourceWarning (https://github.com/python-pillow/Pillow/issues/835) 12 | with open(path, 'rb') as f: 13 | with Image.open(f) as img: 14 | return img.convert('RGB') 15 | 16 | 17 | def accimage_loader(path): 18 | try: 19 | return accimage.Image(path) 20 | except IOError: 21 | # Potentially a decoding problem, fall back to PIL.Image 22 | return pil_loader(path) 23 | 24 | 25 | def get_default_image_loader(): 26 | from torchvision import get_image_backend 27 | if get_image_backend() == 'accimage': 28 | import accimage 29 | return accimage_loader 30 | else: 31 | return pil_loader 32 | 33 | 34 | def video_loader(video_dir_path, frame_indices, image_loader): 35 | video = [] 36 | for i in frame_indices: 37 | image_path = os.path.join(video_dir_path, 'image_{:05d}.jpg'.format(i)) 38 | if os.path.exists(image_path): 39 | video.append(image_loader(image_path)) 40 | else: 41 | return video 42 | 43 | return video 44 | 45 | 46 | def get_default_video_loader(): 47 | image_loader = get_default_image_loader() 48 | return functools.partial(video_loader, image_loader=image_loader) 49 | 50 | 51 | def load_annotation_data(data_file_path): 52 | with open(data_file_path, 'r') as data_file: 53 | return json.load(data_file) 54 | 55 | 56 | def get_class_labels(data): 57 | class_labels_map = {} 58 | index = 0 59 | for class_label in data['labels']: 60 | class_labels_map[class_label] = index 61 | index += 1 62 | return class_labels_map 63 | 64 | 65 | def get_video_names_and_annotations(data, subset): 66 | video_names = [] 67 | annotations = [] 68 | 69 | for key, value in data['database'].items(): 70 | this_subset = value['subset'] 71 | if this_subset == subset: 72 | if subset == 'testing': 73 | video_names.append('test/{}'.format(key)) 74 | else: 75 | label = value['annotations']['label'] 76 | video_names.append('{}/{}'.format(label, key)) 77 | annotations.append(value['annotations']) 78 | 79 | return video_names, annotations 80 | 81 | 82 | def make_dataset(video_path, sample_duration): 83 | dataset = [] 84 | 85 | n_frames = len(os.listdir(video_path)) 86 | 87 | begin_t = 1 88 | end_t = n_frames 89 | sample = { 90 | 'video': video_path, 91 | 'segment': [begin_t, end_t], 92 | 'n_frames': n_frames, 93 | } 94 | 95 | step = sample_duration 96 | for i in range(1, (n_frames - sample_duration + 1), step): 97 | sample_i = copy.deepcopy(sample) 98 | sample_i['frame_indices'] = list(range(i, i + sample_duration)) 99 | sample_i['segment'] = torch.IntTensor([i, i + sample_duration - 1]) 100 | dataset.append(sample_i) 101 | 102 | return dataset 103 | 104 | 105 | class Video(data.Dataset): 106 | def __init__(self, video_path, 107 | spatial_transform=None, temporal_transform=None, 
108 | sample_duration=16, get_loader=get_default_video_loader): 109 | self.data = make_dataset(video_path, sample_duration) 110 | 111 | self.spatial_transform = spatial_transform 112 | self.temporal_transform = temporal_transform 113 | self.loader = get_loader() 114 | 115 | def __getitem__(self, index): 116 | """ 117 | Args: 118 | index (int): Index 119 | Returns: 120 | tuple: (image, target) where target is class_index of the target class. 121 | """ 122 | path = self.data[index]['video'] 123 | 124 | frame_indices = self.data[index]['frame_indices'] 125 | if self.temporal_transform is not None: 126 | frame_indices = self.temporal_transform(frame_indices) 127 | clip = self.loader(path, frame_indices) 128 | if self.spatial_transform is not None: 129 | clip = [self.spatial_transform(img) for img in clip] 130 | clip = torch.stack(clip, 0).permute(1, 0, 2, 3) 131 | 132 | target = self.data[index]['segment'] 133 | 134 | return clip, target 135 | 136 | def __len__(self): 137 | return len(self.data) 138 | -------------------------------------------------------------------------------- /code/lecture_aware_embds/video_feature_extractor/videocnn/generate_result_video/README.md: -------------------------------------------------------------------------------- 1 | # Result Video Generation 2 | This is a code for generating videos of classification results. 3 | It uses both ```output.json``` and videos as inputs and draw predicted class names in each frame. 4 | 5 | ## Requirements 6 | * Python 3 7 | * Pillow 8 | * ffmpeg, ffprobe 9 | 10 | ## Usage 11 | To generate videos based on ```../output.json```, execute the following. 12 | ``` 13 | python generate_result_video.py ../output.json ../videos ./videos_pred ../class_names_list 5 14 | ``` 15 | The 2nd parameter (```../videos```) is the root directory of videos. 16 | The 3rd parameter (```./videos_pred```) is the directory path of output videos. 17 | The 5th parameter is a size of temporal unit. 18 | The CNN predicts class scores for a 16 frame clip. 19 | The code averages the scores over each unit. 20 | The size 5 means that it averages the scores over 5 clips (i.e. 16x5 frames). 21 | If you use the size as 0, the scores are averaged over all clips of a video. 
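For example, on a 30 fps source video a unit size of 5 therefore averages over 16 x 5 = 80 frames, i.e. roughly 2.7 seconds (illustrative; the exact duration depends on the video's frame rate).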
22 | -------------------------------------------------------------------------------- /code/lecture_aware_embds/video_feature_extractor/videocnn/generate_result_video/SourceSansPro-Regular.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Darshansingh11/AVLectures/d5452d90d29961f28a89c5d1ff7bef88c3f66ca0/code/lecture_aware_embds/video_feature_extractor/videocnn/generate_result_video/SourceSansPro-Regular.ttf -------------------------------------------------------------------------------- /code/lecture_aware_embds/video_feature_extractor/videocnn/generate_result_video/generate_result_video.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import json 4 | import subprocess 5 | import numpy as np 6 | from PIL import Image, ImageDraw, ImageFont 7 | 8 | 9 | def get_fps(video_file_path, frames_directory_path): 10 | p = subprocess.Popen('ffprobe {}'.format(video_file_path), 11 | shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) 12 | _, res = p.communicate() 13 | res = res.decode('utf-8') 14 | 15 | duration_index = res.find('Duration:') 16 | duration_str = res[(duration_index + 10):(duration_index + 21)] 17 | hour = float(duration_str[0:2]) 18 | minute = float(duration_str[3:5]) 19 | sec = float(duration_str[6:10]) 20 | total_sec = hour * 3600 + minute * 60 + sec 21 | 22 | n_frames = len(os.listdir(frames_directory_path)) 23 | fps = round(n_frames / total_sec, 2) 24 | return fps 25 | 26 | 27 | if __name__ == '__main__': 28 | result_json_path = sys.argv[1] 29 | video_root_path = sys.argv[2] 30 | dst_directory_path = sys.argv[3] 31 | if not os.path.exists(dst_directory_path): 32 | subprocess.call('mkdir -p {}'.format(dst_directory_path), shell=True) 33 | class_name_path = sys.argv[4] 34 | temporal_unit = int(sys.argv[5]) 35 | 36 | with open(result_json_path, 'r') as f: 37 | results = json.load(f) 38 | 39 | with open(class_name_path, 'r') as f: 40 | class_names = [] 41 | for row in f: 42 | class_names.append(row[:-1]) 43 | 44 | for index in range(len(results)): 45 | video_path = os.path.join(video_root_path, results[index]['video']) 46 | print(video_path) 47 | 48 | clips = results[index]['clips'] 49 | unit_classes = [] 50 | unit_segments = [] 51 | if temporal_unit == 0: 52 | unit = len(clips) 53 | else: 54 | unit = temporal_unit 55 | for i in range(0, len(clips), unit): 56 | n_elements = min(unit, len(clips) - i) 57 | scores = np.array(clips[i]['scores'])  # running sum of the clip scores in this unit 58 | for j in range(i + 1, min(i + unit, len(clips))): 59 | scores += np.array(clips[j]['scores']) 60 | scores /= n_elements 61 | unit_classes.append(class_names[np.argmax(scores)]) 62 | unit_segments.append([clips[i]['segment'][0], 63 | clips[i + n_elements - 1]['segment'][1]]) 64 | 65 | if os.path.exists('tmp'): 66 | subprocess.call('rm -rf tmp', shell=True) 67 | subprocess.call('mkdir tmp', shell=True) 68 | 69 | subprocess.call('ffmpeg -i {} tmp/image_%05d.jpg'.format(video_path), shell=True) 70 | 71 | fps = get_fps(video_path, 'tmp') 72 | 73 | for i in range(len(unit_classes)): 74 | for j in range(unit_segments[i][0], unit_segments[i][1] + 1): 75 | image = Image.open('tmp/image_{:05}.jpg'.format(j)).convert('RGB') 76 | min_length = min(image.size) 77 | font_size = int(min_length * 0.05) 78 | font = ImageFont.truetype(os.path.join(os.path.dirname(__file__), 79 | 'SourceSansPro-Regular.ttf'), 80 | font_size) 81 | d = ImageDraw.Draw(image) 82 | textsize = d.textsize(unit_classes[i], font=font) 83 | x = 
int(font_size * 0.5) 84 | y = int(font_size * 0.25) 85 | x_offset = x 86 | y_offset = y 87 | rect_position = (x, y, x + textsize[0] + x_offset * 2, 88 | y + textsize[1] + y_offset * 2) 89 | d.rectangle(rect_position, fill=(30, 30, 30)) 90 | d.text((x + x_offset, y + y_offset), unit_classes[i], 91 | font=font, fill=(235, 235, 235)) 92 | image.save('tmp/image_{:05}_pred.jpg'.format(j)) 93 | 94 | dst_file_path = os.path.join(dst_directory_path, video_path.split('/')[-1]) 95 | subprocess.call('ffmpeg -y -r {} -i tmp/image_%05d_pred.jpg -b:v 1000k {}'.format(fps, dst_file_path), 96 | shell=True) 97 | 98 | if os.path.exists('tmp'): 99 | subprocess.call('rm -rf tmp', shell=True) 100 | -------------------------------------------------------------------------------- /code/lecture_aware_embds/video_feature_extractor/videocnn/input: -------------------------------------------------------------------------------- 1 | video1.mp4 2 | video2.mp4 3 | -------------------------------------------------------------------------------- /code/lecture_aware_embds/video_feature_extractor/videocnn/main.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import json 4 | import subprocess 5 | import numpy as np 6 | import torch 7 | from torch import nn 8 | 9 | from opts import parse_opts 10 | from model import generate_model 11 | from mean import get_mean 12 | from classify import classify_video 13 | 14 | if __name__=="__main__": 15 | opt = parse_opts() 16 | opt.mean = get_mean() 17 | opt.arch = '{}-{}'.format(opt.model_name, opt.model_depth) 18 | opt.sample_size = 112 19 | opt.sample_duration = 16 20 | opt.n_classes = 400 21 | 22 | model = generate_model(opt) 23 | print('loading model {}'.format(opt.model)) 24 | model_data = torch.load(opt.model) 25 | assert opt.arch == model_data['arch'] 26 | model.load_state_dict(model_data['state_dict']) 27 | model.eval() 28 | if opt.verbose: 29 | print(model) 30 | 31 | input_files = [] 32 | with open(opt.input, 'r') as f: 33 | for row in f: 34 | input_files.append(row[:-1]) 35 | 36 | class_names = [] 37 | with open('class_names_list') as f: 38 | for row in f: 39 | class_names.append(row[:-1]) 40 | 41 | ffmpeg_loglevel = 'quiet' 42 | if opt.verbose: 43 | ffmpeg_loglevel = 'info' 44 | 45 | if os.path.exists('tmp'): 46 | subprocess.call('rm -rf tmp', shell=True) 47 | 48 | outputs = [] 49 | for input_file in input_files: 50 | video_path = os.path.join(opt.video_root, input_file) 51 | if os.path.exists(video_path): 52 | print(video_path) 53 | subprocess.call('mkdir tmp', shell=True) 54 | subprocess.call('ffmpeg -i {} tmp/image_%05d.jpg'.format(video_path), 55 | shell=True) 56 | 57 | result = classify_video('tmp', input_file, class_names, model, opt) 58 | outputs.append(result) 59 | 60 | subprocess.call('rm -rf tmp', shell=True) 61 | else: 62 | print('{} does not exist'.format(input_file)) 63 | 64 | if os.path.exists('tmp'): 65 | subprocess.call('rm -rf tmp', shell=True) 66 | 67 | with open(opt.output, 'w') as f: 68 | json.dump(outputs, f) 69 | -------------------------------------------------------------------------------- /code/lecture_aware_embds/video_feature_extractor/videocnn/mean.py: -------------------------------------------------------------------------------- 1 | def get_mean(): 2 | return [114.7748, 107.7354, 99.4750] 3 | -------------------------------------------------------------------------------- /code/lecture_aware_embds/video_feature_extractor/videocnn/models/densenet.py: 
-------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from collections import OrderedDict 5 | import math 6 | 7 | __all__ = ['DenseNet', 'densenet121', 'densenet169', 'densenet201', 'densenet264'] 8 | 9 | 10 | def densenet121(**kwargs): 11 | model = DenseNet(num_init_features=64, growth_rate=32, block_config=(6, 12, 24, 16), 12 | **kwargs) 13 | return model 14 | 15 | 16 | def densenet169(**kwargs): 17 | model = DenseNet(num_init_features=64, growth_rate=32, block_config=(6, 12, 32, 32), 18 | **kwargs) 19 | return model 20 | 21 | 22 | def densenet201(**kwargs): 23 | model = DenseNet(num_init_features=64, growth_rate=32, block_config=(6, 12, 48, 32), 24 | **kwargs) 25 | return model 26 | 27 | 28 | def densenet264(**kwargs): 29 | model = DenseNet(num_init_features=64, growth_rate=32, block_config=(6, 12, 64, 48), 30 | **kwargs) 31 | return model 32 | 33 | 34 | def get_fine_tuning_parameters(model, ft_begin_index): 35 | if ft_begin_index == 0: 36 | return model.parameters() 37 | 38 | ft_module_names = [] 39 | for i in range(ft_begin_index, 5): 40 | ft_module_names.append('denseblock{}'.format(ft_begin_index)) 41 | ft_module_names.append('transition{}'.format(ft_begin_index)) 42 | ft_module_names.append('norm5') 43 | ft_module_names.append('classifier') 44 | 45 | parameters = [] 46 | for k, v in model.named_parameters(): 47 | for ft_module in ft_module_names: 48 | if ft_module in k: 49 | parameters.append({'params': v}) 50 | break 51 | else: 52 | parameters.append({'params': v, 'lr': 0.0}) 53 | 54 | return parameters 55 | 56 | 57 | class _DenseLayer(nn.Sequential): 58 | def __init__(self, num_input_features, growth_rate, bn_size, drop_rate): 59 | super(_DenseLayer, self).__init__() 60 | self.add_module('norm.1', nn.BatchNorm3d(num_input_features)) 61 | self.add_module('relu.1', nn.ReLU(inplace=True)) 62 | self.add_module('conv.1', nn.Conv3d(num_input_features, bn_size * growth_rate, 63 | kernel_size=1, stride=1, bias=False)) 64 | self.add_module('norm.2', nn.BatchNorm3d(bn_size * growth_rate)) 65 | self.add_module('relu.2', nn.ReLU(inplace=True)) 66 | self.add_module('conv.2', nn.Conv3d(bn_size * growth_rate, growth_rate, 67 | kernel_size=3, stride=1, padding=1, bias=False)) 68 | self.drop_rate = drop_rate 69 | 70 | def forward(self, x): 71 | new_features = super(_DenseLayer, self).forward(x) 72 | if self.drop_rate > 0: 73 | new_features = F.dropout(new_features, p=self.drop_rate, training=self.training) 74 | return torch.cat([x, new_features], 1) 75 | 76 | 77 | class _DenseBlock(nn.Sequential): 78 | def __init__(self, num_layers, num_input_features, bn_size, growth_rate, drop_rate): 79 | super(_DenseBlock, self).__init__() 80 | for i in range(num_layers): 81 | layer = _DenseLayer(num_input_features + i * growth_rate, growth_rate, bn_size, drop_rate) 82 | self.add_module('denselayer%d' % (i + 1), layer) 83 | 84 | 85 | class _Transition(nn.Sequential): 86 | def __init__(self, num_input_features, num_output_features): 87 | super(_Transition, self).__init__() 88 | self.add_module('norm', nn.BatchNorm3d(num_input_features)) 89 | self.add_module('relu', nn.ReLU(inplace=True)) 90 | self.add_module('conv', nn.Conv3d(num_input_features, num_output_features, 91 | kernel_size=1, stride=1, bias=False)) 92 | self.add_module('pool', nn.AvgPool3d(kernel_size=2, stride=2)) 93 | 94 | 95 | class DenseNet(nn.Module): 96 | """Densenet-BC model class 97 | Args: 98 | growth_rate (int) - how many filters to 
add each layer (k in paper) 99 | block_config (list of 4 ints) - how many layers in each pooling block 100 | num_init_features (int) - the number of filters to learn in the first convolution layer 101 | bn_size (int) - multiplicative factor for number of bottle neck layers 102 | (i.e. bn_size * k features in the bottleneck layer) 103 | drop_rate (float) - dropout rate after each dense layer 104 | num_classes (int) - number of classification classes 105 | """ 106 | def __init__(self, sample_size, sample_duration, growth_rate=32, block_config=(6, 12, 24, 16), 107 | num_init_features=64, bn_size=4, drop_rate=0, num_classes=1000, last_fc=True): 108 | 109 | super(DenseNet, self).__init__() 110 | 111 | self.last_fc = last_fc 112 | 113 | self.sample_size = sample_size 114 | self.sample_duration = sample_duration 115 | 116 | # First convolution 117 | self.features = nn.Sequential(OrderedDict([ 118 | ('conv0', nn.Conv3d(3, num_init_features, kernel_size=7, 119 | stride=(1, 2, 2), padding=(3, 3, 3), bias=False)), 120 | ('norm0', nn.BatchNorm3d(num_init_features)), 121 | ('relu0', nn.ReLU(inplace=True)), 122 | ('pool0', nn.MaxPool3d(kernel_size=3, stride=2, padding=1)), 123 | ])) 124 | 125 | # Each denseblock 126 | num_features = num_init_features 127 | for i, num_layers in enumerate(block_config): 128 | block = _DenseBlock(num_layers=num_layers, num_input_features=num_features, 129 | bn_size=bn_size, growth_rate=growth_rate, drop_rate=drop_rate) 130 | self.features.add_module('denseblock%d' % (i + 1), block) 131 | num_features = num_features + num_layers * growth_rate 132 | if i != len(block_config) - 1: 133 | trans = _Transition(num_input_features=num_features, num_output_features=num_features // 2) 134 | self.features.add_module('transition%d' % (i + 1), trans) 135 | num_features = num_features // 2 136 | 137 | # Final batch norm 138 | self.features.add_module('norm5', nn.BatchNorm2d(num_features)) 139 | 140 | # Linear layer 141 | self.classifier = nn.Linear(num_features, num_classes) 142 | 143 | def forward(self, x): 144 | features = self.features(x) 145 | out = F.relu(features, inplace=True) 146 | last_duration = math.ceil(self.sample_duration / 16) 147 | last_size = math.floor(self.sample_size / 32) 148 | out = F.avg_pool3d(out, kernel_size=(last_duration, last_size, last_size)).view(features.size(0), -1) 149 | if self.last_fc: 150 | out = self.classifier(out) 151 | return out 152 | -------------------------------------------------------------------------------- /code/lecture_aware_embds/video_feature_extractor/videocnn/models/resnext.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.autograd import Variable 5 | import math 6 | from functools import partial 7 | 8 | __all__ = ['ResNeXt', 'resnet50', 'resnet101'] 9 | 10 | 11 | def conv3x3x3(in_planes, out_planes, stride=1): 12 | # 3x3x3 convolution with padding 13 | return nn.Conv3d(in_planes, out_planes, kernel_size=3, 14 | stride=stride, padding=1, bias=False) 15 | 16 | 17 | def downsample_basic_block(x, planes, stride): 18 | out = F.avg_pool3d(x, kernel_size=1, stride=stride) 19 | zero_pads = torch.Tensor(out.size(0), planes - out.size(1), 20 | out.size(2), out.size(3), 21 | out.size(4)).zero_() 22 | if isinstance(out.data, torch.cuda.FloatTensor): 23 | zero_pads = zero_pads.cuda() 24 | 25 | out = Variable(torch.cat([out.data, zero_pads], dim=1)) 26 | 27 | return out 28 | 29 | 30 | class ResNeXtBottleneck(nn.Module): 
31 | expansion = 2 32 | 33 | def __init__(self, inplanes, planes, cardinality, stride=1, downsample=None): 34 | super(ResNeXtBottleneck, self).__init__() 35 | mid_planes = cardinality * int(planes / 32) 36 | self.conv1 = nn.Conv3d(inplanes, mid_planes, kernel_size=1, bias=False) 37 | self.bn1 = nn.BatchNorm3d(mid_planes) 38 | self.conv2 = nn.Conv3d(mid_planes, mid_planes, kernel_size=3, stride=stride, 39 | padding=1, groups=cardinality, bias=False) 40 | self.bn2 = nn.BatchNorm3d(mid_planes) 41 | self.conv3 = nn.Conv3d(mid_planes, planes * self.expansion, kernel_size=1, bias=False) 42 | self.bn3 = nn.BatchNorm3d(planes * self.expansion) 43 | self.relu = nn.ReLU(inplace=True) 44 | self.downsample = downsample 45 | self.stride = stride 46 | 47 | def forward(self, x): 48 | residual = x 49 | 50 | out = self.conv1(x) 51 | out = self.bn1(out) 52 | out = self.relu(out) 53 | 54 | out = self.conv2(out) 55 | out = self.bn2(out) 56 | out = self.relu(out) 57 | 58 | out = self.conv3(out) 59 | out = self.bn3(out) 60 | 61 | if self.downsample is not None: 62 | residual = self.downsample(x) 63 | 64 | out += residual 65 | out = self.relu(out) 66 | 67 | return out 68 | 69 | 70 | class ResNeXt(nn.Module): 71 | 72 | def __init__(self, block, layers, sample_size, sample_duration, shortcut_type='B', cardinality=32, num_classes=400, last_fc=True): 73 | self.last_fc = last_fc 74 | 75 | self.inplanes = 64 76 | super(ResNeXt, self).__init__() 77 | self.conv1 = nn.Conv3d(3, 64, kernel_size=7, stride=(1, 2, 2), 78 | padding=(3, 3, 3), bias=False) 79 | self.bn1 = nn.BatchNorm3d(64) 80 | self.relu = nn.ReLU(inplace=True) 81 | self.maxpool = nn.MaxPool3d(kernel_size=(3, 3, 3), stride=2, padding=1) 82 | self.layer1 = self._make_layer(block, 128, layers[0], shortcut_type, cardinality) 83 | self.layer2 = self._make_layer(block, 256, layers[1], shortcut_type, cardinality, stride=2) 84 | self.layer3 = self._make_layer(block, 512, layers[2], shortcut_type, cardinality, stride=2) 85 | self.layer4 = self._make_layer(block, 1024, layers[3], shortcut_type, cardinality, stride=2) 86 | last_duration = math.ceil(sample_duration / 16) 87 | last_size = math.ceil(sample_size / 32) 88 | self.avgpool = nn.AvgPool3d((last_duration, last_size, last_size), stride=1) 89 | self.fc = nn.Linear(cardinality * 32 * block.expansion, num_classes) 90 | 91 | for m in self.modules(): 92 | if isinstance(m, nn.Conv3d): 93 | n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels 94 | m.weight.data.normal_(0, math.sqrt(2. 
/ n)) 95 | elif isinstance(m, nn.BatchNorm3d): 96 | m.weight.data.fill_(1) 97 | m.bias.data.zero_() 98 | 99 | def _make_layer(self, block, planes, blocks, shortcut_type, cardinality, stride=1): 100 | downsample = None 101 | if stride != 1 or self.inplanes != planes * block.expansion: 102 | if shortcut_type == 'A': 103 | downsample = partial(downsample_basic_block, 104 | planes=planes * block.expansion, 105 | stride=stride) 106 | else: 107 | downsample = nn.Sequential( 108 | nn.Conv3d(self.inplanes, planes * block.expansion, 109 | kernel_size=1, stride=stride, bias=False), 110 | nn.BatchNorm3d(planes * block.expansion) 111 | ) 112 | 113 | layers = [] 114 | layers.append(block(self.inplanes, planes, cardinality, stride, downsample)) 115 | self.inplanes = planes * block.expansion 116 | for i in range(1, blocks): 117 | layers.append(block(self.inplanes, planes, cardinality)) 118 | 119 | return nn.Sequential(*layers) 120 | 121 | def forward(self, x): 122 | x = self.conv1(x) 123 | x = self.bn1(x) 124 | x = self.relu(x) 125 | x = self.maxpool(x) 126 | 127 | x = self.layer1(x) 128 | x = self.layer2(x) 129 | x = self.layer3(x) 130 | x = self.layer4(x) 131 | 132 | x = self.avgpool(x) 133 | 134 | x = x.view(x.size(0), -1) 135 | if self.last_fc: 136 | x = self.fc(x) 137 | 138 | return x 139 | 140 | def get_fine_tuning_parameters(model, ft_begin_index): 141 | if ft_begin_index == 0: 142 | return model.parameters() 143 | 144 | ft_module_names = [] 145 | for i in range(ft_begin_index, 5): 146 | ft_module_names.append('layer{}'.format(ft_begin_index)) 147 | ft_module_names.append('fc') 148 | 149 | parameters = [] 150 | for k, v in model.named_parameters(): 151 | for ft_module in ft_module_names: 152 | if ft_module in k: 153 | parameters.append({'params': v}) 154 | break 155 | else: 156 | parameters.append({'params': v, 'lr': 0.0}) 157 | 158 | return parameters 159 | 160 | def resnet50(**kwargs): 161 | """Constructs a ResNet-50 model. 162 | """ 163 | model = ResNeXt(ResNeXtBottleneck, [3, 4, 6, 3], **kwargs) 164 | return model 165 | 166 | def resnet101(**kwargs): 167 | """Constructs a ResNet-101 model. 168 | """ 169 | model = ResNeXt(ResNeXtBottleneck, [3, 4, 23, 3], **kwargs) 170 | return model 171 | 172 | def resnet152(**kwargs): 173 | """Constructs a ResNet-101 model. 
174 | """ 175 | model = ResNeXt(ResNeXtBottleneck, [3, 8, 36, 3], **kwargs) 176 | return model 177 | -------------------------------------------------------------------------------- /code/lecture_aware_embds/video_feature_extractor/videocnn/models/wide_resnet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.autograd import Variable 5 | import math 6 | from functools import partial 7 | 8 | __all__ = ['WideResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101'] 9 | 10 | 11 | def conv3x3x3(in_planes, out_planes, stride=1): 12 | # 3x3x3 convolution with padding 13 | return nn.Conv3d(in_planes, out_planes, kernel_size=3, 14 | stride=stride, padding=1, bias=False) 15 | 16 | 17 | def downsample_basic_block(x, planes, stride): 18 | out = F.avg_pool3d(x, kernel_size=1, stride=stride) 19 | zero_pads = torch.Tensor(out.size(0), planes - out.size(1), 20 | out.size(2), out.size(3), 21 | out.size(4)).zero_() 22 | if isinstance(out.data, torch.cuda.FloatTensor): 23 | zero_pads = zero_pads.cuda() 24 | 25 | out = Variable(torch.cat([out.data, zero_pads], dim=1)) 26 | 27 | return out 28 | 29 | 30 | class WideBottleneck(nn.Module): 31 | expansion = 2 32 | 33 | def __init__(self, inplanes, planes, stride=1, downsample=None): 34 | super(WideBottleneck, self).__init__() 35 | self.conv1 = nn.Conv3d(inplanes, planes, kernel_size=1, bias=False) 36 | self.bn1 = nn.BatchNorm3d(planes) 37 | self.conv2 = nn.Conv3d(planes, planes, kernel_size=3, stride=stride, 38 | padding=1, bias=False) 39 | self.bn2 = nn.BatchNorm3d(planes) 40 | self.conv3 = nn.Conv3d(planes, planes * self.expansion, kernel_size=1, bias=False) 41 | self.bn3 = nn.BatchNorm3d(planes * self.expansion) 42 | self.relu = nn.ReLU(inplace=True) 43 | self.downsample = downsample 44 | self.stride = stride 45 | 46 | def forward(self, x): 47 | residual = x 48 | 49 | out = self.conv1(x) 50 | out = self.bn1(out) 51 | out = self.relu(out) 52 | 53 | out = self.conv2(out) 54 | out = self.bn2(out) 55 | out = self.relu(out) 56 | 57 | out = self.conv3(out) 58 | out = self.bn3(out) 59 | 60 | if self.downsample is not None: 61 | residual = self.downsample(x) 62 | 63 | out += residual 64 | out = self.relu(out) 65 | 66 | return out 67 | 68 | 69 | class WideResNet(nn.Module): 70 | 71 | def __init__(self, block, layers, sample_size, sample_duration, k=1, shortcut_type='B', num_classes=400, last_fc=True): 72 | self.last_fc = last_fc 73 | 74 | self.inplanes = 64 75 | super(WideResNet, self).__init__() 76 | self.conv1 = nn.Conv3d(3, 64, kernel_size=7, stride=(1, 2, 2), 77 | padding=(3, 3, 3), bias=False) 78 | self.bn1 = nn.BatchNorm3d(64) 79 | self.relu = nn.ReLU(inplace=True) 80 | self.maxpool = nn.MaxPool3d(kernel_size=(3, 3, 3), stride=2, padding=1) 81 | self.layer1 = self._make_layer(block, 64 * k, layers[0], shortcut_type) 82 | self.layer2 = self._make_layer(block, 128 * k, layers[1], shortcut_type, stride=2) 83 | self.layer3 = self._make_layer(block, 256 * k, layers[2], shortcut_type, stride=2) 84 | self.layer4 = self._make_layer(block, 512 * k, layers[3], shortcut_type, stride=2) 85 | last_duration = math.ceil(sample_duration / 16) 86 | last_size = math.ceil(sample_size / 32) 87 | self.avgpool = nn.AvgPool3d((last_duration, last_size, last_size), stride=1) 88 | self.fc = nn.Linear(512 * k * block.expansion, num_classes) 89 | 90 | for m in self.modules(): 91 | if isinstance(m, nn.Conv3d): 92 | n = m.kernel_size[0] * m.kernel_size[1] * 
m.out_channels 93 | m.weight.data.normal_(0, math.sqrt(2. / n)) 94 | elif isinstance(m, nn.BatchNorm3d): 95 | m.weight.data.fill_(1) 96 | m.bias.data.zero_() 97 | 98 | def _make_layer(self, block, planes, blocks, shortcut_type, stride=1): 99 | downsample = None 100 | if stride != 1 or self.inplanes != planes * block.expansion: 101 | if shortcut_type == 'A': 102 | downsample = partial(downsample_basic_block, 103 | planes=planes * block.expansion, 104 | stride=stride) 105 | else: 106 | downsample = nn.Sequential( 107 | nn.Conv3d(self.inplanes, planes * block.expansion, 108 | kernel_size=1, stride=stride, bias=False), 109 | nn.BatchNorm3d(planes * block.expansion) 110 | ) 111 | 112 | layers = [] 113 | layers.append(block(self.inplanes, planes, stride, downsample)) 114 | self.inplanes = planes * block.expansion 115 | for i in range(1, blocks): 116 | layers.append(block(self.inplanes, planes)) 117 | 118 | return nn.Sequential(*layers) 119 | 120 | def forward(self, x): 121 | x = self.conv1(x) 122 | x = self.bn1(x) 123 | x = self.relu(x) 124 | x = self.maxpool(x) 125 | 126 | x = self.layer1(x) 127 | x = self.layer2(x) 128 | x = self.layer3(x) 129 | x = self.layer4(x) 130 | 131 | x = self.avgpool(x) 132 | 133 | x = x.view(x.size(0), -1) 134 | if self.last_fc: 135 | x = self.fc(x) 136 | 137 | return x 138 | 139 | def get_fine_tuning_parameters(model, ft_begin_index): 140 | if ft_begin_index == 0: 141 | return model.parameters() 142 | 143 | ft_module_names = [] 144 | for i in range(ft_begin_index, 5): 145 | ft_module_names.append('layer{}'.format(ft_begin_index)) 146 | ft_module_names.append('fc') 147 | 148 | parameters = [] 149 | for k, v in model.named_parameters(): 150 | for ft_module in ft_module_names: 151 | if ft_module in k: 152 | parameters.append({'params': v}) 153 | break 154 | else: 155 | parameters.append({'params': v, 'lr': 0.0}) 156 | 157 | return parameters 158 | 159 | def resnet50(**kwargs): 160 | """Constructs a ResNet-50 model. 161 | """ 162 | model = WideResNet(WideBottleneck, [3, 4, 6, 3], **kwargs) 163 | return model 164 | -------------------------------------------------------------------------------- /code/lecture_aware_embds/video_feature_extractor/videocnn/opts.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | def parse_opts(): 4 | parser = argparse.ArgumentParser() 5 | parser.add_argument('--input', default='input', type=str, help='Input file path') 6 | parser.add_argument('--video_root', default='', type=str, help='Root path of input videos') 7 | parser.add_argument('--model', default='', type=str, help='Model file path') 8 | parser.add_argument('--output', default='output.json', type=str, help='Output file path') 9 | parser.add_argument('--mode', default='score', type=str, help='Mode (score | feature). score outputs class scores. 
feature outputs features (after global average pooling).') 10 | parser.add_argument('--batch_size', default=32, type=int, help='Batch Size') 11 | parser.add_argument('--n_threads', default=4, type=int, help='Number of threads for multi-thread loading') 12 | parser.add_argument('--model_name', default='resnet', type=str, help='Currently only support resnet') 13 | parser.add_argument('--model_depth', default=34, type=int, help='Depth of resnet (10 | 18 | 34 | 50 | 101)') 14 | parser.add_argument('--resnet_shortcut', default='A', type=str, help='Shortcut type of resnet (A | B)') 15 | parser.add_argument('--wide_resnet_k', default=2, type=int, help='Wide resnet k') 16 | parser.add_argument('--resnext_cardinality', default=32, type=int, help='ResNeXt cardinality') 17 | parser.add_argument('--no_cuda', action='store_true', help='If true, cuda is not used.') 18 | parser.set_defaults(verbose=False) 19 | parser.add_argument('--verbose', action='store_true', help='') 20 | parser.set_defaults(verbose=False) 21 | 22 | args = parser.parse_args() 23 | 24 | return args 25 | -------------------------------------------------------------------------------- /code/lecture_aware_embds/video_feature_extractor/videocnn/spatial_transforms.py: -------------------------------------------------------------------------------- 1 | import random 2 | import math 3 | import numbers 4 | import collections 5 | import numpy as np 6 | import torch 7 | from PIL import Image, ImageOps 8 | try: 9 | import accimage 10 | except ImportError: 11 | accimage = None 12 | 13 | 14 | class Compose(object): 15 | """Composes several transforms together. 16 | Args: 17 | transforms (list of ``Transform`` objects): list of transforms to compose. 18 | Example: 19 | >>> transforms.Compose([ 20 | >>> transforms.CenterCrop(10), 21 | >>> transforms.ToTensor(), 22 | >>> ]) 23 | """ 24 | 25 | def __init__(self, transforms): 26 | self.transforms = transforms 27 | 28 | def __call__(self, img): 29 | for t in self.transforms: 30 | img = t(img) 31 | return img 32 | 33 | 34 | class ToTensor(object): 35 | """Convert a ``PIL.Image`` or ``numpy.ndarray`` to tensor. 36 | Converts a PIL.Image or numpy.ndarray (H x W x C) in the range 37 | [0, 255] to a torch.FloatTensor of shape (C x H x W) in the range [0.0, 1.0]. 38 | """ 39 | 40 | def __call__(self, pic): 41 | """ 42 | Args: 43 | pic (PIL.Image or numpy.ndarray): Image to be converted to tensor. 44 | Returns: 45 | Tensor: Converted image. 
46 | """ 47 | if isinstance(pic, np.ndarray): 48 | # handle numpy array 49 | img = torch.from_numpy(pic.transpose((2, 0, 1))) 50 | # backward compatibility 51 | return img.float() 52 | 53 | if accimage is not None and isinstance(pic, accimage.Image): 54 | nppic = np.zeros([pic.channels, pic.height, pic.width], dtype=np.float32) 55 | pic.copyto(nppic) 56 | return torch.from_numpy(nppic) 57 | 58 | # handle PIL Image 59 | if pic.mode == 'I': 60 | img = torch.from_numpy(np.array(pic, np.int32, copy=False)) 61 | elif pic.mode == 'I;16': 62 | img = torch.from_numpy(np.array(pic, np.int16, copy=False)) 63 | else: 64 | img = torch.ByteTensor(torch.ByteStorage.from_buffer(pic.tobytes())) 65 | # PIL image mode: 1, L, P, I, F, RGB, YCbCr, RGBA, CMYK 66 | if pic.mode == 'YCbCr': 67 | nchannel = 3 68 | elif pic.mode == 'I;16': 69 | nchannel = 1 70 | else: 71 | nchannel = len(pic.mode) 72 | img = img.view(pic.size[1], pic.size[0], nchannel) 73 | # put it from HWC to CHW format 74 | # yikes, this transpose takes 80% of the loading time/CPU 75 | img = img.transpose(0, 1).transpose(0, 2).contiguous() 76 | if isinstance(img, torch.ByteTensor): 77 | return img.float() 78 | else: 79 | return img 80 | 81 | 82 | class Normalize(object): 83 | """Normalize an tensor image with mean and standard deviation. 84 | Given mean: (R, G, B) and std: (R, G, B), 85 | will normalize each channel of the torch.*Tensor, i.e. 86 | channel = (channel - mean) / std 87 | Args: 88 | mean (sequence): Sequence of means for R, G, B channels respecitvely. 89 | std (sequence): Sequence of standard deviations for R, G, B channels 90 | respecitvely. 91 | """ 92 | 93 | def __init__(self, mean, std): 94 | self.mean = mean 95 | self.std = std 96 | 97 | def __call__(self, tensor): 98 | """ 99 | Args: 100 | tensor (Tensor): Tensor image of size (C, H, W) to be normalized. 101 | Returns: 102 | Tensor: Normalized image. 103 | """ 104 | # TODO: make efficient 105 | for t, m, s in zip(tensor, self.mean, self.std): 106 | t.sub_(m).div_(s) 107 | return tensor 108 | 109 | 110 | class Scale(object): 111 | """Rescale the input PIL.Image to the given size. 112 | Args: 113 | size (sequence or int): Desired output size. If size is a sequence like 114 | (w, h), output size will be matched to this. If size is an int, 115 | smaller edge of the image will be matched to this number. 116 | i.e, if height > width, then image will be rescaled to 117 | (size * height / width, size) 118 | interpolation (int, optional): Desired interpolation. Default is 119 | ``PIL.Image.BILINEAR`` 120 | """ 121 | 122 | def __init__(self, size, interpolation=Image.BILINEAR): 123 | assert isinstance(size, int) or (isinstance(size, collections.Iterable) and len(size) == 2) 124 | self.size = size 125 | self.interpolation = interpolation 126 | 127 | def __call__(self, img): 128 | """ 129 | Args: 130 | img (PIL.Image): Image to be scaled. 131 | Returns: 132 | PIL.Image: Rescaled image. 133 | """ 134 | if isinstance(self.size, int): 135 | w, h = img.size 136 | if (w <= h and w == self.size) or (h <= w and h == self.size): 137 | return img 138 | if w < h: 139 | ow = self.size 140 | oh = int(self.size * h / w) 141 | return img.resize((ow, oh), self.interpolation) 142 | else: 143 | oh = self.size 144 | ow = int(self.size * w / h) 145 | return img.resize((ow, oh), self.interpolation) 146 | else: 147 | return img.resize(self.size, self.interpolation) 148 | 149 | 150 | class CenterCrop(object): 151 | """Crops the given PIL.Image at the center. 
152 | Args: 153 | size (sequence or int): Desired output size of the crop. If size is an 154 | int instead of sequence like (h, w), a square crop (size, size) is 155 | made. 156 | """ 157 | 158 | def __init__(self, size): 159 | if isinstance(size, numbers.Number): 160 | self.size = (int(size), int(size)) 161 | else: 162 | self.size = size 163 | 164 | def __call__(self, img): 165 | """ 166 | Args: 167 | img (PIL.Image): Image to be cropped. 168 | Returns: 169 | PIL.Image: Cropped image. 170 | """ 171 | w, h = img.size 172 | th, tw = self.size 173 | x1 = int(round((w - tw) / 2.)) 174 | y1 = int(round((h - th) / 2.)) 175 | return img.crop((x1, y1, x1 + tw, y1 + th)) 176 | -------------------------------------------------------------------------------- /code/lecture_aware_embds/video_feature_extractor/videocnn/temporal_transforms.py: -------------------------------------------------------------------------------- 1 | import random 2 | import math 3 | 4 | 5 | class LoopPadding(object): 6 | def __init__(self, size): 7 | self.size = size 8 | 9 | def __call__(self, frame_indices): 10 | out = frame_indices 11 | 12 | for index in out: 13 | if len(out) >= self.size: 14 | break 15 | out.append(index) 16 | 17 | return out 18 | 19 | 20 | class TemporalCenterCrop(object): 21 | """Temporally crop the given frame indices at a center. 22 | 23 | If the number of frames is less than the size, 24 | loop the indices as many times as necessary to satisfy the size. 25 | 26 | Args: 27 | size (int): Desired output size of the crop. 28 | """ 29 | 30 | def __init__(self, size): 31 | self.size = size 32 | 33 | def __call__(self, frame_indices): 34 | """ 35 | Args: 36 | frame_indices (list): frame indices to be cropped. 37 | Returns: 38 | list: Cropped frame indices. 39 | """ 40 | 41 | center_index = len(frame_indices) // 2 42 | begin_index = max(0, center_index - (self.size // 2)) 43 | end_index = min(begin_index + self.size, len(frame_indices)) 44 | 45 | out = frame_indices[begin_index:end_index] 46 | 47 | for index in out: 48 | if len(out) >= self.size: 49 | break 50 | out.append(index) 51 | 52 | return out 53 | -------------------------------------------------------------------------------- /code/lecture_aware_embds/video_feature_extractor/videocnn/test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.autograd import Variable 3 | import time 4 | import os 5 | import sys 6 | import json 7 | 8 | from utils import AverageMeter 9 | 10 | 11 | def calculate_video_results(output_buffer, video_id, test_results, class_names): 12 | video_outputs = torch.stack(output_buffer) 13 | average_scores = torch.mean(video_outputs, dim=0) 14 | sorted_scores, locs = torch.topk(average_scores, k=10) 15 | 16 | video_results = [] 17 | for i in range(sorted_scores.size(0)): 18 | video_results.append({'label': class_names[locs[i]], 'score': sorted_scores[i]}) 19 | 20 | test_results['results'][video_id] = video_results 21 | 22 | 23 | def test(data_loader, model, opt, class_names): 24 | print('test') 25 | 26 | model.eval() 27 | 28 | batch_time = AverageMeter() 29 | data_time = AverageMeter() 30 | 31 | end_time = time.time() 32 | output_buffer = [] 33 | previous_video_id = '' 34 | test_results = {'results': {}} 35 | for i, (inputs, targets) in enumerate(data_loader): 36 | data_time.update(time.time() - end_time) 37 | 38 | inputs = Variable(inputs, volatile=True) 39 | outputs = model(inputs) 40 | 41 | for j in range(outputs.size(0)): 42 | if not (i == 0 and j == 0) and 
targets[j] != previous_video_id: 43 | calculate_video_results(output_buffer, previous_video_id, 44 | test_results, class_names) 45 | output_buffer = [] 46 | output_buffer.append(outputs[j].data.cpu()) 47 | previous_video_id = targets[j] 48 | 49 | if (i % 100) == 0: 50 | with open(os.path.join(opt.result_path, 51 | '{}.json'.format(opt.test_subset)), 52 | 'w') as f: 53 | json.dump(test_results, f) 54 | 55 | batch_time.update(time.time() - end_time) 56 | end_time = time.time() 57 | 58 | print('[{}/{}]\t' 59 | 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' 60 | 'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'.format( 61 | i + 1, len(data_loader), batch_time=batch_time, data_time=data_time)) 62 | with open(os.path.join(opt.result_path, 63 | '{}.json'.format(opt.test_subset)), 64 | 'w') as f: 65 | json.dump(test_results, f) 66 | -------------------------------------------------------------------------------- /code/lecture_aware_embds/video_feature_extractor/videocnn/train.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.autograd import Variable 3 | import time 4 | import os 5 | import sys 6 | 7 | from utils import AverageMeter, calculate_accuracy 8 | 9 | 10 | def train_epoch(epoch, data_loader, model, criterion, optimizer, opt, 11 | epoch_logger, batch_logger): 12 | print('train at epoch {}'.format(epoch)) 13 | 14 | model.train() 15 | 16 | batch_time = AverageMeter() 17 | data_time = AverageMeter() 18 | losses = AverageMeter() 19 | accuracies = AverageMeter() 20 | 21 | end_time = time.time() 22 | for i, (inputs, targets) in enumerate(data_loader): 23 | data_time.update(time.time() - end_time) 24 | 25 | if not opt.no_cuda: 26 | targets = targets.cuda(async=True) 27 | inputs = Variable(inputs) 28 | targets = Variable(targets) 29 | outputs = model(inputs) 30 | loss = criterion(outputs, targets) 31 | acc = calculate_accuracy(outputs, targets) 32 | 33 | losses.update(loss.data[0], inputs.size(0)) 34 | accuracies.update(acc, inputs.size(0)) 35 | 36 | optimizer.zero_grad() 37 | loss.backward() 38 | optimizer.step() 39 | 40 | batch_time.update(time.time() - end_time) 41 | end_time = time.time() 42 | 43 | batch_logger.log({ 44 | 'epoch': epoch, 45 | 'batch': i + 1, 46 | 'iter': (epoch - 1) * len(data_loader) + (i + 1), 47 | 'loss': losses.val, 48 | 'acc': accuracies.val, 49 | 'lr': optimizer.param_groups[0]['lr'] 50 | }) 51 | 52 | print('Epoch: [{0}][{1}/{2}]\t' 53 | 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' 54 | 'Data {data_time.val:.3f} ({data_time.avg:.3f})\t' 55 | 'Loss {loss.val:.4f} ({loss.avg:.4f})\t' 56 | 'Acc {acc.val:.3f} ({acc.avg:.3f})'.format( 57 | epoch, i + 1, len(data_loader), batch_time=batch_time, 58 | data_time=data_time, loss=losses, acc=accuracies)) 59 | 60 | epoch_logger.log({ 61 | 'epoch': epoch, 62 | 'loss': losses.avg, 63 | 'acc': accuracies.avg, 64 | 'lr': optimizer.param_groups[0]['lr'] 65 | }) 66 | 67 | if epoch % opt.checkpoint == 0: 68 | save_file_path = os.path.join(opt.result_path, 'save_{}.pth'.format(epoch)) 69 | states = { 70 | 'epoch': epoch + 1, 71 | 'arch': opt.arch, 72 | 'state_dict': model.state_dict(), 73 | 'optimizer' : optimizer.state_dict(), 74 | } 75 | torch.save(states, save_file_path) 76 | -------------------------------------------------------------------------------- /code/lecture_aware_embds/video_feature_extractor/videocnn/validation.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.autograd import 
Variable 3 | import time 4 | import sys 5 | 6 | from utils import AverageMeter, calculate_accuracy 7 | 8 | 9 | def val_epoch(epoch, data_loader, model, criterion, opt, logger): 10 | print('validation at epoch {}'.format(epoch)) 11 | 12 | model.eval() 13 | 14 | batch_time = AverageMeter() 15 | data_time = AverageMeter() 16 | losses = AverageMeter() 17 | accuracies = AverageMeter() 18 | 19 | end_time = time.time() 20 | for i, (inputs, targets) in enumerate(data_loader): 21 | data_time.update(time.time() - end_time) 22 | 23 | if not opt.no_cuda: 24 | targets = targets.cuda(async=True) 25 | inputs = Variable(inputs, volatile=True) 26 | targets = Variable(targets, volatile=True) 27 | outputs = model(inputs) 28 | loss = criterion(outputs, targets) 29 | acc = calculate_accuracy(outputs, targets) 30 | 31 | losses.update(loss.data[0], inputs.size(0)) 32 | accuracies.update(acc, inputs.size(0)) 33 | 34 | batch_time.update(time.time() - end_time) 35 | end_time = time.time() 36 | 37 | print('Epoch: [{0}][{1}/{2}]\t' 38 | 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' 39 | 'Data {data_time.val:.3f} ({data_time.avg:.3f})\t' 40 | 'Loss {loss.val:.4f} ({loss.avg:.4f})\t' 41 | 'Acc {acc.val:.3f} ({acc.avg:.3f})'.format( 42 | epoch, i + 1, len(data_loader), batch_time=batch_time, 43 | data_time=data_time, loss=losses, acc=accuracies)) 44 | 45 | logger.log({ 46 | 'epoch': epoch, 47 | 'loss': losses.avg, 48 | 'acc': accuracies.avg 49 | }) 50 | 51 | return losses.avg 52 | -------------------------------------------------------------------------------- /code/lecture_aware_embds/we_embd.py: -------------------------------------------------------------------------------- 1 | from gensim.models.keyedvectors import KeyedVectors 2 | 3 | import torch as th 4 | from torch.utils.data import Dataset 5 | import pickle 6 | import torch.nn.functional as F 7 | import numpy as np 8 | import re 9 | from collections import defaultdict 10 | from torch.utils.data.dataloader import default_collate 11 | 12 | from stop_words import ENGLISH_STOP_WORDS 13 | 14 | we_dim = 300 15 | max_words = 50 16 | 17 | we = KeyedVectors.load_word2vec_format('/ssd_scratch/cvit/darshan/data/GoogleNews-vectors-negative300.bin', binary=True) 18 | 19 | 20 | def _zero_pad_tensor(tensor, size): 21 | if len(tensor) >= size: 22 | return tensor[:size] 23 | else: 24 | zero = np.zeros((size - len(tensor), we_dim), dtype=np.float32) 25 | return np.concatenate((tensor, zero), axis=0) 26 | 27 | 28 | def _tokenize_text(sentence): 29 | w = re.findall(r"[\w']+", str(sentence)) 30 | return w 31 | 32 | def _words_to_we(words): 33 | # words = [word for word in words if word in self.we.vocab] 34 | words = list(map(lambda word: word.lower(), words)) 35 | words = [word for word in words if (word in we.vocab) and (word not in ENGLISH_STOP_WORDS)] 36 | if words: 37 | we_t = _zero_pad_tensor(we[words], max_words) 38 | return th.from_numpy(we_t) 39 | else: 40 | return th.zeros(max_words, we_dim) 41 | 42 | 43 | cap = "Time is suspect Train arrives at 7 o'clock Two simultaneous events Do two different observer agree? NO! 
Relativity of simultaneity" 44 | 45 | caption = _words_to_we(_tokenize_text(cap)) 46 | 47 | print(cap, caption.shape) -------------------------------------------------------------------------------- /figures/AVLectures_stats.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Darshansingh11/AVLectures/d5452d90d29961f28a89c5d1ff7bef88c3f66ca0/figures/AVLectures_stats.jpg --------------------------------------------------------------------------------