├── .gitignore ├── LICENSE ├── README.md ├── code ├── TW_FINCH │ ├── __pycache__ │ │ └── finch.cpython-38.pyc │ ├── bestMap.m │ ├── compute_clustScores.m │ ├── confusionmat.m │ ├── eval_fs.m │ ├── evaluate.m │ ├── finch.py │ ├── findgroups.m │ ├── hungarian.m │ └── main.py ├── helpers │ ├── Split_Videos │ │ ├── README.md │ │ ├── driver.py │ │ ├── make_splits.py │ │ └── parse_subtitles.py │ └── Trim_Intro │ │ ├── concat_subs.py │ │ ├── concat_video.sh │ │ ├── concat_videos.py │ │ ├── cut_intro.py │ │ ├── cut_intro.sh │ │ └── cut_srt.py └── lecture_aware_embds │ ├── args.py │ ├── avlectures_dataloader.py │ ├── eval.py │ ├── extract_feats.py │ ├── loss.py │ ├── loss_ce.py │ ├── loss_milnce.py │ ├── loss_mms.py │ ├── metrics.py │ ├── model.py │ ├── model_ef.py │ ├── stop_words.py │ ├── train.py │ ├── video_feature_extractor │ ├── LICENSE │ ├── README.md │ ├── create_feature_csv.py │ ├── create_feature_csv_indi.py │ ├── create_feature_csv_seg.py │ ├── create_pickle.py │ ├── create_pickle_indi.py │ ├── create_pickle_ocr.py │ ├── create_pickle_prevnext.py │ ├── create_pickle_seg2.py │ ├── create_pickle_seg2.sh │ ├── create_pickle_seg2_55.sh │ ├── create_pickle_seg2_mp.py │ ├── create_pickle_seg2_mp_55.sh │ ├── create_pickle_seg2_mp_92.sh │ ├── create_pickle_seg3.py │ ├── create_pickle_segmentation.py │ ├── create_pkl_tst.py │ ├── extract.py │ ├── extract_features_2d_indi.sh │ ├── extract_features_3d_indi.sh │ ├── helper_pkl.py │ ├── lec_list.py │ ├── merge_and_bert.py │ ├── merge_and_bert.sh │ ├── merge_and_bert_mp.py │ ├── model.py │ ├── ocr_bert_pickle.py │ ├── preprocessing.py │ ├── random_sequence_shuffler.py │ ├── readme.txt │ ├── video_loader.py │ └── videocnn │ │ ├── .gitignore │ │ ├── .opts.py.swp │ │ ├── LICENSE │ │ ├── README.md │ │ ├── class_names_list │ │ ├── classify.py │ │ ├── dataset.py │ │ ├── generate_result_video │ │ ├── README.md │ │ ├── SourceSansPro-Regular.ttf │ │ └── generate_result_video.py │ │ ├── input │ │ ├── main.py │ │ ├── mean.py │ │ ├── model.py │ │ ├── models │ │ ├── densenet.py │ │ ├── pre_act_resnet.py │ │ ├── resnet.py │ │ ├── resnext.py │ │ └── wide_resnet.py │ │ ├── opts.py │ │ ├── spatial_transforms.py │ │ ├── temporal_transforms.py │ │ ├── test.py │ │ ├── train.py │ │ └── validation.py │ └── we_embd.py └── figures └── AVLectures_stats.jpg /.gitignore: -------------------------------------------------------------------------------- 1 | *.npy 2 | *.pickle 3 | *.pth 4 | *.pkl 5 | 6 | # Byte-compiled / optimized / DLL files 7 | __pycache__/ 8 | *.py[cod] 9 | *$py.class 10 | 11 | # C extensions 12 | *.so 13 | 14 | # Distribution / packaging 15 | .Python 16 | build/ 17 | develop-eggs/ 18 | dist/ 19 | downloads/ 20 | eggs/ 21 | .eggs/ 22 | lib/ 23 | lib64/ 24 | parts/ 25 | sdist/ 26 | var/ 27 | wheels/ 28 | share/python-wheels/ 29 | *.egg-info/ 30 | .installed.cfg 31 | *.egg 32 | MANIFEST 33 | 34 | # PyInstaller 35 | # Usually these files are written by a python script from a template 36 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
37 | *.manifest 38 | *.spec 39 | 40 | # Installer logs 41 | pip-log.txt 42 | pip-delete-this-directory.txt 43 | 44 | # Unit test / coverage reports 45 | htmlcov/ 46 | .tox/ 47 | .nox/ 48 | .coverage 49 | .coverage.* 50 | .cache 51 | nosetests.xml 52 | coverage.xml 53 | *.cover 54 | .hypothesis/ 55 | .pytest_cache/ 56 | 57 | # Translations 58 | *.mo 59 | *.pot 60 | 61 | # Django stuff: 62 | *.log 63 | local_settings.py 64 | db.sqlite3 65 | 66 | # Flask stuff: 67 | instance/ 68 | .webassets-cache 69 | 70 | # Scrapy stuff: 71 | .scrapy 72 | 73 | # Sphinx documentation 74 | docs/_build/ 75 | 76 | # PyBuilder 77 | target/ 78 | 79 | # Jupyter Notebook 80 | code/lecture_aware_embds/.ipynb_checkpoints 81 | 82 | # IPython 83 | profile_default/ 84 | ipython_config.py 85 | 86 | # pyenv 87 | .python-version 88 | 89 | # celery beat schedule file 90 | celerybeat-schedule 91 | 92 | # SageMath parsed files 93 | *.sage.py 94 | 95 | # Environments 96 | .env 97 | .venv 98 | env/ 99 | venv/ 100 | ENV/ 101 | env.bak/ 102 | venv.bak/ 103 | 104 | # Spyder project settings 105 | .spyderproject 106 | .spyproject 107 | 108 | # Rope project settings 109 | .ropeproject 110 | 111 | # mkdocs documentation 112 | /site 113 | 114 | # mypy 115 | .mypy_cache/ 116 | .dmypy.json 117 | dmypy.json 118 | 119 | # Pyre type checker 120 | .pyre/ 121 | 122 | .prof 123 | 124 | code/lecture_aware_embds/test.py 125 | code/lecture_aware_embds/test_dataloader.py 126 | 127 | code/lecture_aware_embds/video_feature_extractor/model/ 128 | code/lecture_aware_embds/video_feature_extractor/extract_features_2d.sh 129 | code/lecture_aware_embds/video_feature_extractor/extract_features_3d.sh 130 | code/lecture_aware_embds/video_feature_extractor/create_pickle.sh 131 | code/lecture_aware_embds/video_feature_extractor/slurm-*.out 132 | 133 | code/lecture_aware_embds/*.sh 134 | code/lecture_aware_embds/slurm-*.out 135 | code/lecture_aware_embds/*.ipynb 136 | -------------------------------------------------------------------------------- /code/TW_FINCH/__pycache__/finch.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Darshansingh11/AVLectures/d5452d90d29961f28a89c5d1ff7bef88c3f66ca0/code/TW_FINCH/__pycache__/finch.cpython-38.pyc -------------------------------------------------------------------------------- /code/TW_FINCH/bestMap.m: -------------------------------------------------------------------------------- 1 | function [newL2] = bestMap(L1,L2) 2 | %bestmap: permute labels of L2 to match L1 as good as possible 3 | % [newL2] = bestMap(L1,L2); 4 | % 5 | % version 2.0 --May/2007 6 | % version 1.0 --November/2003 7 | % 8 | % Written by Deng Cai (dengcai AT gmail.com) 9 | 10 | 11 | %=========== 12 | 13 | L1 = L1(:); 14 | L2 = L2(:); 15 | if size(L1) ~= size(L2) 16 | error('size(L1) must == size(L2)'); 17 | end 18 | 19 | Label1 = unique(L1); 20 | nClass1 = length(Label1); 21 | Label2 = unique(L2); 22 | nClass2 = length(Label2); 23 | 24 | nClass = max(nClass1,nClass2); 25 | G = zeros(nClass); 26 | for i=1:nClass1 27 | for j=1:nClass2 28 | G(i,j) = length(find((L1 == Label1(i)) & (L2 == Label2(j)))); 29 | end 30 | end 31 | 32 | %% Compute Hungarian Match Matrix 33 | [c,t] = hungarian(-G); 34 | 35 | newL2 = zeros(size(L2)); 36 | 37 | for i=1:length(Label2) 38 | if c(i) <= nClass1 39 | newL2(L2 == Label2(i)) = Label1(c(i)); 40 | else 41 | newL2(L2 == Label2(i)) = c(i); 42 | end 43 | end 44 | -------------------------------------------------------------------------------- 
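The bestMap.m routine above remaps the predicted cluster labels onto the ground-truth labels via a Hungarian assignment over their overlap counts. A minimal Python sketch of the same idea is given below; it assumes NumPy and SciPy are available and is not part of the repository (which ships only the MATLAB version together with hungarian.m):

import numpy as np
from scipy.optimize import linear_sum_assignment

def best_map(labels_true, labels_pred):
    # Permute predicted cluster labels so they agree with the ground-truth
    # labels as much as possible (Hungarian assignment on overlap counts).
    labels_true = np.asarray(labels_true).ravel()
    labels_pred = np.asarray(labels_pred).ravel()
    true_ids = np.unique(labels_true)
    pred_ids = np.unique(labels_pred)
    n = max(len(true_ids), len(pred_ids))
    overlap = np.zeros((n, n), dtype=np.int64)
    for i, t in enumerate(true_ids):
        for j, p in enumerate(pred_ids):
            overlap[i, j] = np.sum((labels_true == t) & (labels_pred == p))
    # linear_sum_assignment minimises cost, so negate to maximise total overlap.
    row_ind, col_ind = linear_sum_assignment(-overlap)
    mapping = {pred_ids[j]: true_ids[i]
               for i, j in zip(row_ind, col_ind)
               if i < len(true_ids) and j < len(pred_ids)}
    return np.array([mapping.get(p, p) for p in labels_pred])

For example, best_map([1, 1, 2, 2], [5, 5, 7, 7]) returns array([1, 1, 2, 2]).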
/code/TW_FINCH/compute_clustScores.m: -------------------------------------------------------------------------------- 1 | function [fscore, precision, recall]=compute_clustScores(confusionMat) 2 | 3 | % confusionMat=confusionmat(true_label,predicted_label); 4 | 5 | acc = trace(confusionMat)/sum(confusionMat(:)); 6 | 7 | recall = diag(confusionMat)./sum(confusionMat,2); 8 | 9 | precision = diag(confusionMat)./sum(confusionMat,1)'; 10 | 11 | f1Scores = 2*(precision.*recall)./(precision +recall); 12 | 13 | precision(isnan(precision))=0; 14 | recall(isnan(recall))=0; 15 | f1Scores(isnan(f1Scores))=0; 16 | 17 | 18 | precision =mean(precision); 19 | recall= mean(recall); 20 | fscore = mean(f1Scores); 21 | 22 | end 23 | 24 | 25 | -------------------------------------------------------------------------------- /code/TW_FINCH/eval_fs.m: -------------------------------------------------------------------------------- 1 | function [mof, iou, fscore, res] = eval_fs(label_pre, label_gt, datasets_path) 2 | % Evaluates 50Salads dataset in Eval Mode 3 | 4 | % compute Hungarian based accuracy and IOU score 5 | res = bestMap(label_gt(:), label_pre(:)); 6 | 7 | label_gt = fs_eval_mode_map(label_gt, datasets_path); 8 | res = fs_eval_mode_map(res, datasets_path); 9 | 10 | 11 | 12 | mof = length(find(label_gt(:) == res(:)))/length(label_gt(:)); 13 | 14 | k = length(unique(res)); 15 | % compute IOU 16 | try 17 | iou= jaccard(categorical(label_gt), categorical(res)); 18 | catch 19 | iou= jaccard(label_gt, res); 20 | end 21 | % penalize under/over clustering equally in iou 22 | iou(isnan(iou))=0; 23 | iou= sum(iou)/ k; 24 | 25 | % compute fscore (compute_clustScores expects a confusion matrix) 26 | [fscore, ~, ~]=compute_clustScores(confusionmat(label_gt(:), res(:))); 27 | 28 | 29 | function labs = fs_eval_mode_map(labs, datasets_path) 30 | mapping_path = fullfile(datasets_path, '50Salads', 'mapping'); 31 | map=readtable(fullfile(mapping_path, 'mapping.txt')); 32 | label_str = map.Var2(labs); 33 | 34 | map_val=readtable(fullfile(mapping_path, 'mappingeval.txt')); 35 | map2=table([1:numel(map_val.Var2)]', 'RowNames', map_val.Var2); 36 | mapped_label = table2array(map2(label_str,1)); 37 | grp = map_val.Var1 +1; 38 | labs = grp(mapped_label); 39 | end 40 | end -------------------------------------------------------------------------------- /code/TW_FINCH/evaluate.m: -------------------------------------------------------------------------------- 1 | function [mof, iou] = evaluate(label_pre, label_gt, res, iou) 2 | % compute Hungarian based accuracy and IOU score 3 | % res = bestMap(label_gt(:), label_pre(:)); 4 | mof = length(find(label_gt(:) == res(:)))/length(label_gt(:)); 5 | 6 | k = length(unique(label_pre)); 7 | % % compute IOU 8 | % try 9 | % iou= jaccard(categorical(label_gt), categorical(res)); 10 | % catch 11 | % iou= jaccard(label_gt, res); 12 | % end 13 | % % penalize under/over clustering equally in iou 14 | iou(isnan(iou))=0; 15 | iou= sum(iou)/ k; 16 | % compute fscore 17 | % [fscore, ~, ~]=compute_clustScores(label_gt, res); 18 | 19 | end -------------------------------------------------------------------------------- /code/TW_FINCH/findgroups.m: -------------------------------------------------------------------------------- 1 | function [gnums, varargout] = findgroups(varargin) 2 | %FINDGROUPS Find groups and return group numbers 3 | % G = FINDGROUPS(A) returns G, a vector of group numbers created from the 4 | % grouping variable A. G contains integer values from 1 to N, indicating 5 | % N distinct groups for the N unique values in A.
6 | % 7 | % A is a categorical, numeric, logical, string, datetime, duration, 8 | % or calendarDuration vector, or a cell array of character vectors. 9 | % G has the same length as A. 10 | % 11 | % [G,ID] = FINDGROUPS(A) also returns ID, a vector of the N unique values 12 | % that identify each group in A. ID has the same type as A. 13 | % 14 | % [G,ID1,ID2,...] = FINDGROUPS(A1,A2,...) returns group numbers created 15 | % from one or more grouping variables A1,A2,... . Each group is defined 16 | % by a unique combination of values across A1,A2,... . 17 | % [ID1(J),ID2(J),...] contains the values that define the J-th group. 18 | % 19 | % [G,TID] = FINDGROUPS(T) returns group numbers created from the 20 | % variables in the table T. The length of G equals the number of rows of 21 | % T. Each group is defined by a unique combination of values in the rows 22 | % of T. TID is a table where TID(J,:) contains the values that define the 23 | % J-th group. 24 | % 25 | % FINDGROUPS returns NaNs for corresponding missing elements in A. 26 | % Examples of missing elements are: 27 | % * NaN in a double array 28 | % * '' in a cell array of character vectors 29 | % * Any element that displays as , without quotes 30 | % For more information on missing elements type "help ismissing". 31 | % 32 | % Examples: 33 | % % Load patients data. 34 | % % List Weight, Gender, and Smoker variables for patients. 35 | % load patients; 36 | % whos Weight Gender Smoker 37 | % 38 | % % Find the mean weights by gender. 39 | % G = findgroups(Gender); 40 | % Y = splitapply(@mean,Weight,G) 41 | % 42 | % % Find the median weights by gender. Create a table containing the 43 | % % results. 44 | % [G,gender] = findgroups(Gender); 45 | % medianWeight = splitapply(@median,Weight,G) 46 | % results = table(gender,medianWeight) 47 | % 48 | % % Find the mean weights for all four groups of patients. 49 | % G = findgroups(Gender,Smoker); 50 | % Y = splitapply(@mean,Weight,G) 51 | % 52 | % % Find the mean weights for the four groups of patients. Create a table 53 | % % containing the results. 54 | % [G,gender,smoker] = findgroups(Gender,Smoker); 55 | % meanWeight = splitapply(@mean,Weight,G); 56 | % results = table(gender,smoker,meanWeight) 57 | % 58 | % % Read power loss data into a table. 59 | % % Find the maximum power loss in each region and by cause of power 60 | % % outage. Specify the grouping variables using table indexing. 61 | % % Return the maximum power losses in a table. 62 | % T = readtable('outages.csv'); 63 | % summary(T) 64 | % [G,powerLoss] = findgroups(T(:,{'Region','Cause'})); 65 | % powerLoss.maxLoss = splitapply(@max,T.Loss,G) 66 | % 67 | % See also SPLITAPPLY, UNIQUE, ISMEMBER, ISMISSING 68 | 69 | % Copyright 2015-2019 The MathWorks, Inc. 70 | 71 | narginchk(1,inf); 72 | nargoutchk(0, nargin+1); 73 | 74 | % Parse inputs into grouping variables. 
Remember which grouping variables 75 | % come from table input and the corresponding variable names 76 | [groupVars, outVarIdx, tOutTemplate] = parseInput(varargin); 77 | 78 | % Call into the grouping helper function used by groupsummary 79 | inclnan = false; 80 | inclempty = false; 81 | if nargout <=1 82 | gnums = matlab.internal.math.mgrp2idx(groupVars,0,inclnan,inclempty); 83 | if isrow(groupVars{1}) 84 | gnums = gnums'; 85 | end 86 | return; 87 | else 88 | [gnums,~,gnames] = matlab.internal.math.mgrp2idx(groupVars,0,inclnan,inclempty); 89 | if isrow(groupVars{1}) 90 | gnums = gnums'; 91 | gnames = cellfun(@transpose,gnames,'UniformOutput',false); 92 | end 93 | end 94 | 95 | % Build output for group IDs 96 | varargout = cell(1, nargin); 97 | for i = 1:nargin 98 | if istable(tOutTemplate{i}) 99 | gnames_i = gnames(outVarIdx{i}); 100 | varargout{i} = table(gnames_i{:}); 101 | varargout{i}.Properties = tOutTemplate{i}.Properties; 102 | else 103 | varargout(i) = gnames(outVarIdx{i}); 104 | end 105 | end 106 | end 107 | 108 | %------------------------------------------------------------------------------- 109 | function [groupVars, outVarIdx, tOutTemplate] = parseInput(userInput) 110 | % ParseInput Extract grouping variables from user inputs. 111 | % [GROUPVARS, OUTVARIDX, TOUTTEMPLATE] = PARSEINPUT(USERINPUT) extracts 112 | % into GROUPVARS a cell array of grouping variables from USERINPUT. 113 | % PARSEINPUT does not verify types of USERINPUT. Variables in table 114 | % entries in USERINPUT are extracted as individual grouping variables; 115 | % non-table entries are treated as grouping variables on their own. 116 | % 117 | % OUTVARIDX is a cell array of indices the same length as USERINPUT. Each 118 | % cell contains indices into GROUPVARS. These indices indicate which 119 | % grouping variables in GROUPVARS correspond to each element in 120 | % USERINPUT. Cells of OUTVARIDX that correspond to table entries in 121 | % USERINPUT have the same number of indices as there are variables in the 122 | % table. 123 | % 124 | % TOUTTEMPLATE is a cell array the same length as USERINPUT. Cells that 125 | % corresponds to table entries in USERINPUT contain a 0-by-N sub-table 126 | % where N is the number of variables in that table. Cells that 127 | % corresponds to non-table entries in USERINPUT will be empty. 
128 | 129 | % Total number of grouping variables equal sum of non-table inputs and 130 | % total number of variables across all table inputs 131 | isTabularInput = cellfun(@matlab.internal.datatypes.istabular, userInput); 132 | nGrpVars = sum(cellfun(@width, userInput(isTabularInput))) + sum(~isTabularInput); 133 | 134 | groupVars = cell(1, nGrpVars); 135 | outVarIdx = cell(size(userInput)); 136 | tOutTemplate = cell(size(userInput)); 137 | 138 | % Extract grouping variables from userInput 139 | groupVarIdx = 0; % loop invariant: number of grouping variable already extracted 140 | for i = 1:length(userInput) 141 | if isTabularInput(i) 142 | t = userInput{i}; 143 | if istimetable(t), t = timetable2table(t,'ConvertRowTimes',false); end 144 | varIndices = groupVarIdx + (1:width(t)); 145 | for k = 1:numel(varIndices) 146 | groupVars{varIndices(k)} = t{:,k}; 147 | end 148 | tOutTemplate{i} = t([],:); 149 | tOutTemplate{i}.Properties.RowNames = {}; % clear rowNames for use as output template 150 | else 151 | varIndices = groupVarIdx + 1; 152 | groupVars(varIndices) = userInput(i); 153 | end 154 | 155 | outVarIdx{i} = varIndices; 156 | 157 | % Update loop invariant: number of grouping variable already extracted 158 | groupVarIdx = groupVarIdx + length(outVarIdx{i}); 159 | end 160 | 161 | if isempty(groupVars) 162 | throwAsCaller(MException(message('MATLAB:findgroups:GroupingVarNotVector'))); 163 | end 164 | 165 | end 166 | -------------------------------------------------------------------------------- /code/TW_FINCH/main.py: -------------------------------------------------------------------------------- 1 | from finch import FINCH 2 | import numpy as np 3 | import os 4 | import cv2 5 | import sys 6 | import pickle as pkl 7 | from tqdm import tqdm 8 | 9 | def check_clusters(cluster): 10 | prev_cluster = cluster[0] 11 | for c in cluster: 12 | if (c == prev_cluster) or (c == prev_cluster + 1): 13 | prev_cluster = c 14 | else: 15 | return False 16 | 17 | return True 18 | 19 | # learned lecture aware embeddings on which you want to perform clustering using TW-FINCH 20 | a = pkl.load(open('/ssd_scratch/cvit/darshan_2/seg_embds/2d3dOCR_ss_test50ft50.pkl', 'rb')) 21 | b = pkl.load(open('/home2/darshan.singh/combined_stats.pkl', 'rb')) 22 | 23 | d = {} 24 | cnt = 0 25 | cnt_2, cnt_3, cnt_4 = 0, 0, 0 26 | 27 | for course in a: 28 | course_cnt = 0 29 | for lec in a[course]: 30 | course_cnt += 1 31 | print("For course", course, "->", course_cnt) 32 | 33 | lst_alpha = [] 34 | 35 | for course in tqdm(list(a.keys())): 36 | d[course] = {} 37 | for lec in a[course]: 38 | cnt += 1 39 | clusters = b[course][lec]['num_segments'] 40 | vid_emb = a[course][lec]['vid_embd'] 41 | txt_emb = a[course][lec]['text_embd'] 42 | vidtext_emb = np.hstack((vid_emb, txt_emb)) 43 | d[course][lec] = {} 44 | 45 | _, _, vid_clusters = FINCH(vid_emb, req_clust=clusters, tw_finch=True, alpha = 1) 46 | alpha = 1 47 | while not check_clusters(vid_clusters): 48 | _, _, vid_clusters = FINCH(vid_emb, req_clust=clusters, tw_finch=True, alpha = alpha) 49 | alpha += 0.1 50 | if alpha > 5: 51 | lst_alpha.append(alpha) 52 | 53 | if not check_clusters(vid_clusters): 54 | cnt_2 += 1 55 | 56 | _, _, txt_clusters = FINCH(txt_emb, req_clust=clusters, tw_finch=True, alpha = 1) 57 | loop_counter = 0 58 | alpha = 1 59 | while not check_clusters(txt_clusters): 60 | _, _, txt_clusters = FINCH(txt_emb, req_clust=clusters, tw_finch=True, alpha = alpha) 61 | alpha += 0.1 62 | loop_counter += 1 63 | if alpha > 5: 64 | lst_alpha.append(alpha) 65 | 66 | if not 
check_clusters(txt_clusters): 67 | cnt_3 += 1 68 | 69 | _, _, vidtxt_clusters = FINCH(vidtext_emb, req_clust=clusters, tw_finch=True, alpha = 1) 70 | 71 | alpha = 1 72 | while not check_clusters(vidtxt_clusters): 73 | _, _, vidtxt_clusters = FINCH(vidtext_emb, req_clust=clusters, tw_finch=True, alpha = alpha) 74 | alpha += 0.1 75 | if alpha > 5: 76 | lst_alpha.append(alpha) 77 | 78 | if not check_clusters(vidtxt_clusters): 79 | cnt_4 += 1 80 | 81 | vid_clusters = vid_clusters.reshape(-1) 82 | txt_clusters = txt_clusters.reshape(-1) 83 | vidtxt_clusters = vidtxt_clusters.reshape(-1) 84 | d[course][lec]['vid_clusters'] = vid_clusters 85 | d[course][lec]['txt_clusters'] = txt_clusters 86 | d[course][lec]['vidtxt_clusters'] = vidtxt_clusters 87 | 88 | print(cnt, len(lst_alpha), cnt_2, cnt_3, cnt_4) 89 | print(lst_alpha) 90 | 91 | clusters = d.copy() 92 | lecs_excluded = {} 93 | 94 | for course in clusters: 95 | for lec in clusters[course]: 96 | for t in clusters[course][lec]: 97 | lst = [] 98 | for i in clusters[course][lec][t]: 99 | lst.append(i) 100 | prev_cluster = lst[0] 101 | for i, c in enumerate(lst): 102 | if (c == prev_cluster) or (c == prev_cluster + 1): 103 | prev_cluster = c 104 | else: 105 | if course not in lecs_excluded: 106 | lecs_excluded[course] = [] 107 | if lec not in lecs_excluded[course]: 108 | lecs_excluded[course].append(lec) 109 | break 110 | 111 | for course in lecs_excluded: 112 | for lec in lecs_excluded[course]: 113 | print(course, '->', lec) 114 | 115 | from os.path import join 116 | 117 | # Path where you want to save the predicted clusters 118 | with open(join('/ssd_scratch/cvit/darshan_2/clusters/2d3dOCR_ss_test50ft50.pkl'), 'wb') as handle: 119 | pkl.dump(d, handle, protocol=pkl.HIGHEST_PROTOCOL) -------------------------------------------------------------------------------- /code/helpers/Split_Videos/README.md: -------------------------------------------------------------------------------- 1 | # Split the videos using the following steps 2 | 3 | **Step 1** 4 | Execute the `driver.py` program. To specify the path of the DataSubset, use the optional `--base_dir` argument; the default path is `/ssd_scratch/cvit/AVLectures/DataSubset`. To specify the minimum and maximum duration of the splits, use the `--min_time` and `--max_time` arguments respectively (defaults: `min_time=7` seconds and `max_time=15` seconds). After this program runs, the following is created inside `base_dir`: 5 | 1. Inside each course directory, a folder called `splits_vid` is created, which contains the splits of all the videos (see the example command below).
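For example, to run Step 1 with the default values: `python driver.py --base_dir /ssd_scratch/cvit/AVLectures/DataSubset --min_time 7 --max_time 15`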
6 | 7 | 8 | -------------------------------------------------------------------------------- /code/helpers/Split_Videos/driver.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import subprocess 3 | import os 4 | 5 | from os.path import join 6 | from glob import glob 7 | 8 | import make_splits as ms 9 | import parse_subtitles as ps 10 | 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument("-b", "--base_dir", type=str, required=False, help="Path of DataSubset", default = '/ssd_scratch/cvit/AVLectures/DataSubset') 13 | parser.add_argument("-x", "--min_time", type=int, required=False, help="Minimum time (in seconds)", default = 7) 14 | parser.add_argument("-y", "--max_time", type=int, required=False, help="Maximum time (in seconds)", default = 15) 15 | 16 | args = parser.parse_args() 17 | 18 | base_dir = args.base_dir 19 | min_time = args.min_time 20 | max_time = args.max_time 21 | 22 | print("Base Directory: ", base_dir) 23 | print("Min time = ", min_time, "seconds") 24 | print("Max time = ", max_time, "seconds") 25 | 26 | dir_list = [] 27 | 28 | for dir in glob(join(base_dir, '*')): 29 | if 'mit' in dir: 30 | dir_list.append(dir) 31 | 32 | dir_list.sort() 33 | print(dir_list) 34 | 35 | for dir in dir_list: 36 | 37 | print("Inside Directory - ", dir) 38 | 39 | base_dir = dir 40 | 41 | # step 1 42 | 43 | p = ps.ParseSubtitle(base_dir = base_dir, min_time = min_time, max_time = max_time) 44 | 45 | srt_files = [] 46 | 47 | for fl in glob(join(base_dir, 'subtitles', '*')): 48 | if fl.endswith('.srt'): 49 | srt_files.append(fl) 50 | 51 | for fl in srt_files: 52 | print(fl) 53 | p.parse(fl) 54 | 55 | # step 2 56 | 57 | p.combine() 58 | 59 | # step 3 60 | 61 | m = ms.SplitVideo(base_dir = base_dir) 62 | m._split() 63 | 64 | print("Done Successfully") 65 | -------------------------------------------------------------------------------- /code/helpers/Split_Videos/make_splits.py: -------------------------------------------------------------------------------- 1 | from multiprocessing.connection import wait 2 | import subprocess, traceback 3 | import time 4 | import os 5 | from os.path import join 6 | import multiprocessing as mp 7 | from concurrent.futures import ThreadPoolExecutor, as_completed, ProcessPoolExecutor 8 | from tqdm import tqdm 9 | from glob import glob 10 | import threading 11 | 12 | import signal 13 | 14 | class SplitVideo(): 15 | 16 | # c = 0 17 | 18 | def __init__(self, base_dir): 19 | 20 | self.base_dir = base_dir 21 | self.delimiter = "@#@" 22 | 23 | def process_cmd(self, cmd): 24 | try: 25 | subprocess.call(cmd, shell=True) 26 | except KeyboardInterrupt: 27 | exit(0) 28 | except: 29 | traceback.print_exc() 30 | 31 | def _split(self): 32 | 33 | os.makedirs(join(self.base_dir, 'splits_vid'), exist_ok=True) 34 | 35 | CMD_ffmpeg = [] 36 | with open(join(self.base_dir, 'combined.txt'), 'r') as f: 37 | lines = f.readlines() 38 | for l in lines: 39 | name, _ , start, end = l.strip().split(self.delimiter) 40 | 41 | start = start.replace(',', '.') 42 | end = end.replace(',', '.') 43 | 44 | 45 | video_file = "" 46 | videofile_name = join(self.base_dir, "splits_vid", name) 47 | 48 | name = '-'.join(name.split('-')[:-1]) 49 | 50 | video_file = join(self.base_dir, 'videos', name + '.mp4') 51 | 52 | cmd1 = 'ffmpeg -hide_banner -loglevel error -ss {} -to {} -i {} -strict -2 {} -y'.format( 53 | start, end, video_file, videofile_name 54 | ) 55 | 56 | CMD_ffmpeg.append(cmd1) 57 | 58 | p = ThreadPoolExecutor(20) 59 | 60 | futures 
= [p.submit(self.process_cmd, j) for j in CMD_ffmpeg] 61 | _ = [r.result() for r in tqdm(as_completed(futures), total=len(futures))] -------------------------------------------------------------------------------- /code/helpers/Split_Videos/parse_subtitles.py: -------------------------------------------------------------------------------- 1 | import os 2 | from os.path import join 3 | import argparse 4 | import ast 5 | from glob import glob 6 | 7 | class ParseSubtitle(): 8 | 9 | def __init__(self, base_dir=None, min_time=7, max_time=15): 10 | 11 | self.base_dir = os.curdir if base_dir is None else base_dir 12 | self.min_time = min_time 13 | self.max_time = max_time 14 | self.delimiter = "@#@" 15 | 16 | def getTime(self, t): 17 | h, m, sms = t.split(":") 18 | if ',' in sms: # Example t = '00:00:03,980' 19 | s, ms = sms.split(",") 20 | elif '.' in sms: # Example t = '00:00:03.980' 21 | s, ms = sms.split(".") 22 | else: # Example t = '00:00:03' 23 | s = sms 24 | ms = '0' 25 | tm = 3600 * int(h) + 60 * int(m) + int(s) + int(ms.ljust(3, '0'))/1000 26 | return tm 27 | 28 | def toFFMPEGtime(self, t): 29 | ss, ms = divmod(t*1000, 1000) 30 | mm, ss = divmod(ss, 60) 31 | hh, mm = divmod(mm, 60) 32 | 33 | return "{:02d}:{:02d}:{:02d}.{:03d}".format(int(hh), int(mm), int(ss), int(ms)) 34 | 35 | def parse(self, filename): 36 | 37 | filename_split = filename.split('/') 38 | base_dir, filename = "/".join(filename_split[:-1]), filename_split[-1] 39 | 40 | name = filename.replace('.srt', '') 41 | 42 | outfile = join(base_dir, name + "_parsed.txt") 43 | 44 | sub_file = open(outfile, 'w') 45 | self.lines = [] 46 | with open(str(join(base_dir, filename)), 'r') as f: 47 | lines = f.readlines() 48 | 49 | row, st, en = "", "", "" 50 | start, num = 0, 0 51 | st_prev = "" 52 | en_prev = "" 53 | 54 | for line in lines: 55 | l = line.strip() 56 | l = l.replace(self.delimiter, '') # newly added 57 | if "-->" in l: 58 | start = 1 59 | row = "" 60 | tm = l.split(" ") 61 | st, en = tm[0].strip(), tm[2].strip() 62 | elif l != "": 63 | row += l + " " 64 | else: 65 | row += self.delimiter + st + self.delimiter + en 66 | if start: 67 | row = "{}-{:05d}.mp4{}{}\n".format(name, num, self.delimiter, row) 68 | self.lines.append(row) 69 | sub_file.write(row) 70 | num += 1 71 | start = 0 72 | st_prev = st 73 | en_prev = en 74 | 75 | if((st != st_prev or en == en_prev) and (st == en_prev)): 76 | row += self.delimiter + st + self.delimiter + en 77 | if start: 78 | row = "{}-{:05d}.mp4{}{}\n".format(name, num, self.delimiter, row) 79 | self.lines.append(row) 80 | sub_file.write(row) 81 | num += 1 82 | start = 0 83 | 84 | sub_file.close() 85 | self.merge(name, base_dir) 86 | 87 | def merge(self, yid, base_dir): 88 | 89 | outfile = join(base_dir, yid + "_merged.txt") 90 | sub_file = open(outfile, 'w') 91 | 92 | s = '00:00:00.000' 93 | make_start = 1 94 | gtLessmintime = False 95 | ll = [] 96 | tm = [] 97 | cnt = 0 98 | for line in self.lines: 99 | 100 | #row = line.strip().split('|') 101 | row = line.strip().split(self.delimiter) 102 | 103 | vid_id ,start, end = row[0], row[2], row[3] 104 | 105 | vid_id = "-".join(vid_id.split('-')[:-1]) 106 | 107 | if make_start: 108 | s = start 109 | 110 | gt = self.getTime(end) - self.getTime(s) 111 | 112 | if self.min_time <= gt <= self.max_time: 113 | gtLessmintime = False 114 | ll.append(row[1]) 115 | tm.append([row[1], row[2], row[3]]) 116 | sen = " ".join(ll) 117 | 118 | sub_file.write('{1}-{2:05d}.mp4{0}{3}{0}{4}{0}{5}\n'.format(self.delimiter, vid_id, cnt, sen, tm[0][1], tm[-1][2])) 119 | 
cnt += 1 120 | make_start = 1 121 | ll = [] 122 | tm = [] 123 | elif gt < self.min_time: 124 | gtLessmintime = True 125 | 126 | make_start = 0 127 | ll.append(row[1]) 128 | tm.append([row[1], row[2], row[3]]) 129 | 130 | else: # if gt > self.max_time 131 | gtLessmintime = False 132 | ll.append(row[1]) 133 | tm.append([row[1], row[2], row[3]]) 134 | sen = " ".join(ll) 135 | 136 | sub_file.write('{1}-{2:05d}.mp4{0}{3}{0}{4}{0}{5}\n'.format(self.delimiter, vid_id, cnt, sen, tm[0][1], tm[-1][2])) 137 | cnt += 1 138 | make_start = 1 139 | ll = [] 140 | tm = [] 141 | 142 | if gtLessmintime: 143 | sen = " ".join(ll) 144 | sub_file.write('{1}-{2:05d}.mp4{0}{3}{0}{4}{0}{5}\n'.format(self.delimiter, vid_id, cnt, sen, tm[0][1], tm[-1][2])) 145 | 146 | sub_file.close() 147 | 148 | def combine(self): 149 | 150 | combined_file = open(join(self.base_dir, 'combined.txt'), 'w') 151 | for fl in glob(join(self.base_dir, 'subtitles', '*_merged.txt')): 152 | 153 | with open(fl, 'r') as f: 154 | lines = f.readlines() 155 | for line in lines: 156 | combined_file.write(line) 157 | 158 | combined_file.close() -------------------------------------------------------------------------------- /code/helpers/Trim_Intro/concat_subs.py: -------------------------------------------------------------------------------- 1 | import subprocess, traceback 2 | import os 3 | import argparse 4 | 5 | import cv2 6 | 7 | import pickle as pkl 8 | 9 | from os.path import join 10 | from glob import glob 11 | from pathlib import Path 12 | from tqdm import tqdm 13 | 14 | def get_length(filename): 15 | result = subprocess.run(["ffprobe", "-v", "error", "-show_entries", 16 | "format=duration", "-of", 17 | "default=noprint_wrappers=1:nokey=1", filename], 18 | stdout=subprocess.PIPE, 19 | stderr=subprocess.STDOUT) 20 | return float(result.stdout) 21 | 22 | def get_length_cv2(vid_path): 23 | video = cv2.VideoCapture(vid_path) 24 | frame_count = video.get(cv2.CAP_PROP_FRAME_COUNT) 25 | fps = video.get(cv2.CAP_PROP_FPS) 26 | return round(frame_count / fps, 2) 27 | 28 | def getTime(t): 29 | h, m, sms = t.split(":") 30 | if ',' in sms: # Example t = '00:00:03,980' 31 | s, ms = sms.split(",") 32 | elif '.' 
in sms: # Example t = '00:00:03.980' 33 | s, ms = sms.split(".") 34 | else: # Example t = '00:00:03' 35 | s = sms 36 | ms = '0' 37 | tm = 3600 * int(h) + 60 * int(m) + int(s) + int(ms.ljust(3, '0'))/1000 38 | return tm 39 | 40 | def toFFMPEGtime(t): 41 | ss, ms = divmod(t*1000, 1000) 42 | mm, ss = divmod(ss, 60) 43 | hh, mm = divmod(mm, 60) 44 | 45 | return "{:02d}:{:02d}:{:02d},{:03d}".format(int(hh), int(mm), int(ss), int(ms)) 46 | 47 | 48 | def concat_subtitles(course, out_sub_filename, sub_files): 49 | num_sub_files = len(sub_files) 50 | offset = 0 51 | cnt = 1 52 | 53 | for sub in sub_files: 54 | with open(join(course, 'subtitles', sub), 'r') as orig_sub, open(join(course, 'concatenated_subtitles', out_sub_filename), 'a') as concat_sub: 55 | lines = orig_sub.readlines() 56 | endHere = False 57 | last_arrow_idx = None 58 | for i, line in reversed(list(enumerate(lines))): 59 | l = line.strip() 60 | if '-->' in l: 61 | last_arrow_idx = i 62 | break 63 | 64 | for i, line in enumerate(lines): 65 | l = line.strip() 66 | 67 | if i < len(lines) - 1: 68 | next_line = lines[i + 1].strip() 69 | 70 | if '-->' in next_line: 71 | concat_sub.write('{}\n'.format(str(cnt))) 72 | cnt += 1 73 | endHere = False 74 | continue 75 | 76 | if '-->' in l: 77 | ts = l.split(' ') 78 | st, en = ts[0].strip(), ts[2].strip() 79 | st_shifted = toFFMPEGtime(getTime(st) + offset) 80 | en_shifted = toFFMPEGtime(getTime(en) + offset) 81 | 82 | if i == last_arrow_idx: 83 | last_arrow_time = toFFMPEGtime(get_length_cv2(join(course, 'videos', sub.replace('.srt', '.mp4'))) + offset) 84 | concat_sub.write('{} --> {}\n'.format(st_shifted, last_arrow_time)) 85 | else: 86 | concat_sub.write('{} --> {}\n'.format(st_shifted, en_shifted)) 87 | endHere = True 88 | else: 89 | concat_sub.write('{}'.format(line)) 90 | endHere = False 91 | 92 | if endHere: 93 | concat_sub.write('\n') 94 | 95 | offset += get_length_cv2(join(course, 'videos', sub.replace('.srt', '.mp4'))) 96 | 97 | sub_files = ['MIT8_01F16_L19v01_360p.srt', 'MIT8_01F16_L19v02_360p.srt', 'MIT8_01F16_L19v03_360p.srt', 98 | 'MIT8_01F16_L19v04_360p.srt', 'MIT8_01F16_L19v05_360p.srt', 'MIT8_01F16_L19v06_360p.srt', 99 | 'MIT8_01F16_L19v07_360p.srt'] 100 | 101 | concat_subtitles('/ssd_scratch/cvit/darshan/dataset_MITOCW_v1/mit032', 'L19.srt', sub_files) -------------------------------------------------------------------------------- /code/helpers/Trim_Intro/concat_video.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH -A $USER 3 | 4 | module load ffmpeg/4.4.1 5 | 6 | echo "Concat started" 7 | 8 | python concat_videos.py 9 | 10 | echo "DONE successfully" 11 | -------------------------------------------------------------------------------- /code/helpers/Trim_Intro/concat_videos.py: -------------------------------------------------------------------------------- 1 | import subprocess, traceback 2 | import os 3 | import argparse 4 | import pickle as pkl 5 | 6 | from os.path import join 7 | from glob import glob 8 | from pathlib import Path 9 | from tqdm import tqdm 10 | import cv2 11 | 12 | from collections import OrderedDict 13 | from concurrent.futures import ThreadPoolExecutor, as_completed 14 | 15 | base_dir = '/ssd_scratch/cvit/darshan/dataset_MITOCW_v1' 16 | 17 | num_workers = 20 18 | 19 | delimiter = '@@' 20 | 21 | def get_length(filename): 22 | result = subprocess.run(["ffprobe", "-v", "error", "-show_entries", 23 | "format=duration", "-of", 24 | "default=noprint_wrappers=1:nokey=1", filename], 25 | 
stdout=subprocess.PIPE, 26 | stderr=subprocess.STDOUT) 27 | return float(result.stdout) 28 | 29 | def get_length_cv2(vid_path): 30 | video = cv2.VideoCapture(vid_path) 31 | frame_count = video.get(cv2.CAP_PROP_FRAME_COUNT) 32 | fps = video.get(cv2.CAP_PROP_FPS) 33 | return round(frame_count / fps, 3) 34 | 35 | def getTime(t): 36 | h, m, sms = t.split(":") 37 | if ',' in sms: # Example t = '00:00:03,980' 38 | s, ms = sms.split(",") 39 | elif '.' in sms: # Example t = '00:00:03.980' 40 | s, ms = sms.split(".") 41 | else: # Example t = '00:00:03' 42 | s = sms 43 | ms = '0' 44 | tm = 3600 * int(h) + 60 * int(m) + int(s) + int(ms.ljust(3, '0'))/1000 45 | return tm 46 | 47 | def toFFMPEGtime(t): 48 | ss, ms = divmod(t*1000, 1000) 49 | mm, ss = divmod(ss, 60) 50 | hh, mm = divmod(mm, 60) 51 | 52 | return "{:02d}:{:02d}:{:02d},{:03d}".format(int(hh), int(mm), int(ss), int(ms)) 53 | 54 | def process_cmd(cmd): 55 | try: 56 | subprocess.call(cmd, shell=True) 57 | except KeyboardInterrupt: 58 | exit(0) 59 | except: 60 | traceback.print_exc() 61 | 62 | def concat_subtitles(course, out_sub_filename, sub_files): 63 | num_sub_files = len(sub_files) 64 | offset = 0 65 | cnt = 1 66 | 67 | for sub in sub_files: 68 | with open(join(course, 'subtitles', sub), 'r') as orig_sub, open(join(course, 'concatenated_subtitles', out_sub_filename), 'a') as concat_sub: 69 | lines = orig_sub.readlines() 70 | endHere = False 71 | last_arrow_idx = None 72 | for i, line in reversed(list(enumerate(lines))): 73 | l = line.strip() 74 | if '-->' in l: 75 | last_arrow_idx = i 76 | break 77 | 78 | for i, line in enumerate(lines): 79 | l = line.strip() 80 | 81 | if i < len(lines) - 1: 82 | next_line = lines[i + 1].strip() 83 | 84 | if '-->' in next_line: 85 | concat_sub.write('{}\n'.format(str(cnt))) 86 | cnt += 1 87 | endHere = False 88 | continue 89 | 90 | if '-->' in l: 91 | ts = l.split(' ') 92 | st, en = ts[0].strip(), ts[2].strip() 93 | st_shifted = toFFMPEGtime(getTime(st) + offset) 94 | en_shifted = toFFMPEGtime(getTime(en) + offset) 95 | 96 | if i == last_arrow_idx: 97 | last_arrow_time = toFFMPEGtime(get_length_cv2(join(course, 'videos', sub.replace('.srt', '.mp4'))) + offset) 98 | concat_sub.write('{} --> {}\n'.format(st_shifted, last_arrow_time)) 99 | else: 100 | concat_sub.write('{} --> {}\n'.format(st_shifted, en_shifted)) 101 | endHere = True 102 | else: 103 | concat_sub.write('{}'.format(line)) 104 | endHere = False 105 | 106 | if endHere: 107 | concat_sub.write('\n') 108 | 109 | offset += get_length_cv2(join(course, 'videos', sub.replace('.srt', '.mp4'))) 110 | 111 | course_list = [] 112 | 113 | for course in glob(join(base_dir, '*')): 114 | if 'mit' in course: 115 | course_list.append(course) 116 | 117 | course_list.sort() 118 | 119 | for course in course_list: 120 | Path(join(course, 'concatenated_videos')).mkdir(parents=True, exist_ok=True) 121 | Path(join(course, 'concatenated_subtitles')).mkdir(parents=True, exist_ok=True) 122 | 123 | course_name = course.split('/')[-1] 124 | print("Inside course - ", course_name) 125 | 126 | 127 | cmd_list = [] 128 | 129 | with open(join(Path.home(), 'Segmentation', 'segments', 'lecname', course_name + '.txt')) as f: 130 | lines = f.readlines() 131 | 132 | course_stats = OrderedDict() 133 | 134 | for line in lines: 135 | l = line.strip() 136 | segment_name = l.split(delimiter)[0] 137 | lec_name = l.split(delimiter)[-1] 138 | 139 | if lec_name not in course_stats: 140 | course_stats[lec_name] = [] 141 | course_stats[lec_name].append(segment_name) 142 | 143 | for lec_name in 
course_stats: 144 | 145 | num_segments = len(course_stats[lec_name]) 146 | 147 | # Usual concat command 148 | concat_cmd = "ffmpeg -hide_banner -loglevel error" 149 | for i in range(num_segments): 150 | concat_cmd += " -i {}".format(join(course, 'videos', course_stats[lec_name][i])) 151 | concat_cmd += ' -filter_complex "' 152 | for i in range(num_segments): 153 | concat_cmd += "[{0}:v] [{0}:a] ".format(i) 154 | concat_cmd += 'concat=n={}:v=1:a=1 [v] [a]"'.format(num_segments) 155 | concat_cmd += ' -map "[v]" -map "[a]" {}'.format(join(course, 'concatenated_videos', lec_name + '.mp4')) 156 | 157 | 158 | sub_list = [] 159 | for i in range(num_segments): 160 | sub_list.append(course_stats[lec_name][i].replace('.mp4', '.srt')) 161 | 162 | concat_subtitles(course, lec_name + '.srt', sub_list) 163 | 164 | cmd_list.append(concat_cmd) 165 | 166 | p = ThreadPoolExecutor(num_workers) 167 | 168 | futures = [p.submit(process_cmd, j) for j in cmd_list] 169 | _ = [r.result() for r in tqdm(as_completed(futures), total=len(futures))] 170 | 171 | 172 | print("DONE SUCCESSFULLY") -------------------------------------------------------------------------------- /code/helpers/Trim_Intro/cut_intro.py: -------------------------------------------------------------------------------- 1 | import subprocess, traceback 2 | import os 3 | import argparse 4 | 5 | import pickle as pkl 6 | 7 | from os.path import join 8 | from glob import glob 9 | from pathlib import Path 10 | from tqdm import tqdm 11 | 12 | from concurrent.futures import ThreadPoolExecutor, as_completed 13 | 14 | base_dir = '/ssd_scratch/cvit/darshan/dataset_MITOCW_v1' 15 | 16 | num_workers = 30 17 | 18 | def getTime(t): 19 | h, m, sms = t.split(":") 20 | if ',' in sms: # Example t = '00:00:03,980' 21 | s, ms = sms.split(",") 22 | elif '.' 
in sms: # Example t = '00:00:03.980' 23 | s, ms = sms.split(".") 24 | else: # Example t = '00:00:03' 25 | s = sms 26 | ms = '0' 27 | tm = 3600 * int(h) + 60 * int(m) + int(s) + int(ms.ljust(3, '0'))/1000 28 | return tm 29 | 30 | def toFFMPEGtime(t): 31 | ss, ms = divmod(t*1000, 1000) 32 | mm, ss = divmod(ss, 60) 33 | hh, mm = divmod(mm, 60) 34 | 35 | return "{:02d}:{:02d}:{:02d},{:03d}".format(int(hh), int(mm), int(ss), int(ms)) 36 | 37 | def trim_subtitles(course, sub_file, start_ts, end_ts): 38 | 39 | with open(join(course, 'subtitles', sub_file), 'r') as orig_sub, open(join(course, 'trimmed_subtitles', sub_file), 'w') as trimmed_sub: 40 | start_copy = False 41 | end_copy = False 42 | count = 1 43 | 44 | lines = orig_sub.readlines() 45 | for i, line in enumerate(lines): 46 | l = line.strip() 47 | 48 | if '-->' in l and not start_copy: 49 | ts = l.split(' ') 50 | st, en = ts[0].strip(), ts[2].strip() 51 | if (getTime(st) - start_ts < 0) and (getTime(en) - start_ts <= 0): 52 | start_copy = False 53 | else: 54 | start_copy = True 55 | trimmed_sub.write('{}\n'.format(str(count))) 56 | count += 1 57 | 58 | if getTime(st) != start_ts: 59 | trimmed_sub.write('{} --> {}\n'.format('00:00:00,000', toFFMPEGtime(abs(getTime(en) - start_ts)))) 60 | continue 61 | 62 | 63 | if start_copy and not end_copy: 64 | 65 | if i < len(lines) - 1: 66 | next_line = lines[i + 1].strip() 67 | if '-->' in next_line: 68 | next_line_st = next_line.split(' ')[0] 69 | if getTime(next_line_st) >= end_ts: 70 | end_copy = True 71 | return None 72 | trimmed_sub.write('{}\n'.format(str(count))) 73 | count += 1 74 | continue 75 | 76 | if '-->' in l: 77 | ts = l.split(' ') 78 | st, en = ts[0].strip(), ts[2].strip() 79 | 80 | st_shifted = toFFMPEGtime(getTime(st) - start_ts) 81 | en_shifted = toFFMPEGtime(getTime(en) - start_ts) 82 | trimmed_sub.write('{} --> {}\n'.format(st_shifted, en_shifted)) 83 | else: 84 | trimmed_sub.write(line) 85 | 86 | 87 | def process_cmd(cmd): 88 | try: 89 | subprocess.call(cmd, shell=True) 90 | except KeyboardInterrupt: 91 | exit(0) 92 | except: 93 | traceback.print_exc() 94 | 95 | course_list = [] 96 | 97 | for course in glob(join(base_dir, '*')): 98 | if 'mit' in course: 99 | course_list.append(course) 100 | 101 | course_list.sort() 102 | 103 | for course in course_list: 104 | print(course) 105 | 106 | for course in course_list: 107 | 108 | Path(join(course, 'trimmed_videos')).mkdir(parents=True, exist_ok=True) 109 | Path(join(course, 'trimmed_subtitles')).mkdir(parents=True, exist_ok=True) 110 | 111 | course_name = course.split('/')[-1] 112 | print("Inside course - ", course_name) 113 | 114 | 115 | cmd_list = [] 116 | 117 | pkl_file = pkl.load(open(join(Path.home(), 'Segmentation', 'segments', 'stats', course_name + '.pkl'), 'rb')) 118 | 119 | for lec_name in pkl_file: 120 | vid_name = lec_name + '.mp4' 121 | sub_file = lec_name + '.srt' 122 | start_ts = pkl_file[lec_name]['st'] 123 | end_ts = pkl_file[lec_name]['en'] 124 | 125 | trim_cmd = 'ffmpeg -hide_banner -loglevel error -i {} -ss {} -to {} -strict -2 {} -y'.format( 126 | join(course, 'videos', vid_name), start_ts, end_ts, join(course, 'trimmed_videos', vid_name) 127 | ) 128 | 129 | trim_subtitles(course, sub_file, int(start_ts), int(end_ts)) 130 | 131 | cmd_list.append(trim_cmd) 132 | 133 | p = ThreadPoolExecutor(num_workers) 134 | 135 | futures = [p.submit(process_cmd, j) for j in cmd_list] 136 | _ = [r.result() for r in tqdm(as_completed(futures), total=len(futures))] 137 | 138 | 139 | print("DONE SUCCESSFULLY") 
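# Note on the input format (an assumption inferred from the code above, not
# documented in the repository): each per-course stats pickle loaded from
# ~/Segmentation/segments/stats/<course_name>.pkl is expected to map
# lecture_name -> {'st': start_time_in_seconds, 'en': end_time_in_seconds};
# these values are passed to ffmpeg via -ss/-to and, cast to int, to trim_subtitles().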
-------------------------------------------------------------------------------- /code/helpers/Trim_Intro/cut_intro.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH -A $USER 3 | 4 | module load ffmpeg/4.4.1 5 | 6 | echo "Trimming started" 7 | 8 | python cut_intro.py 9 | 10 | echo "DONE successfully" 11 | -------------------------------------------------------------------------------- /code/helpers/Trim_Intro/cut_srt.py: -------------------------------------------------------------------------------- 1 | import subprocess, traceback 2 | import os 3 | import argparse 4 | 5 | from os.path import join 6 | from glob import glob 7 | from pathlib import Path 8 | from tqdm import tqdm 9 | 10 | from concurrent.futures import ThreadPoolExecutor, as_completed 11 | 12 | base_dir = '/ssd_scratch/cvit/darshan/dataset_MITOCW_v1' 13 | 14 | def getTime(t): 15 | h, m, sms = t.split(":") 16 | if ',' in sms: # Example t = '00:00:03,980' 17 | s, ms = sms.split(",") 18 | elif '.' in sms: # Example t = '00:00:03.980' 19 | s, ms = sms.split(".") 20 | else: # Example t = '00:00:03' 21 | s = sms 22 | ms = '0' 23 | tm = 3600 * int(h) + 60 * int(m) + int(s) + int(ms.ljust(3, '0'))/1000 24 | return tm 25 | 26 | def toFFMPEGtime(t): 27 | ss, ms = divmod(t*1000, 1000) 28 | mm, ss = divmod(ss, 60) 29 | hh, mm = divmod(mm, 60) 30 | 31 | return "{:02d}:{:02d}:{:02d},{:03d}".format(int(hh), int(mm), int(ss), int(ms)) 32 | 33 | def trim_subtitles(course, sub_file, start_ts, end_ts): 34 | 35 | with open(join(course, 'subtitles', sub_file), 'r') as orig_sub, open(join(course, 'trimmed_subtitles', sub_file), 'w') as trimmed_sub: 36 | start_copy = False 37 | end_copy = False 38 | count = 1 39 | 40 | lines = orig_sub.readlines() 41 | for i, line in enumerate(lines): 42 | l = line.strip() 43 | 44 | if '-->' in l and not start_copy: 45 | ts = l.split(' ') 46 | st, en = ts[0].strip(), ts[2].strip() 47 | if (getTime(st) - start_ts < 0) and (getTime(en) - start_ts <= 0): 48 | start_copy = False 49 | else: 50 | start_copy = True 51 | trimmed_sub.write('{}\n'.format(str(count))) 52 | count += 1 53 | 54 | if getTime(st) != start_ts: 55 | trimmed_sub.write('{} --> {}\n'.format('00:00:00,000', toFFMPEGtime(abs(getTime(en) - start_ts)))) 56 | continue 57 | 58 | 59 | if start_copy and not end_copy: 60 | 61 | if i < len(lines) - 1: 62 | next_line = lines[i + 1].strip() 63 | if '-->' in next_line: 64 | next_line_st = next_line.split(' ')[0] 65 | if getTime(next_line_st) >= end_ts: 66 | end_copy = True 67 | return None 68 | trimmed_sub.write('{}\n'.format(str(count))) 69 | count += 1 70 | continue 71 | 72 | if '-->' in l: 73 | ts = l.split(' ') 74 | st, en = ts[0].strip(), ts[2].strip() 75 | 76 | st_shifted = toFFMPEGtime(getTime(st) - start_ts) 77 | en_shifted = toFFMPEGtime(getTime(en) - start_ts) 78 | trimmed_sub.write('{} --> {}\n'.format(st_shifted, en_shifted)) 79 | else: 80 | trimmed_sub.write(line) 81 | 82 | 83 | 84 | course_list = [] 85 | 86 | for course in glob(join(base_dir, '*')): 87 | if 'mit' in course: 88 | course_list.append(course) 89 | 90 | course_list.sort() 91 | 92 | for course in course_list: 93 | print(course) 94 | 95 | trim_subtitles('/ssd_scratch/cvit/darshan/dataset_MITOCW_v1/mit001', 'ocw-18.01-f07-lec01_300k.srt', 22, 120) -------------------------------------------------------------------------------- /code/lecture_aware_embds/args.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | def 
get_args(description='Text-Video'): 4 | parser = argparse.ArgumentParser(description=description) 5 | parser.add_argument( 6 | '--train_csv', 7 | type=str, 8 | default='data/v1.csv', 9 | help='train csv') 10 | parser.add_argument( 11 | '--features_path_2D', 12 | type=str, 13 | default='feature_2d', 14 | help='feature path for 2D features') 15 | parser.add_argument( 16 | '--features_path_3D', 17 | type=str, 18 | default='feature_3d', 19 | help='feature path for 3D features') 20 | parser.add_argument( 21 | '--caption_path', 22 | type=str, 23 | default='data/caption.pickle', 24 | help='caption pickle file path') 25 | parser.add_argument( 26 | '--word2vec_path', 27 | type=str, 28 | default='data/GoogleNews-vectors-negative300.bin', 29 | help='word embedding path') 30 | parser.add_argument( 31 | '--word2vec', 32 | dest='word2vec', 33 | action='store_true', 34 | help='If you want to use word2vec embeddings') 35 | parser.add_argument( 36 | '--BERT', 37 | dest='word2vec', 38 | action='store_false', 39 | help='If you want to use BERT embeddings') 40 | parser.set_defaults(word2vec=True) 41 | parser.add_argument( 42 | '--BERT_train_path', 43 | type=str, 44 | default='', 45 | help='BERT embeddings path of training data') 46 | parser.add_argument( 47 | '--BERT_val_path', 48 | type=str, 49 | default='', 50 | help='BERT embeddings path of validation data') 51 | parser.add_argument( 52 | '--pretrain_path', 53 | type=str, 54 | default='', 55 | help='pre train model path') 56 | parser.add_argument( 57 | '--checkpoint_dir', 58 | type=str, 59 | default='', 60 | help='checkpoint model folder') 61 | parser.add_argument('--num_thread_reader', type=int, default=1, 62 | help='') 63 | parser.add_argument('--embd_dim', type=int, default=2048, 64 | help='embedding dim') 65 | parser.add_argument('--lr', type=float, default=0.0001, 66 | help='initial learning rate') 67 | parser.add_argument('--epochs', type=int, default=20, 68 | help='upper epoch limit') 69 | parser.add_argument('--batch_size', type=int, default=256, 70 | help='batch size') 71 | parser.add_argument('--batch_size_val', type=int, default=3500, 72 | help='batch size eval') 73 | parser.add_argument('--lr_decay', type=float, default=0.9, 74 | help='Learning rate exp epoch decay') 75 | parser.add_argument('--n_display', type=int, default=10, 76 | help='Information display frequence') 77 | parser.add_argument('--feature_dim', type=int, default=4096, 78 | help='video feature dimension') 79 | parser.add_argument('--we_dim', type=int, default=300, 80 | help='word embedding dimension') 81 | parser.add_argument('--ocr_dim', type=int, default=2048, 82 | help='OCR text embedding dimension') 83 | parser.add_argument('--seed', type=int, default=1, 84 | help='random seed') 85 | parser.add_argument('--verbose', type=int, default=1, 86 | help='') 87 | parser.add_argument('--max_words', type=int, default=20, 88 | help='') 89 | parser.add_argument('--min_words', type=int, default=0, 90 | help='') 91 | parser.add_argument('--feature_framerate', type=int, default=1, 92 | help='') 93 | parser.add_argument('--min_time', type=float, default=5.0, 94 | help='Gather small clips') 95 | parser.add_argument('--margin', type=float, default=0.1, 96 | help='margin for loss') 97 | parser.add_argument('--hard_negative_rate', type=float, default=0.5, 98 | help='rate of intra negative sample') 99 | parser.add_argument('--negative_weighting', type=int, default=1, 100 | help='Weight the loss for intra negative') 101 | parser.add_argument('--n_pair', type=int, default=1, 102 | help='Num of 
pair to output from data loader') 103 | parser.add_argument('--avlectures', type=int, default=0, 104 | help='Train on AVLectures data') 105 | parser.add_argument('--eval_avlectures', type=int, default=0, 106 | help='Evaluate on AVLectures data') 107 | parser.add_argument('--sentence_dim', type=int, default=-1, 108 | help='sentence dimension') 109 | parser.add_argument('--save_every', type=int, default=1, 110 | help='intervals at which the checkpoints to be saved') 111 | parser.add_argument('--only_2d', type=int, default=0, 112 | help='1, if you want to use only 2D features for training and inference. 0 otherwise') 113 | parser.add_argument('--only_3d', type=int, default=0, 114 | help='1, if you want to use only 3D features for training and inference. 0 otherwise') 115 | parser.add_argument('--only_ocr', type=int, default=0, 116 | help='1, if you want to use only ocr features for training and inference. 0 otherwise') 117 | parser.add_argument('--ocr', type=int, default=0, 118 | help='1, if you want to use OCR features for training and inference. 0 otherwise') 119 | parser.add_argument( 120 | '--avlectures_train_path', 121 | type=str, 122 | default='data/avlectures_train.pkl', 123 | help='') 124 | parser.add_argument( 125 | '--avlectures_helper_path', 126 | type=str, 127 | default='data/avlectures_helper.pkl', 128 | help='') 129 | parser.add_argument( 130 | '--avlectures_val_path', 131 | type=str, 132 | default='data/avlectures_val.pkl', 133 | help='') 134 | args = parser.parse_args() 135 | return args 136 | -------------------------------------------------------------------------------- /code/lecture_aware_embds/eval.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import unicode_literals 4 | from __future__ import print_function 5 | 6 | import torch as th 7 | from torch.utils.data import DataLoader 8 | from args import get_args 9 | from model import Net 10 | from metrics import compute_metrics, print_computed_metrics 11 | from gensim.models.keyedvectors import KeyedVectors 12 | import pickle 13 | import glob 14 | from lsmdc_dataloader import LSMDC_DataLoader 15 | from msrvtt_dataloader import MSRVTT_DataLoader 16 | from youcook_dataloader import Youcook_DataLoader 17 | from avlectures_dataloader import AVLectures_DataLoader 18 | 19 | args = get_args() 20 | if args.verbose: 21 | print(args) 22 | 23 | assert args.pretrain_path != '', 'Need to specify pretrain_path argument' 24 | 25 | if args.word2vec: 26 | print('Loading word vectors: {}'.format(args.word2vec_path)) 27 | we = KeyedVectors.load_word2vec_format(args.word2vec_path, binary=True) 28 | print('done') 29 | else: 30 | we = None 31 | 32 | if args.eval_avlectures: 33 | dataset_avlectures = AVLectures_DataLoader( 34 | data=args.avlectures_val_path, 35 | helper_pkl = args.avlectures_helper_path, 36 | we=we, 37 | max_words=args.max_words, 38 | we_dim=args.we_dim, 39 | word2vec=args.word2vec, 40 | ocr=args.ocr, 41 | only_2d=args.only_2d, 42 | only_3d=args.only_3d 43 | ) 44 | dataloader_avlectures = DataLoader( 45 | dataset_avlectures, 46 | batch_size=args.batch_size_val, 47 | num_workers=args.num_thread_reader, 48 | shuffle=False, 49 | ) 50 | 51 | net = Net( 52 | video_dim=args.feature_dim, 53 | embd_dim=args.embd_dim, 54 | we_dim=args.we_dim, 55 | max_words=args.max_words, 56 | word2vec=args.word2vec, 57 | ocr=args.ocr, 58 | ocr_dim=args.ocr_dim, 59 | only_ocr=args.only_ocr 60 | ) 61 | net.eval() 62 | 
net.cuda() 63 | 64 | if args.verbose: 65 | print('Starting evaluation loop ...') 66 | 67 | def Eval_retrieval(model, eval_dataloader, dataset_name): 68 | model.eval() 69 | print('Evaluating Text-Video retrieval on {} data'.format(dataset_name)) 70 | with th.no_grad(): 71 | for i_batch, data in enumerate(eval_dataloader): 72 | text = data['text'].cuda() 73 | video = data['video'].cuda() 74 | vid = data['video_id'] 75 | ocr_embd = None 76 | if args.ocr: 77 | ocr_embd = data['ocr_embd'].cuda() 78 | m = model(video, text, ocr_embd) 79 | m = m.cpu().detach().numpy() 80 | metrics = compute_metrics(m) 81 | print_computed_metrics(metrics) 82 | 83 | all_checkpoints = glob.glob(args.pretrain_path) 84 | 85 | for c in all_checkpoints: 86 | print('Eval checkpoint: {}'.format(c)) 87 | print('Loading checkpoint: {}'.format(c)) 88 | net.load_checkpoint(c) 89 | if args.eval_avlectures: 90 | Eval_retrieval(net, dataloader_avlectures, 'AVLectures') 91 | -------------------------------------------------------------------------------- /code/lecture_aware_embds/extract_feats.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import unicode_literals 4 | from __future__ import print_function 5 | from operator import le 6 | 7 | import torch as th 8 | import numpy as np 9 | from torch.utils.data import DataLoader 10 | from args import get_args 11 | from model_ef import Net 12 | # from model import Net 13 | from metrics import compute_metrics, print_computed_metrics 14 | from gensim.models.keyedvectors import KeyedVectors 15 | import pickle 16 | import glob 17 | from lsmdc_dataloader import LSMDC_DataLoader 18 | from msrvtt_dataloader import MSRVTT_DataLoader 19 | from youcook_dataloader import Youcook_DataLoader 20 | from avlectures_dataloader import AVLectures_DataLoader 21 | 22 | 23 | args = get_args() 24 | if args.verbose: 25 | print(args) 26 | 27 | assert args.pretrain_path != '', 'Need to specify pretrain_path argument' 28 | 29 | if args.word2vec: 30 | print('Loading word vectors: {}'.format(args.word2vec_path)) 31 | we = KeyedVectors.load_word2vec_format(args.word2vec_path, binary=True) 32 | print('done') 33 | else: 34 | we = args.BERT_val_path 35 | 36 | 37 | if args.eval_avlectures: 38 | dataset_val = AVLectures_DataLoader( 39 | data=args.avlectures_val_path, 40 | helper_pkl = args.avlectures_helper_path, 41 | we=we, 42 | max_words=args.max_words, 43 | we_dim=args.we_dim, 44 | word2vec=args.word2vec, 45 | ocr=args.ocr, 46 | only_2d=args.only_2d, 47 | only_3d=args.only_3d 48 | ) 49 | dataloader_val = DataLoader( 50 | dataset_val, 51 | batch_size=args.batch_size_val, 52 | num_workers=args.num_thread_reader, 53 | shuffle=False, 54 | ) 55 | 56 | 57 | net = Net( 58 | video_dim=args.feature_dim, 59 | embd_dim=args.embd_dim, 60 | we_dim=args.we_dim, 61 | max_words=args.max_words, 62 | word2vec=args.word2vec, 63 | ocr=args.ocr, 64 | ocr_dim=args.ocr_dim, 65 | only_ocr=args.only_ocr 66 | ) 67 | net.eval() 68 | net.cuda() 69 | 70 | if args.verbose: 71 | print('Starting evaluation loop ...') 72 | 73 | pkl_data = {} 74 | 75 | def Eval_retrieval(model, eval_dataloader, dataset_name): 76 | model.eval() 77 | print('Evaluating Text-Video retrieval on {} data'.format(dataset_name)) 78 | with th.no_grad(): 79 | for i_batch, data in enumerate(eval_dataloader): 80 | text = data['text'].cuda() 81 | video = data['video'].cuda() 82 | ocr_embd = None 83 | if args.ocr: 84 | ocr_embd = 
data['ocr_embd'].cuda() 85 | vid = data['video_id'] 86 | course_name = data['course_name'][0] 87 | st = data['st'] 88 | et = data['et'] 89 | vid_duration = data['vid_duration'] 90 | st = st.item() 91 | et = et.item() 92 | vid_duration = vid_duration.item() 93 | m, vid_embd, text_embd = model(video, text, ocr_embd) 94 | 95 | # for trained embds 96 | text_embd = text_embd.cpu().detach().numpy() 97 | vid_embd = vid_embd.cpu().detach().numpy() 98 | m = m.cpu().detach().numpy() 99 | 100 | lecture_name = "-".join(vid[0].split('-')[:-1]) 101 | split_number = int(vid[0].split('-')[-1]) 102 | 103 | if course_name not in pkl_data: 104 | pkl_data[course_name] = {} 105 | 106 | if lecture_name not in pkl_data[course_name]: 107 | pkl_data[course_name][lecture_name] = {"vid_embd": [], "text_embd": [], "stet": [], "vid_duration": []} 108 | 109 | pkl_data[course_name][lecture_name]["vid_embd"].append((split_number, vid_embd)) 110 | pkl_data[course_name][lecture_name]["text_embd"].append((split_number, text_embd)) 111 | pkl_data[course_name][lecture_name]["stet"].append((split_number, st, et)) 112 | pkl_data[course_name][lecture_name]["vid_duration"].append((split_number, vid_duration)) 113 | 114 | all_checkpoints = glob.glob(args.pretrain_path) 115 | 116 | for c in all_checkpoints: 117 | print('Eval checkpoint: {}'.format(c)) 118 | print('Loading checkpoint: {}'.format(c)) 119 | net.load_checkpoint(c) 120 | if args.eval_avlectures: 121 | Eval_retrieval(net, dataloader_val, 'AVLectures') 122 | 123 | pkl_data_new = {} 124 | 125 | for c in pkl_data.keys(): 126 | 127 | if c not in pkl_data_new: 128 | pkl_data_new[c] = {} 129 | 130 | for k in pkl_data[c].keys(): 131 | 132 | vid_embd_sorted = pkl_data[c][k]['vid_embd'] 133 | vid_embd_sorted = sorted(vid_embd_sorted, key=lambda x: x[0]) 134 | 135 | text_embd_sorted = pkl_data[c][k]['text_embd'] 136 | text_embd_sorted = sorted(text_embd_sorted, key=lambda x: x[0]) 137 | 138 | vid_duration_sorted = pkl_data[c][k]['vid_duration'] 139 | vid_duration_sorted = sorted(vid_duration_sorted, key=lambda x: x[0]) 140 | 141 | stet_sorted = pkl_data[c][k]['stet'] 142 | stet_sorted = sorted(stet_sorted, key=lambda x: x[0]) 143 | 144 | vid_embd = vid_embd_sorted[0][1] 145 | text_embd = text_embd_sorted[0][1] 146 | vid_duration = [vid_duration_sorted[0][1]] 147 | stet = [(stet_sorted[0][0], stet_sorted[0][1], stet_sorted[0][2])] 148 | 149 | for i in range(1, len(vid_embd_sorted)): 150 | prev_vid_embd = vid_embd 151 | prev_text_embd = text_embd 152 | 153 | vid_embd = np.concatenate((prev_vid_embd, vid_embd_sorted[i][1])) 154 | text_embd = np.concatenate((prev_text_embd, text_embd_sorted[i][1])) 155 | 156 | vid_duration.append(vid_duration_sorted[i][1]) 157 | 158 | stet.append((stet_sorted[i][0], stet_sorted[i][1], stet_sorted[i][2])) 159 | 160 | pkl_data_new[c][k] = {"vid_embd": vid_embd, "text_embd": text_embd, "vid_duration": vid_duration, "stet": stet} 161 | 162 | # Path where you want to save the extracted features 163 | with open('/ssd_scratch/cvit/darshan_2/seg_embds/2d3dOCR_ss_test50ft50.pkl', 'wb') as handle: 164 | pickle.dump(pkl_data_new, handle, protocol=pickle.HIGHEST_PROTOCOL) 165 | 166 | print("DONE SUCCESSFULLY") -------------------------------------------------------------------------------- /code/lecture_aware_embds/loss.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import unicode_literals 4 | from __future__ import
print_function 5 | 6 | import torch.nn.functional as F 7 | import torch as th 8 | import numpy as np 9 | 10 | class MaxMarginRankingLoss(th.nn.Module): 11 | def __init__(self, 12 | margin=1.0, 13 | negative_weighting=False, 14 | batch_size=1, 15 | n_pair=1, 16 | hard_negative_rate=0.5, 17 | ): 18 | super(MaxMarginRankingLoss, self).__init__() 19 | self.margin = margin 20 | self.n_pair = n_pair 21 | self.batch_size = batch_size 22 | easy_negative_rate = 1 - hard_negative_rate 23 | self.easy_negative_rate = easy_negative_rate 24 | self.negative_weighting = negative_weighting 25 | if n_pair > 1: 26 | alpha = easy_negative_rate / ((batch_size - 1) * (1 - easy_negative_rate)) 27 | mm_mask = (1 - alpha) * np.eye(self.batch_size) + alpha 28 | mm_mask = np.kron(mm_mask, np.ones((n_pair, n_pair))) 29 | mm_mask = th.tensor(mm_mask) * (batch_size * (1 - easy_negative_rate)) 30 | self.mm_mask = mm_mask.float().cuda() 31 | 32 | 33 | def forward(self, x): 34 | d = th.diag(x) 35 | max_margin = F.relu(self.margin + x - d.view(-1, 1)) + \ 36 | F.relu(self.margin + x - d.view(1, -1)) 37 | if self.negative_weighting and self.n_pair > 1: 38 | max_margin = max_margin * self.mm_mask 39 | return max_margin.mean() 40 | -------------------------------------------------------------------------------- /code/lecture_aware_embds/loss_ce.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import unicode_literals 4 | from __future__ import print_function 5 | 6 | import torch.nn.functional as F 7 | import torch as th 8 | import numpy as np 9 | import torch.nn as nn 10 | 11 | class CE_loss(th.nn.Module): 12 | def __init__(self): 13 | super(CE_loss, self).__init__() 14 | 15 | def forward(self, S, margin=0.001): 16 | 17 | target = th.LongTensor(list(range(S.size(0)))).to(S.device) 18 | ce_loss = nn.CrossEntropyLoss() 19 | loss = ce_loss(S, target) 20 | return loss 21 | -------------------------------------------------------------------------------- /code/lecture_aware_embds/loss_milnce.py: -------------------------------------------------------------------------------- 1 | import torch as th 2 | 3 | 4 | class MILNCELoss(th.nn.Module): 5 | def __init__(self): 6 | super(MILNCELoss, self).__init__() 7 | 8 | def forward(self, video_embd, text_embd): 9 | x = th.matmul(video_embd, text_embd.t()) 10 | x = x.view(video_embd.shape[0], video_embd.shape[0], -1) 11 | nominator = x * th.eye(x.shape[0])[:,:,None].cuda() 12 | nominator = nominator.sum(dim=1) 13 | nominator = th.logsumexp(nominator, dim=1) 14 | denominator = th.cat((x, x.permute(1,0,2)), dim=1).view(x.shape[0], -1) 15 | denominator = th.logsumexp(denominator, dim=1) 16 | return th.mean(denominator - nominator) -------------------------------------------------------------------------------- /code/lecture_aware_embds/loss_mms.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import unicode_literals 4 | from __future__ import print_function 5 | 6 | import torch.nn.functional as F 7 | import torch as th 8 | import numpy as np 9 | 10 | class MMS_loss(th.nn.Module): 11 | def __init__(self): 12 | super(MMS_loss, self).__init__() 13 | 14 | def forward(self, S, margin=0.001): 15 | deltas = margin * th.eye(S.size(0)).to(S.device) 16 | S = S - deltas 17 | 18 | target = th.LongTensor(list(range(S.size(0)))).to(S.device) 19 
| I2C_loss = F.nll_loss(F.log_softmax(S, dim=1), target) 20 | C2I_loss = F.nll_loss(F.log_softmax(S.t(), dim=1), target) 21 | loss = I2C_loss + C2I_loss 22 | return loss -------------------------------------------------------------------------------- /code/lecture_aware_embds/metrics.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import unicode_literals 4 | from __future__ import print_function 5 | 6 | import numpy as np 7 | 8 | 9 | def compute_metrics(x): 10 | sx = np.sort(-x, axis=1) 11 | d = np.diag(-x) 12 | d = d[:, np.newaxis] 13 | ind = sx - d 14 | ind = np.where(ind == 0) 15 | ind = ind[1] 16 | metrics = {} 17 | metrics['R1'] = float(np.sum(ind == 0)) / len(ind) 18 | metrics['R5'] = float(np.sum(ind < 5)) / len(ind) 19 | metrics['R10'] = float(np.sum(ind < 10)) / len(ind) 20 | metrics['MR'] = np.median(ind) + 1 21 | return metrics 22 | 23 | 24 | def print_computed_metrics(metrics): 25 | r1 = metrics['R1'] 26 | r5 = metrics['R5'] 27 | r10 = metrics['R10'] 28 | mr = metrics['MR'] 29 | print('R@1: {:.4f} - R@5: {:.4f} - R@10: {:.4f} - Median R: {}'.format(r1, r5, r10, mr)) 30 | -------------------------------------------------------------------------------- /code/lecture_aware_embds/model.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import unicode_literals 4 | from __future__ import print_function 5 | 6 | import torch.nn as nn 7 | import torch as th 8 | import torch.nn.functional as F 9 | import re 10 | 11 | class Net(nn.Module): 12 | def __init__( 13 | self, 14 | embd_dim=1024, 15 | video_dim=2048, 16 | n_pair=1, 17 | we_dim=300, 18 | max_words=30, 19 | sentence_dim=-1, 20 | we=None, 21 | word2vec=True, 22 | ocr=0, 23 | ocr_dim=2048, 24 | only_ocr=0 25 | ): 26 | super(Net, self).__init__() 27 | self.ocr = ocr 28 | 29 | if sentence_dim <= 0: 30 | self.text_pooling = Sentence_Maxpool(we_dim, embd_dim, word2vec) 31 | if ocr: 32 | self.ocr_pooling = Sentence_Maxpool(we_dim, ocr_dim, word2vec) 33 | else: 34 | self.text_pooling = Sentence_Maxpool(we_dim, sentence_dim) 35 | if ocr: 36 | self.ocr_pooling = Sentence_Maxpool(we_dim, sentence_dim) 37 | 38 | self.GU_text = Gated_Embedding_Unit( 39 | self.text_pooling.out_dim, embd_dim, gating=True) 40 | 41 | if ocr: 42 | self.GU_ocr = Gated_Embedding_Unit( 43 | self.ocr_pooling.out_dim, ocr_dim, gating=True) 44 | 45 | self.GU_video = Gated_Embedding_Unit( 46 | video_dim, embd_dim, gating=True) 47 | 48 | self.n_pair = n_pair 49 | self.embd_dim = embd_dim 50 | self.we = we 51 | self.we_dim = we_dim 52 | self.word2vec = word2vec 53 | self.only_ocr = only_ocr 54 | 55 | 56 | def save_checkpoint(self, path): 57 | th.save(self.state_dict(), path) 58 | 59 | def load_checkpoint(self, path, cpu=False): 60 | if cpu: 61 | self.load_state_dict(th.load(path, 62 | map_location=lambda storage, loc: storage)) 63 | else: 64 | self.load_state_dict(th.load(path)) 65 | 66 | def forward(self, video, text, ocr): 67 | if ocr != None: 68 | if self.only_ocr: 69 | video = self.ocr_pooling(ocr) 70 | else: 71 | video = th.cat((video, self.ocr_pooling(ocr)), dim = 1) 72 | video = self.GU_video(video) 73 | text = self.GU_text(self.text_pooling(text)) 74 | return th.matmul(text, video.t()), video, text 75 | 76 | 77 | class Gated_Embedding_Unit(nn.Module): 78 | def __init__(self, input_dimension, 
output_dimension, gating=True): 79 | super(Gated_Embedding_Unit, self).__init__() 80 | self.fc = nn.Linear(input_dimension, output_dimension) 81 | self.cg = Context_Gating(output_dimension) 82 | self.gating = gating 83 | 84 | def forward(self, x): 85 | x = self.fc(x) 86 | if self.gating: 87 | x = self.cg(x) 88 | x = F.normalize(x) 89 | return x 90 | 91 | class Sentence_Maxpool(nn.Module): 92 | def __init__(self, word_dimension, output_dim, word2vec=True, relu=True): 93 | super(Sentence_Maxpool, self).__init__() 94 | self.fc = nn.Linear(word_dimension, output_dim) 95 | self.out_dim = output_dim 96 | self.word2vec = word2vec 97 | self.relu = relu 98 | 99 | def forward(self, x): 100 | x = self.fc(x) 101 | if self.relu: 102 | x = F.relu(x) 103 | 104 | if self.word2vec: 105 | return th.max(x, dim=1)[0] # if word2vec 106 | else: 107 | return x # if not word2vec 108 | 109 | class Context_Gating(nn.Module): 110 | def __init__(self, dimension, add_batch_norm=False): 111 | super(Context_Gating, self).__init__() 112 | self.fc = nn.Linear(dimension, dimension) 113 | self.add_batch_norm = add_batch_norm 114 | self.batch_norm = nn.BatchNorm1d(dimension) 115 | 116 | def forward(self, x): 117 | x1 = self.fc(x) 118 | if self.add_batch_norm: 119 | x1 = self.batch_norm(x1) 120 | x = th.cat((x, x1), 1) 121 | return F.glu(x, 1) 122 | -------------------------------------------------------------------------------- /code/lecture_aware_embds/model_ef.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import unicode_literals 4 | from __future__ import print_function 5 | 6 | import torch.nn as nn 7 | import torch as th 8 | import torch.nn.functional as F 9 | import re 10 | 11 | class Net(nn.Module): 12 | def __init__( 13 | self, 14 | embd_dim=1024, 15 | video_dim=2048, 16 | n_pair=1, 17 | we_dim=300, 18 | max_words=30, 19 | sentence_dim=-1, 20 | we=None, 21 | word2vec=True, 22 | ocr=0, 23 | ocr_dim=2048, 24 | only_ocr=0 25 | ): 26 | super(Net, self).__init__() 27 | self.ocr = ocr 28 | 29 | if sentence_dim <= 0: 30 | self.text_pooling = Sentence_Maxpool(we_dim, embd_dim, word2vec) 31 | if ocr: 32 | self.ocr_pooling = Sentence_Maxpool(we_dim, ocr_dim, word2vec) 33 | else: 34 | self.text_pooling = Sentence_Maxpool(we_dim, sentence_dim) 35 | if ocr: 36 | self.ocr_pooling = Sentence_Maxpool(we_dim, sentence_dim) 37 | 38 | self.GU_text = Gated_Embedding_Unit( 39 | self.text_pooling.out_dim, embd_dim, gating=True) 40 | 41 | if ocr: 42 | self.GU_ocr = Gated_Embedding_Unit( 43 | self.ocr_pooling.out_dim, ocr_dim, gating=True) 44 | 45 | self.GU_video = Gated_Embedding_Unit( 46 | video_dim, embd_dim, gating=True) 47 | self.n_pair = n_pair 48 | self.embd_dim = embd_dim 49 | self.we = we 50 | self.we_dim = we_dim 51 | self.word2vec = word2vec 52 | self.only_ocr = only_ocr 53 | 54 | 55 | def save_checkpoint(self, path): 56 | th.save(self.state_dict(), path) 57 | 58 | def load_checkpoint(self, path, cpu=False): 59 | if cpu: 60 | self.load_state_dict(th.load(path, 61 | map_location=lambda storage, loc: storage)) 62 | else: 63 | self.load_state_dict(th.load(path)) 64 | 65 | def forward(self, video, text, ocr): 66 | if ocr != None: 67 | if self.only_ocr: 68 | video = self.ocr_pooling(ocr) 69 | else: 70 | video = th.cat((video, self.ocr_pooling(ocr)), dim = 1) 71 | video = self.GU_video(video) 72 | text = self.GU_text(self.text_pooling(text)) 73 | return (th.matmul(text, video.t()), video, text) 74 | 75 
| 76 | 77 | class Gated_Embedding_Unit(nn.Module): 78 | def __init__(self, input_dimension, output_dimension, gating=True): 79 | super(Gated_Embedding_Unit, self).__init__() 80 | self.fc = nn.Linear(input_dimension, output_dimension) 81 | self.cg = Context_Gating(output_dimension) 82 | self.gating = gating 83 | 84 | def forward(self, x): 85 | x = self.fc(x) 86 | if self.gating: 87 | x = self.cg(x) 88 | x = F.normalize(x) 89 | return x 90 | 91 | class Sentence_Maxpool(nn.Module): 92 | def __init__(self, word_dimension, output_dim, word2vec=True, relu=True): 93 | super(Sentence_Maxpool, self).__init__() 94 | self.fc = nn.Linear(word_dimension, output_dim) 95 | self.out_dim = output_dim 96 | self.word2vec = word2vec 97 | self.relu = relu 98 | 99 | def forward(self, x): 100 | x = self.fc(x) 101 | if self.relu: 102 | x = F.relu(x) 103 | 104 | if self.word2vec: 105 | return th.max(x, dim=1)[0] # if word2vec 106 | else: 107 | return x # if not word2vec 108 | 109 | class Context_Gating(nn.Module): 110 | def __init__(self, dimension, add_batch_norm=False): 111 | super(Context_Gating, self).__init__() 112 | self.fc = nn.Linear(dimension, dimension) 113 | self.add_batch_norm = add_batch_norm 114 | self.batch_norm = nn.BatchNorm1d(dimension) 115 | 116 | def forward(self, x): 117 | x1 = self.fc(x) 118 | if self.add_batch_norm: 119 | x1 = self.batch_norm(x1) 120 | x = th.cat((x, x1), 1) 121 | return F.glu(x, 1) 122 | -------------------------------------------------------------------------------- /code/lecture_aware_embds/stop_words.py: -------------------------------------------------------------------------------- 1 | # This list of English stop words is taken from the "Glasgow Information 2 | # Retrieval Group". The original list can be found at 3 | # http://ir.dcs.gla.ac.uk/resources/linguistic_utils/stop_words 4 | ENGLISH_STOP_WORDS = frozenset([ 5 | "a", "about", "above", "across", "actually", "after", "afterwards", "again", "against", 6 | "all", "almost", "alone", "along", "already", "also", "although", "always", 7 | "am", "among", "amongst", "amoungst", "amount", "an", "and", "another", 8 | "any", "anyhow", "anyone", "anything", "anyway", "anywhere", "are", 9 | "around", "as", "at", "back", "be", "became", "because", "become", 10 | "becomes", "becoming", "been", "before", "beforehand", "behind", "being", 11 | "below", "beside", "besides", "between", "beyond", "bill", "both", 12 | "bottom", "but", "by", "call", "can", "cannot", "cant", "can't", "co", "con", 13 | "could", "couldnt", "cry", "de", "describe", "detail", "do", "done", "don't", 14 | "down", "due", "during", "each", "easy", "eg", "eight", "either", "eleven", "else", 15 | "elsewhere", "empty", "enough", "etc", "even", "ever", "every", "everyone", 16 | "everything", "everywhere", "except", "few", "fifteen", "fifty", 17 | "find", "fire", "first", "five", "for", "former", "formerly", "forty", 18 | "found", "four", "from", "further", "give", 19 | "had", "has", "hasnt", "have", "he", "hence", "her", "here", "hereafter", 20 | "hereby", "herein", "hereupon", "hers", "herself", "him", "himself", "his", 21 | "how", "however", "hundred", "i", "ie", "if", "i'm", "i'll", "i've", "in", "inc", "indeed", 22 | "interest", "is", "it", "it'll", "its", "it's", "itself", "just", "keep", "last", "latter", 23 | "latterly", "least", "less", "like", "ltd", "made", "many", "may", "me", 24 | "meanwhile", "might", "mill", "mine", "more", "moreover", "most", "mostly", 25 | "much", "must", "my", "myself", "name", "namely", "neither", 26 | "never", "nevertheless", 
"next", "nine", "no", "nobody", "none", "noone", 27 | "nor", "not", "nothing", "now", "nowhere", "of", "off", "often", "ok", "okay", "on", 28 | "once", "one", "only", "onto", "or", "other", "others", "otherwise", "our", 29 | "ours", "ourselves", "out", "over", "own", "part", "per", "perhaps", 30 | "please", "put", "rather", "re", "really", "same", "see", "seem", "seemed", 31 | "seeming", "seems", "serious", "several", "she", "should", "show", "side", 32 | "since", "sincere", "six", "sixty", "so", "some", "somehow", "someone", 33 | "something", "sometime", "sometimes", "somewhere", "still", "such", 34 | "take", "ten", "than", "thank", "thanks", "that", "that's", "the", "their", "them", 35 | "themselves", "then", "thence", "there", "thereafter", "thereby", 36 | "therefore", "therein", "thereupon", "these", "they", 37 | "third", "this", "those", "though", "three", "through", "throughout", 38 | "thru", "thus", "to", "together", "too", "top", "toward", "towards", 39 | "twelve", "twenty", "two", "un", "until", "up", "upon", "us", 40 | "very", "via", "view", "viewing", "viewer", "was", "we", "we'll", "well", "welcome", 41 | "were", "what", "whatever", "when", "whence", "whenever", "where", "whereafter", 42 | "whereas", "whereby", "wherein", "whereupon", "wherever", "whether", "which", "while", "whither", 43 | "who", "whoever", "whole", "whom", "whose", "why", "will", "with", 44 | "within", "without", "would", "wont", "won't", "yet", "you", "your", "yours", "you've", "you'll", "yourself", 45 | "yourselves", "youtube", "going", "want", "right", "you're", "we're", "know", "gonna", "need", "bit", 46 | "look", "yeah", "guys", "sure", "let's", "video", "oh", "let", "today","they're", "did", "looks", 47 | "different", "great" , "different", "say", "um", "probably", "kind", "doesn't", "does", "maybe", "hey", 48 | "we've", "better", "hope", "there's", "try"]) 49 | -------------------------------------------------------------------------------- /code/lecture_aware_embds/train.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import unicode_literals 4 | from __future__ import print_function 5 | 6 | import torch as th 7 | from torch.utils.data import DataLoader 8 | import numpy as np 9 | import torch.optim as optim 10 | from args import get_args 11 | import random 12 | import os 13 | from avlectures_dataloader import AVLectures_DataLoader 14 | from model import Net 15 | from metrics import compute_metrics, print_computed_metrics 16 | from loss import MaxMarginRankingLoss 17 | from loss_mms import MMS_loss 18 | from loss_ce import CE_loss 19 | from loss_milnce import MILNCELoss 20 | from gensim.models.keyedvectors import KeyedVectors 21 | import pickle 22 | 23 | 24 | args = get_args() 25 | if args.verbose: 26 | print(args) 27 | 28 | # predefining random initial seeds 29 | th.manual_seed(args.seed) 30 | np.random.seed(args.seed) 31 | random.seed(args.seed) 32 | 33 | if args.checkpoint_dir != '' and not(os.path.isdir(args.checkpoint_dir)): 34 | os.mkdir(args.checkpoint_dir) 35 | 36 | if not(args.avlectures): 37 | print('Loading captions: {}'.format(args.caption_path)) 38 | caption = pickle.load(open(args.caption_path, 'rb')) 39 | print('done') 40 | 41 | if args.word2vec: 42 | 43 | print('Loading word vectors: {}'.format(args.word2vec_path)) 44 | we = KeyedVectors.load_word2vec_format(args.word2vec_path, binary=True) 45 | 46 | if args.avlectures: 47 | we_train = we 48 | we_val = 
we 49 | 50 | print('done') 51 | 52 | else: 53 | we_train = None 54 | we_val = None 55 | 56 | if args.avlectures: 57 | dataset = AVLectures_DataLoader( 58 | data=args.avlectures_train_path, 59 | helper_pkl = args.avlectures_helper_path, 60 | we=we_train, 61 | max_words=args.max_words, 62 | we_dim=args.we_dim, 63 | word2vec=args.word2vec, 64 | ocr=args.ocr, 65 | n_pair=args.n_pair, 66 | only_2d=args.only_2d, 67 | only_3d=args.only_3d 68 | ) 69 | dataset_size = len(dataset) 70 | dataloader = DataLoader( 71 | dataset, 72 | batch_size=args.batch_size, 73 | num_workers=args.num_thread_reader, 74 | shuffle=True, 75 | batch_sampler=None, 76 | drop_last=True, 77 | ) 78 | if args.eval_avlectures: 79 | dataset_val = AVLectures_DataLoader( 80 | data=args.avlectures_val_path, 81 | helper_pkl = args.avlectures_helper_path, 82 | we=we_val, 83 | max_words=args.max_words, 84 | we_dim=args.we_dim, 85 | word2vec=args.word2vec, 86 | ocr=args.ocr, 87 | only_2d=args.only_2d, 88 | only_3d=args.only_3d 89 | ) 90 | dataloader_val = DataLoader( 91 | dataset_val, 92 | batch_size=args.batch_size_val, 93 | num_workers=args.num_thread_reader, 94 | shuffle=False, 95 | ) 96 | 97 | net = Net( 98 | video_dim=args.feature_dim, 99 | embd_dim=args.embd_dim, 100 | we_dim=args.we_dim, 101 | n_pair=args.n_pair, 102 | max_words=args.max_words, 103 | sentence_dim=args.sentence_dim, 104 | word2vec=args.word2vec, 105 | ocr=args.ocr, 106 | ocr_dim=args.ocr_dim, 107 | only_ocr=args.only_ocr 108 | ) 109 | net.train() 110 | # Optimizers + Loss 111 | 112 | loss_op = MaxMarginRankingLoss( 113 | margin=args.margin, 114 | negative_weighting=args.negative_weighting, 115 | batch_size=args.batch_size, 116 | n_pair=args.n_pair, 117 | hard_negative_rate=args.hard_negative_rate, 118 | ) 119 | 120 | # loss_op = MMS_loss() 121 | # loss_op = CE_loss() 122 | # loss_op = MILNCELoss() 123 | 124 | net.cuda() 125 | loss_op.cuda() 126 | 127 | 128 | if args.pretrain_path != '': 129 | net.load_checkpoint(args.pretrain_path) 130 | 131 | optimizer = optim.Adam(net.parameters(), lr=args.lr) 132 | 133 | if args.verbose: 134 | print('Starting training loop ...') 135 | 136 | def TrainOneBatch(model, opt, data, loss_fun): 137 | text = data['text'].cuda() 138 | video = data['video'].cuda() 139 | ocr_embd = None 140 | if args.ocr: 141 | ocr_embd = data['ocr_embd'].cuda() 142 | video = video.view(-1, video.shape[-1]) 143 | if args.word2vec: 144 | text = text.view(-1, text.shape[-2], text.shape[-1]) # original 145 | if args.ocr: 146 | ocr_embd = ocr_embd.view(-1, ocr_embd.shape[-2], ocr_embd.shape[-1]) 147 | else: 148 | if args.n_pair > 1: 149 | text = text.view(-1, text.shape[-2], text.shape[-1]) # original 150 | text = text.squeeze() 151 | if args.ocr: 152 | ocr_embd = ocr_embd.view(-1, ocr_embd.shape[-2], ocr_embd.shape[-1]) 153 | ocr_embd = ocr_embd.squeeze() 154 | opt.zero_grad() 155 | with th.set_grad_enabled(True): 156 | sim_matrix, v, t = model(video, text, ocr_embd) 157 | loss = loss_fun(sim_matrix) 158 | loss.backward() 159 | opt.step() 160 | return loss.item() 161 | 162 | def Eval_retrieval(model, eval_dataloader, dataset_name): 163 | model.eval() 164 | print('Evaluating Text-Video retrieval on {} data'.format(dataset_name)) 165 | with th.no_grad(): 166 | for i_batch, data in enumerate(eval_dataloader): 167 | text = data['text'].cuda() 168 | video = data['video'].cuda() 169 | ocr_embd = None 170 | if args.ocr: 171 | ocr_embd = data['ocr_embd'].cuda() 172 | m = model(video, text, ocr_embd) 173 | m = m.cpu().detach().numpy() 174 | metrics = compute_metrics(m) 
175 | print_computed_metrics(metrics) 176 | 177 | for epoch in range(args.epochs): 178 | running_loss = 0.0 179 | if args.eval_avlectures: 180 | Eval_retrieval(net, dataloader_val, 'AVLectures') 181 | if args.verbose: 182 | print('Epoch: %d' % epoch) 183 | for i_batch, sample_batch in enumerate(dataloader): 184 | batch_loss = TrainOneBatch(net, optimizer, sample_batch, loss_op) # orig 185 | running_loss += batch_loss 186 | if (i_batch + 1) % args.n_display == 0 and args.verbose: 187 | print('Epoch %d, Epoch status: %.4f, Training loss: %.4f' % 188 | (epoch + 1, args.batch_size * float(i_batch) / dataset_size, 189 | running_loss / args.n_display)) 190 | running_loss = 0.0 191 | for param_group in optimizer.param_groups: 192 | param_group['lr'] *= args.lr_decay 193 | if args.checkpoint_dir != '': 194 | if epoch + 1 == args.epochs or (epoch + 1) % args.save_every == 0: 195 | path = os.path.join(args.checkpoint_dir, 'e{}.pth'.format(epoch + 1)) 196 | net.save_checkpoint(path) 197 | 198 | if args.eval_avlectures: 199 | Eval_retrieval(net, dataloader_val, 'AVLectures') 200 | -------------------------------------------------------------------------------- /code/lecture_aware_embds/video_feature_extractor/README.md: -------------------------------------------------------------------------------- 1 | # Steps to extract features 2 | 3 | **Step 1:** 4 | 5 | Execute the `create_feature_csv.py` program. To specify the path of the Datasubset use the `--base_dir` optional argument. The default path of the DataSubset is `/ssd_scratch/cvit/AVLectures/DataSubset`. After executing this program the following will be created inside `base_dir`. 6 | 7 | a. *input_2d.csv* 8 | 9 | b. *input_3d.csv* 10 | 11 | c. Also empty directories called *features, features/2d/, features/3d/* will be created. 12 | 13 | **Step 2:** 14 | 15 | Once we have the 2d, 3d CSV files and empty directories to store 2d & 3d features, our next task is to extract the 2d and 3d features from the videos using the `extract.py` program. 16 | First extract the 2d features using the following command: 17 | ``` 18 | python extract.py --csv=input_2d.csv --type=2d --batch_size=64 --num_decoding_thread=4 19 | ``` 20 | Then download the 3D ResNext-101 model as follows (for 3d feature extraction): 21 | 22 | ``` 23 | mkdir model 24 | $ cd model 25 | $ wget https://www.rocq.inria.fr/cluster-willow/amiech/howto100m/models/resnext101.pth 26 | ``` 27 | Now extract the 3d features using the following command: 28 | ``` 29 | $ python extract.py --csv=input_3d.csv --type=3d --batch_size=64 --num_decoding_thread=4 30 | ``` 31 | 32 | **Step 3:** 33 | 34 | Now it is time to create the pickle file of our data. To do this execute the `create_pickle.py` program. To specify the path of the Datasubset use the `--base_dir` optional argument. The default path of the DataSubset is `/ssd_scratch/cvit/AVLectures/DataSubset`. After executing this program a pickle file called `avl.pkl` will be created inside the `base_dir`. 35 | 36 | # Fast and Easy to use video feature extractor 37 | 38 | This repo aims at providing an easy to use and efficient code for extracting 39 | video features using deep CNN (2D or 3D). 40 | 41 | It has been originally designed to extract video features for the large scale video dataset HowTo100M (https://www.di.ens.fr/willow/research/howto100m/) in an efficient manner. 42 | 43 | 44 | Most of the time, extracting CNN features from video is cumbersome. 
45 | In fact, this usually requires dumping video frames to disk, loading the dumped frames one 46 | by one, pre-processing them and using a CNN to extract features on chunks of video. 47 | This process is inefficient because dumping frames to disk is 48 | slow and can use a lot of inodes when working with a large dataset of videos. 49 | 50 | To avoid having to do that, this repo provides a simple Python script for the task: just provide a list of raw videos and the script will take care of on-the-fly video decoding (with ffmpeg) and feature extraction using state-of-the-art models. While being fast, it also happens to be very convenient. 51 | 52 | This script is also optimized for multi-process GPU feature extraction. 53 | 54 | 55 | # Requirements 56 | - Python 3 57 | - PyTorch (>= 1.0) 58 | - ffmpeg-python (https://github.com/kkroening/ffmpeg-python) 59 | 60 | # How To Use? 61 | 62 | First of all, you need to generate a csv containing the list of videos you 63 | want to process. For instance, if you have video1.mp4 and video2.webm to process, 64 | you will need to generate a csv of this form: 65 | 66 | ``` 67 | video_path,feature_path 68 | absolute_path_video1.mp4,absolute_path_of_video1_features.npy 69 | absolute_path_video2.webm,absolute_path_of_video2_features.npy 70 | ``` 71 | 72 | And then simply run: 73 | 74 | ```sh 75 | python extract.py --csv=input.csv --type=2d --batch_size=64 --num_decoding_thread=4 76 | ``` 77 | This command will extract 2D video features for video1.mp4 (resp. video2.webm) to path_of_video1_features.npy (resp. path_of_video2_features.npy) in 78 | the form of a numpy array. 79 | To get features from the 3D model instead, just change the --type argument from 2d to 3d. 80 | The parameter --num_decoding_thread sets how many parallel CPU threads are used for decoding the videos. 81 | 82 | Please note that the script is intended to be run on ONE single GPU only. 83 | If multiple GPUs are available, please make sure that only one free GPU is made visible 84 | to the script, for example via the CUDA_VISIBLE_DEVICES environment variable (see the example at the end of this README). 85 | 86 | # Can I use multiple GPUs to speed up feature extraction? 87 | 88 | Yes! Just run the same script with the same input csv on another GPU (which can be on a different machine, provided that the disk the features are written to is shared between the machines). The script will start a new feature extraction process that only processes the videos that have not been processed yet, without overlapping with the extraction process already running. 89 | 90 | # What models are implemented? 91 | So far, only one 2D and one 3D model can be used. 92 | 93 | - The 2D model is ResNet-152 from the PyTorch model zoo, pretrained on ImageNet. The 2D features are extracted at 1 feature per second at a resolution of 224. 94 | - The 3D model is a ResNexT-101 16-frame model (https://github.com/kenshohara/3D-ResNets-PyTorch) pretrained on Kinetics. The 3D features are extracted at 1.5 features per second at a resolution of 112. 95 | 96 | # Downloading pretrained models 97 | This will download the pretrained 3D ResNext-101 model we used from https://github.com/kenshohara/3D-ResNets-PyTorch: 98 | 99 | ```sh 100 | mkdir model 101 | cd model 102 | wget https://www.rocq.inria.fr/cluster-willow/amiech/howto100m/models/resnext101.pth 103 | ``` 104 | 105 | 106 | 107 | # Acknowledgements 108 | This code re-uses code from https://github.com/kenshohara/3D-ResNets-PyTorch 109 | for the 3D CNN.
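# Example: pinning the extraction to one GPU

A minimal sketch of the single-GPU / multi-GPU usage described above; the csv name and GPU indices are placeholders for whatever csv you generated and whichever GPUs are free.

```sh
# Terminal 1: make only GPU 0 visible to the script
CUDA_VISIBLE_DEVICES=0 python extract.py --csv=input_2d.csv --type=2d --batch_size=64 --num_decoding_thread=4

# Terminal 2 (optional): a second process on GPU 1 with the same csv;
# it only picks up videos the first process has not written yet.
CUDA_VISIBLE_DEVICES=1 python extract.py --csv=input_2d.csv --type=2d --batch_size=64 --num_decoding_thread=4
```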
110 | -------------------------------------------------------------------------------- /code/lecture_aware_embds/video_feature_extractor/create_feature_csv.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import os 3 | import argparse 4 | 5 | from pathlib import Path 6 | from os.path import join 7 | from glob import glob 8 | 9 | import cv2 10 | 11 | # The default path of DataSubset is '/ssd_scratch/cvit/AVLectures/DataSubset' (also referred to as base_dir) 12 | # If you want to change the default path, then you can do it using the optional '--base_dir' argument 13 | # After executing this code, two new CSV files would be created inside the base_dir 14 | # 1. input_2d.csv 2. input_3d.csv 15 | # Also empty directories called "features", "features/2d/", "features/3d/" will be created inside base_dir. 16 | 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument("-b", "--base_dir", type=str, required=False, help="Path of DataSubset", default = '/ssd_scratch/cvit/AVLectures/DataSubset') 19 | args = parser.parse_args() 20 | 21 | base_dir = args.base_dir 22 | print("Base Directory:") 23 | print(base_dir) 24 | 25 | delimiter = "@#@" 26 | 27 | # Create empty directories called "features/2d/" and "features/3d/" 28 | Path(join(base_dir, "features", "2d")).mkdir(parents=True, exist_ok=True) 29 | Path(join(base_dir, "features", "3d")).mkdir(parents=True, exist_ok=True) 30 | 31 | fields = ['video_path', 'feature_path'] 32 | 33 | filename_2d = join(base_dir, 'input_2d.csv') # for extracting 2d features 34 | filename_3d = join(base_dir, 'input_3d.csv') # for extracting 3d features 35 | 36 | rows = [] 37 | 38 | folder_list = [] 39 | 40 | for fl in glob(join(base_dir, '*')): 41 | if 'mit' in fl: 42 | folder_list.append(fl) 43 | 44 | folder_list.sort() 45 | print(folder_list) 46 | 47 | def check_vid(vid_path): 48 | try: 49 | cap = cv2.VideoCapture(vid_path) 50 | return cap.isOpened() 51 | except: 52 | return False 53 | 54 | with open(filename_2d, 'w') as csvfile_2d, open(filename_3d, 'w') as csvfile_3d: 55 | csvwriter_2d = csv.writer(csvfile_2d) 56 | csvwriter_2d.writerow(fields) 57 | 58 | csvwriter_3d = csv.writer(csvfile_3d) 59 | csvwriter_3d.writerow(fields) 60 | 61 | for folder in folder_list: 62 | rows_2d = [] 63 | rows_3d = [] 64 | count = 0 65 | print("Inside - ", folder) 66 | with open(join(folder, 'combined.txt'), 'r') as text_file: 67 | lines = text_file.readlines() 68 | 69 | for line in lines: 70 | vid_name = line.split(delimiter)[0] 71 | vid_path = join(folder, 'splits_vid', vid_name) 72 | # if not check_vid(vid_path): 73 | # print("This video {0} is not split properly".format(vid_name)) 74 | # continue 75 | feature_name = vid_name.replace('.mp4', '.npy') 76 | feature_path_2d = join(base_dir, 'features', '2d', feature_name) 77 | feature_path_3d = join(base_dir, 'features', '3d', feature_name) 78 | rows_2d.append([vid_path, feature_path_2d]) 79 | rows_3d.append([vid_path, feature_path_3d]) 80 | count += 1 81 | 82 | print("Count = ", count) 83 | csvwriter_2d.writerows(rows_2d) 84 | csvwriter_3d.writerows(rows_3d) -------------------------------------------------------------------------------- /code/lecture_aware_embds/video_feature_extractor/create_feature_csv_indi.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import os 3 | import argparse 4 | 5 | from pathlib import Path 6 | from os.path import join 7 | from glob import glob 8 | 9 | # The default path of DataSubset is 
'/ssd_scratch/cvit/AVLectures/DataSubset' (also referred to as base_dir) 10 | # If you want to change the default path, then you can do it using the optional '--base_dir' argument 11 | # After executing this code, two new CSV files would be created inside the base_dir 12 | # 1. input_2d.csv 2. input_3d.csv 13 | # Also empty directories called "features", "features/2d/", "features/3d/" will be created inside base_dir. 14 | 15 | parser = argparse.ArgumentParser() 16 | parser.add_argument("-b", "--base_dir", type=str, required=False, help="Path of DataSubset", default = '/ssd_scratch/cvit/AVLectures/DataSubset') 17 | args = parser.parse_args() 18 | 19 | base_dir = args.base_dir 20 | print("Base Directory:") 21 | print(base_dir) 22 | 23 | # Create empty directories called "features/2d/" and "features/3d/" 24 | Path(join(base_dir, "features_m011", "2d")).mkdir(parents=True, exist_ok=True) 25 | Path(join(base_dir, "features_m011", "3d")).mkdir(parents=True, exist_ok=True) 26 | 27 | fields = ['video_path', 'feature_path'] 28 | 29 | filename_2d = join(base_dir, 'input_2d_m011.csv') # for extracting 2d features 30 | filename_3d = join(base_dir, 'input_3d_m011.csv') # for extracting 3d features 31 | 32 | rows = [] 33 | 34 | folder_list = [] 35 | 36 | for fl in glob(join(base_dir, '*')): 37 | if ('mit011'in fl): 38 | folder_list.append(fl) 39 | 40 | folder_list.sort() 41 | print(folder_list) 42 | 43 | with open(filename_2d, 'w') as csvfile_2d, open(filename_3d, 'w') as csvfile_3d: 44 | csvwriter_2d = csv.writer(csvfile_2d) 45 | csvwriter_2d.writerow(fields) 46 | 47 | csvwriter_3d = csv.writer(csvfile_3d) 48 | csvwriter_3d.writerow(fields) 49 | 50 | for folder in folder_list: 51 | rows_2d = [] 52 | rows_3d = [] 53 | count = 0 54 | print("Inside - ", folder) 55 | with open(join(folder, 'combined.txt'), 'r') as text_file: 56 | lines = text_file.readlines() 57 | 58 | for line in lines: 59 | count += 1 60 | vid_name = line.split('|')[0] 61 | vid_path = join(folder, 'splits_vid', vid_name) 62 | feature_name = vid_name.replace('.mp4', '.npy') 63 | feature_path_2d = join(base_dir, 'features_m011', '2d', feature_name) 64 | feature_path_3d = join(base_dir, 'features_m011', '3d', feature_name) 65 | rows_2d.append([vid_path, feature_path_2d]) 66 | rows_3d.append([vid_path, feature_path_3d]) 67 | 68 | print("Count = ", count) 69 | csvwriter_2d.writerows(rows_2d) 70 | csvwriter_3d.writerows(rows_3d) 71 | -------------------------------------------------------------------------------- /code/lecture_aware_embds/video_feature_extractor/create_feature_csv_seg.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import os 3 | import argparse 4 | 5 | from pathlib import Path 6 | from os.path import join 7 | from glob import glob 8 | 9 | # The default path of DataSubset is '/ssd_scratch/cvit/AVLectures/DataSubset' (also referred to as base_dir) 10 | # If you want to change the default path, then you can do it using the optional '--base_dir' argument 11 | # After executing this code, two new CSV files would be created inside the base_dir 12 | # 1. input_2d.csv 2. input_3d.csv 13 | # Also empty directories called "features", "features/2d/", "features/3d/" will be created inside base_dir. 
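# Expected combined.txt line format (assumed from the parsing below; only the first
# '|'-separated field is used by this script, any remaining fields are ignored here):
#   <segment_video_name>.mp4|<subtitle text>|...
# e.g. (hypothetical): MIT6_042JF10_lec17_300k-00012.mp4|So an induction proof works like this...|...
# The .mp4 name is mapped to matching .npy feature paths under features/2d and features/3d.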
14 | 15 | parser = argparse.ArgumentParser() 16 | parser.add_argument("-b", "--base_dir", type=str, required=False, help="Path of DataSubset", default = '/ssd_scratch/cvit/AVLectures/DataSubset') 17 | args = parser.parse_args() 18 | 19 | base_dir = args.base_dir 20 | print("Base Directory:") 21 | print(base_dir) 22 | 23 | # Create empty directories called "features/2d/" and "features/3d/" 24 | Path(join(base_dir, "features", "2d")).mkdir(parents=True, exist_ok=True) 25 | Path(join(base_dir, "features", "3d")).mkdir(parents=True, exist_ok=True) 26 | 27 | fields = ['video_path', 'feature_path'] 28 | 29 | filename_2d = join(base_dir, 'input_2d.csv') # for extracting 2d features 30 | filename_3d = join(base_dir, 'input_3d.csv') # for extracting 3d features 31 | 32 | rows = [] 33 | 34 | folder_list = [] 35 | 36 | for fl in glob(join(base_dir, '*')): 37 | if 'mit' in fl: 38 | folder_list.append(fl) 39 | 40 | folder_list.sort() 41 | print(folder_list) 42 | 43 | with open(filename_2d, 'w') as csvfile_2d, open(filename_3d, 'w') as csvfile_3d: 44 | csvwriter_2d = csv.writer(csvfile_2d) 45 | csvwriter_2d.writerow(fields) 46 | 47 | csvwriter_3d = csv.writer(csvfile_3d) 48 | csvwriter_3d.writerow(fields) 49 | 50 | for folder in folder_list: 51 | rows_2d = [] 52 | rows_3d = [] 53 | count = 0 54 | print("Inside - ", folder) 55 | with open(join(folder, 'combined.txt'), 'r') as text_file: 56 | lines = text_file.readlines() 57 | 58 | for line in lines: 59 | count += 1 60 | vid_name = line.split('|')[0] 61 | vid_path = join(folder, vid_name) 62 | feature_name = vid_name.replace('.mp4', '.npy') 63 | feature_path_2d = join(base_dir, 'features', '2d', feature_name) 64 | feature_path_3d = join(base_dir, 'features', '3d', feature_name) 65 | rows_2d.append([vid_path, feature_path_2d]) 66 | rows_3d.append([vid_path, feature_path_3d]) 67 | 68 | print("Count = ", count) 69 | csvwriter_2d.writerows(rows_2d) 70 | csvwriter_3d.writerows(rows_3d) -------------------------------------------------------------------------------- /code/lecture_aware_embds/video_feature_extractor/create_pickle.py: -------------------------------------------------------------------------------- 1 | import pickle as pkl 2 | import os.path 3 | import os 4 | import argparse 5 | 6 | from os.path import join, isfile 7 | from glob import glob 8 | 9 | import numpy as np 10 | 11 | # The default path of DataSubset is '/ssd_scratch/cvit/AVLectures/DataSubset' (also referred to as base_dir) 12 | # base_dir should also contain a directory inside it with the name "features" which inturn contains two directories called "2d" and "3d" which contains 2d and 3d video features respectively. 13 | # If you want to change the default path, then you can do it using the optional '--base_dir' argument 14 | # After executing this code, a pickle file called 'avl.pkl' would be created inside the base_dir. 
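# Example invocation (the path is illustrative; any base_dir with the layout described above works):
#   python create_pickle.py --base_dir /ssd_scratch/cvit/AVLectures/DataSubset
# Note that, as written below, the output pickle is saved as 'm1_m2_10s15s_2d3d.pkl'.
# A quick way to sanity-check the result, which is a list of per-clip dicts with keys
# '2d', '3d', 'caption' and 'id':
#   import pickle
#   data = pickle.load(open('/ssd_scratch/cvit/AVLectures/DataSubset/m1_m2_10s15s_2d3d.pkl', 'rb'))
#   print(len(data), data[0]['id'], data[0]['2d'].shape)  # '2d'/'3d' are max-pooled 2048-d vectors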
15 | 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument("-b", "--base_dir", type=str, required=False, help="Path of DataSubset", default = '/ssd_scratch/cvit/AVLectures/DataSubset') 18 | args = parser.parse_args() 19 | 20 | base_dir = args.base_dir 21 | print("Base Directory:") 22 | print(base_dir) 23 | 24 | data = [] 25 | 26 | folder_list = [] 27 | 28 | for fl in glob(join(base_dir, '*')): 29 | if 'mit' in fl: 30 | folder_list.append(fl) 31 | 32 | folder_list.sort() 33 | print(folder_list) 34 | 35 | count_match = 0 36 | total_count = 0 37 | 38 | for folder in folder_list: 39 | count = 0 40 | 41 | print("Inside - ", folder) 42 | 43 | with open(join(folder, 'combined.txt'), 'r') as text_file: 44 | 45 | lines = text_file.readlines() 46 | 47 | for line in lines: 48 | count += 1 49 | features = {} 50 | vid_name = line.split('|')[0] 51 | subtitle = line.split('|')[1] 52 | subtitle = " ".join(subtitle.split()) 53 | features_name = vid_name.replace('.mp4', '.npy') 54 | 55 | if isfile(join(base_dir, 'features', '2d', features_name)) and isfile(join(base_dir, 'features', '3d', features_name)): 56 | 57 | two_d = np.load(join(base_dir, 'features', '2d', features_name)) 58 | three_d = np.load(join(base_dir, 'features', '3d', features_name)) 59 | 60 | # if two_d.shape == (0, 2048) or three_d.shape == (0, 2048) or len(subtitle) == 0: 61 | # print("True") 62 | # continue 63 | 64 | if two_d.shape == (0, 2048) or three_d.shape == (0, 2048): 65 | print("True") 66 | continue 67 | 68 | # two_d = two_d.mean(axis = 0) 69 | # three_d = three_d.mean(axis = 0) 70 | 71 | two_d = two_d.max(axis = 0) 72 | three_d = three_d.max(axis = 0) 73 | 74 | features['2d'] = two_d 75 | features['3d'] = three_d 76 | features['caption'] = subtitle 77 | features['id'] = vid_name.replace('.mp4', '') 78 | 79 | data.append(features) 80 | count_match += 1 81 | 82 | print("Count = ", count) 83 | total_count += count 84 | 85 | print("Count match = ", count_match) 86 | print("Total count = ", total_count) 87 | 88 | with open(join(base_dir, 'm1_m2_10s15s_2d3d.pkl'), 'wb') as handle: 89 | pkl.dump(data, handle, protocol=pkl.HIGHEST_PROTOCOL) 90 | -------------------------------------------------------------------------------- /code/lecture_aware_embds/video_feature_extractor/create_pickle_indi.py: -------------------------------------------------------------------------------- 1 | import pickle as pkl 2 | import os.path 3 | import os 4 | import argparse 5 | 6 | from os.path import join, isfile 7 | from glob import glob 8 | 9 | import numpy as np 10 | 11 | # The default path of DataSubset is '/ssd_scratch/cvit/AVLectures/DataSubset' (also referred to as base_dir) 12 | # base_dir should also contain a directory inside it with the name "features" which inturn contains two directories called "2d" and "3d" which contains 2d and 3d video features respectively. 13 | # If you want to change the default path, then you can do it using the optional '--base_dir' argument 14 | # After executing this code, a pickle file called 'avl.pkl' would be created inside the base_dir. 
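# Feature lookup order used below: each segment's 2d/3d .npy features are first looked up
# under 'features_m011/{2d,3d}' and, if not found there, under 'features/{2d,3d}';
# segments missing from both locations are skipped.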
15 | 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument("-b", "--base_dir", type=str, required=False, help="Path of DataSubset", default = '/ssd_scratch/cvit/AVLectures/DataSubset') 18 | args = parser.parse_args() 19 | 20 | base_dir = args.base_dir 21 | print("Base Directory:") 22 | print(base_dir) 23 | 24 | data = [] 25 | 26 | folder_list = [] 27 | 28 | for fl in glob(join(base_dir, '*')): 29 | # if ('mit011' in fl) or ('mit012' in fl): 30 | # continue 31 | # if 'mit' in fl: 32 | # folder_list.append(fl) 33 | 34 | if ('mit006' in fl) or ('mit011' in fl): 35 | folder_list.append(fl) 36 | 37 | folder_list.sort() 38 | print(folder_list) 39 | 40 | count_match = 0 41 | total_count = 0 42 | 43 | for folder in folder_list: 44 | count = 0 45 | 46 | print("Inside - ", folder) 47 | 48 | with open(join(folder, 'combined.txt'), 'r') as text_file: 49 | 50 | lines = text_file.readlines() 51 | 52 | for line in lines: 53 | count += 1 54 | features = {} 55 | vid_name = line.split('|')[0] 56 | subtitle = line.split('|')[1] 57 | subtitle = " ".join(subtitle.split()) 58 | 59 | features_name = vid_name.replace('.mp4', '.npy') 60 | 61 | if isfile(join(base_dir, 'features_m011', '2d', features_name)) and isfile(join(base_dir, 'features_m011', '3d', features_name)): 62 | 63 | two_d = np.load(join(base_dir, 'features_m011', '2d', features_name)) 64 | three_d = np.load(join(base_dir, 'features_m011', '3d', features_name)) 65 | 66 | if two_d.shape == (0, 2048) or three_d.shape == (0, 2048) or len(subtitle) == 0: 67 | print("True") 68 | continue 69 | 70 | # two_d = two_d.mean(axis = 0) 71 | # three_d = three_d.mean(axis = 0) 72 | 73 | two_d = two_d.max(axis = 0) 74 | three_d = three_d.max(axis = 0) 75 | 76 | features['2d'] = two_d 77 | features['3d'] = three_d 78 | features['caption'] = subtitle 79 | features['id'] = vid_name.replace('.mp4', '') 80 | 81 | data.append(features) 82 | count_match += 1 83 | 84 | elif isfile(join(base_dir, 'features', '2d', features_name)) and isfile(join(base_dir, 'features', '3d', features_name)): 85 | 86 | two_d = np.load(join(base_dir, 'features', '2d', features_name)) 87 | three_d = np.load(join(base_dir, 'features', '3d', features_name)) 88 | 89 | if two_d.shape == (0, 2048) or three_d.shape == (0, 2048) or len(subtitle) == 0: 90 | print("True") 91 | continue 92 | 93 | # two_d = two_d.mean(axis = 0) 94 | # three_d = three_d.mean(axis = 0) 95 | 96 | two_d = two_d.max(axis = 0) 97 | three_d = three_d.max(axis = 0) 98 | 99 | features['2d'] = two_d 100 | features['3d'] = three_d 101 | features['caption'] = subtitle 102 | features['id'] = vid_name.replace('.mp4', '') 103 | 104 | data.append(features) 105 | count_match += 1 106 | 107 | print("Count = ", count) 108 | total_count += count 109 | 110 | print("Count match = ", count_match) 111 | print("Total count = ", total_count) 112 | 113 | with open(join(base_dir, 'm6m11_pysd.pkl'), 'wb') as handle: 114 | pkl.dump(data, handle, protocol=pkl.HIGHEST_PROTOCOL) 115 | -------------------------------------------------------------------------------- /code/lecture_aware_embds/video_feature_extractor/create_pickle_ocr.py: -------------------------------------------------------------------------------- 1 | import pickle as pkl 2 | import os.path 3 | import os 4 | import argparse 5 | import json 6 | 7 | from os.path import join, isfile 8 | from glob import glob 9 | 10 | import numpy as np 11 | import subprocess 12 | 13 | # The default path of DataSubset is '/ssd_scratch/cvit/AVLectures/DataSubset' (also referred to as base_dir) 14 | # 
base_dir should also contain a directory inside it with the name "features" which inturn contains two directories called "2d" and "3d" which contains 2d and 3d video features respectively. 15 | # If you want to change the default path, then you can do it using the optional '--base_dir' argument 16 | # After executing this code, a pickle file called 'avl.pkl' would be created inside the base_dir. 17 | 18 | parser = argparse.ArgumentParser() 19 | parser.add_argument("-b", "--base_dir", type=str, required=False, help="Path of DataSubset", default = '/ssd_scratch/cvit/AVLectures/DataSubset') 20 | args = parser.parse_args() 21 | 22 | base_dir = args.base_dir 23 | print("Base Directory:") 24 | print(base_dir) 25 | 26 | delimiter = "@#@" 27 | 28 | ocr_dir = '/ssd_scratch/cvit/darshan/OCR_dataset_MITOCW_v1' 29 | 30 | seg_stats_dir = '/home2/darshan.singh/Segmentation/stats' 31 | 32 | def get_length(filename): 33 | result = subprocess.run(["ffprobe", "-v", "error", "-show_entries", 34 | "format=duration", "-of", 35 | "default=noprint_wrappers=1:nokey=1", filename], 36 | stdout=subprocess.PIPE, 37 | stderr=subprocess.STDOUT) 38 | return float(result.stdout) 39 | 40 | def toFFMPEGtime(t): 41 | ss, ms = divmod(t*1000, 1000) 42 | mm, ss = divmod(ss, 60) 43 | hh, mm = divmod(mm, 60) 44 | 45 | return "{:02d}:{:02d}:{:02d},{:03d}".format(int(hh), int(mm), int(ss), int(ms)) 46 | 47 | def getTime(t): 48 | h, m, sms = t.split(":") 49 | if ',' in sms: # Example t = '00:00:03,980' 50 | s, ms = sms.split(",") 51 | elif '.' in sms: # Example t = '00:00:03.980' 52 | s, ms = sms.split(".") 53 | else: # Example t = '00:00:03' 54 | s = sms 55 | ms = 0 56 | tm = 3600 * int(h) + 60 * int(m) + int(s) + int(ms)/1000 57 | return tm 58 | 59 | def getOCR(course_name, vid_name, st, et): 60 | lec_name = "-".join(vid_name.split('-')[:-1]) 61 | split_num = int(vid_name.split('-')[-1].replace('.mp4', '')) 62 | 63 | # json_data = glob(join(ocr_dir, course_name, lec_name, '*.json'))[0] 64 | # json_data = open(json_data, 'r') 65 | # json_data = json.load(json_data) 66 | # fps = round(json_data['frame_metadata']['True FPS'], 2) 67 | 68 | ocr_text = "" 69 | ocr_frame_num = None 70 | ocr_frame_ts = None 71 | ocr_lec_name = lec_name 72 | 73 | for fl in glob(join(ocr_dir, course_name, lec_name, '*.json')): 74 | json_data_fl_ref = open(fl, 'r') 75 | json_data_fl = json.load(json_data_fl_ref) 76 | json_data_fl_ref.close() 77 | fps = json_data_fl['frame_metadata']['True FPS'] 78 | frame_num = json_data_fl['frame_metadata']['Frame number'] 79 | 80 | if frame_num == 1: 81 | continue 82 | 83 | frame_ts = round(frame_num / fps) 84 | 85 | if st <= frame_ts and frame_ts <= et: 86 | if 'fullTextAnnotation' in json_data_fl: 87 | ocr_text = json_data_fl['fullTextAnnotation']['text'] 88 | ocr_frame_num = frame_num 89 | ocr_frame_ts = frame_ts 90 | 91 | return ocr_text, ocr_frame_num, ocr_frame_ts, ocr_lec_name 92 | 93 | data = [] 94 | 95 | folder_list = [] 96 | 97 | for fl in glob(join(base_dir, '*')): 98 | if 'mit' in fl: 99 | folder_list.append(fl) 100 | 101 | folder_list.sort() 102 | print(folder_list) 103 | 104 | count_match = 0 105 | total_count = 0 106 | 107 | for folder in folder_list: 108 | count = 0 109 | course_name = folder.split('/')[-1] 110 | 111 | print("Inside - ", course_name) 112 | 113 | with open(join(folder, 'combined.txt'), 'r') as text_file: 114 | 115 | lines = text_file.readlines() 116 | 117 | for line in lines: 118 | count += 1 119 | features = {} 120 | vid_name = line.split(delimiter)[0] 121 | subtitle = 
line.split(delimiter)[1] 122 | subtitle = " ".join(subtitle.split()) 123 | st = getTime(line.split(delimiter)[2]) 124 | et = getTime(line.split(delimiter)[3]) 125 | features_name = vid_name.replace('.mp4', '.npy') 126 | 127 | if isfile(join(base_dir, 'features', '2d', features_name)) and isfile(join(base_dir, 'features', '3d', features_name)): 128 | 129 | two_d = np.load(join(base_dir, 'features', '2d', features_name)) 130 | three_d = np.load(join(base_dir, 'features', '3d', features_name)) 131 | 132 | # if two_d.shape == (0, 2048) or three_d.shape == (0, 2048) or len(subtitle) == 0: 133 | # print("True") 134 | # continue 135 | 136 | if two_d.shape == (0, 2048) or three_d.shape == (0, 2048): 137 | print("True") 138 | continue 139 | 140 | # two_d = two_d.mean(axis = 0) 141 | # three_d = three_d.mean(axis = 0) 142 | 143 | two_d = two_d.max(axis = 0) 144 | three_d = three_d.max(axis = 0) 145 | 146 | features['2d'] = two_d 147 | features['3d'] = three_d 148 | features['caption'] = subtitle 149 | features['id'] = vid_name.replace('.mp4', '') 150 | features['vid_duration_ffprobe'] = get_length(join(folder, 'splits_vid', vid_name)) 151 | features['vid_duration'] = et - st 152 | features['st'] = st 153 | features['et'] = et 154 | 155 | # retrieving OCR data 156 | lec_name = "-".join(vid_name.split('-')[:-1]) 157 | lec_num = int(lec_name.split('-')[-1].replace('_300k', '').replace('lec', '')) 158 | 159 | seg_stats = pkl.load(open(join(seg_stats_dir, course_name + '.pkl'), 'rb')) 160 | 161 | if lec_num in seg_stats: 162 | offset = int(seg_stats[lec_num]['st']) 163 | else: 164 | offset = 0 165 | 166 | ocr_data = getOCR(course_name, vid_name, float(st) + offset, float(et) + offset) 167 | 168 | features['ocr_text'] = " ".join(ocr_data[0].split()) 169 | features['ocr_frame_num'] = ocr_data[1] 170 | features['ocr_frame_ts'] = ocr_data[2] 171 | features['ocr_lec_name'] = ocr_data[3] 172 | features['offset'] = offset 173 | 174 | data.append(features) 175 | count_match += 1 176 | 177 | print("Count = ", count) 178 | total_count += count 179 | 180 | print("Count match = ", count_match) 181 | print("Total count = ", total_count) 182 | 183 | with open(join(base_dir, 'm1_m2_20s25s_2d3d.pkl'), 'wb') as handle: 184 | pkl.dump(data, handle, protocol=pkl.HIGHEST_PROTOCOL) 185 | -------------------------------------------------------------------------------- /code/lecture_aware_embds/video_feature_extractor/create_pickle_prevnext.py: -------------------------------------------------------------------------------- 1 | import pickle as pkl 2 | import os.path 3 | import os 4 | import argparse 5 | 6 | from os.path import join, isfile 7 | from glob import glob 8 | 9 | import numpy as np 10 | 11 | # The default path of DataSubset is '/ssd_scratch/cvit/AVLectures/DataSubset' (also referred to as base_dir) 12 | # base_dir should also contain a directory inside it with the name "features" which inturn contains two directories called "2d" and "3d" which contains 2d and 3d video features respectively. 13 | # If you want to change the default path, then you can do it using the optional '--base_dir' argument 14 | # After executing this code, a pickle file called 'avl.pkl' would be created inside the base_dir. 
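# How the caption is assembled below (illustration; the file name is the hypothetical one also
# used in the comments further down): for split k of a lecture, the stored 'caption' is
#   subtitle(k-1) + " " + subtitle(k) + " " + subtitle(k+1)
# where the previous/next subtitles are included only if they belong to the same lecture and
# have consecutive split ids. E.g. for MIT6_042JF10_lec17_300k-00005.mp4 the caption combines
# splits 00004, 00005 and 00006, while the 2d/3d features remain those of split 00005 alone.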
15 | 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument("-b", "--base_dir", type=str, required=False, help="Path of DataSubset", default = '/ssd_scratch/cvit/AVLectures/DataSubset') 18 | args = parser.parse_args() 19 | 20 | base_dir = args.base_dir 21 | print("Base Directory:") 22 | print(base_dir) 23 | 24 | delimiter = "@#@" 25 | 26 | data = [] 27 | 28 | folder_list = [] 29 | 30 | def getTime(t): 31 | h, m, sms = t.split(":") 32 | if ',' in sms: # Example t = '00:00:03,980' 33 | s, ms = sms.split(",") 34 | elif '.' in sms: # Example t = '00:00:03.980' 35 | s, ms = sms.split(".") 36 | else: # Example t = '00:00:03' 37 | s = sms 38 | ms = '0' 39 | tm = 3600 * int(h) + 60 * int(m) + int(s) + int(ms.ljust(3, '0'))/1000 40 | return tm 41 | 42 | for fl in glob(join(base_dir, '*')): 43 | # if ('mit011' in fl) or ('mit012' in fl): 44 | # continue 45 | if 'mit' in fl: 46 | folder_list.append(fl) 47 | 48 | folder_list.sort() 49 | print(folder_list) 50 | 51 | count_match = 0 52 | total_count = 0 53 | 54 | for folder in folder_list: 55 | count = 0 56 | 57 | course_name = folder.split('/')[-1] 58 | print("Inside course -", course_name) 59 | 60 | # for glob(join(folder, 'subtitles')) 61 | 62 | with open(join(folder, 'combined.txt'), 'r') as text_file: 63 | 64 | lines = text_file.readlines() 65 | lines = sorted(lines, key = lambda line: (line.split(delimiter))[0]) 66 | 67 | for i in range(len(lines)): 68 | l = lines[i].strip() 69 | count += 1 70 | features = {} 71 | 72 | present_vid = lines[i].split(delimiter)[0] # example : "MIT6_042JF10_lec17_300k-00000.mp4" 73 | present_vid_name = '-'.join((present_vid.split('-'))[:-1]) # example : "MIT6_042JF10_lec17_300k" 74 | present_vid_id = (present_vid.split('-'))[-1] # example : "00000.mp4" 75 | present_vid_id = int(present_vid_id.replace('.mp4', '')) # example : 0 76 | present_vid_subtitle = lines[i].split(delimiter)[1] 77 | present_vid_subtitle = " ".join(present_vid_subtitle.split()) 78 | 79 | prev_vid = "" 80 | next_vid = "" 81 | 82 | subtitle = "" 83 | 84 | if i > 0: 85 | prev_vid = lines[i - 1].split(delimiter)[0] 86 | if i < len(lines) - 1: 87 | next_vid = lines[i + 1].split(delimiter)[0] 88 | 89 | 90 | if prev_vid != '': 91 | 92 | prev_vid_name = '-'.join((prev_vid.split('-'))[:-1]) 93 | prev_vid_id = (prev_vid.split('-'))[-1] 94 | prev_vid_id = int(prev_vid_id.replace('.mp4', '')) 95 | prev_vid_subtitle = lines[i - 1].split(delimiter)[1] 96 | prev_vid_subtitle = " ".join(prev_vid_subtitle.split()) 97 | 98 | if present_vid_name == prev_vid_name and prev_vid_id == present_vid_id - 1: 99 | subtitle = prev_vid_subtitle + " " 100 | 101 | subtitle = subtitle + present_vid_subtitle 102 | 103 | if next_vid != '': 104 | 105 | next_vid_name = '-'.join((next_vid.split('-'))[:-1]) 106 | next_vid_id = (next_vid.split('-'))[-1] 107 | next_vid_id = int(next_vid_id.replace('.mp4', '')) 108 | next_vid_subtitle = lines[i + 1].split(delimiter)[1] 109 | next_vid_subtitle = " ".join(next_vid_subtitle.split()) 110 | 111 | if present_vid_name == next_vid_name and next_vid_id == present_vid_id + 1: 112 | subtitle = subtitle + " " + next_vid_subtitle 113 | 114 | features_name = present_vid.replace('.mp4', '.npy') 115 | st = getTime(l.split(delimiter)[2]) 116 | et = getTime(l.split(delimiter)[3]) 117 | 118 | if isfile(join(base_dir, 'features', '2d', features_name)) and isfile(join(base_dir, 'features', '3d', features_name)): 119 | 120 | two_d = np.load(join(base_dir, 'features', '2d', features_name)) 121 | three_d = np.load(join(base_dir, 'features', '3d', 
features_name)) 122 | 123 | if two_d.shape == (0, 2048) or three_d.shape == (0, 2048): 124 | print("True") 125 | continue 126 | 127 | # two_d = two_d.mean(axis = 0) 128 | # three_d = three_d.mean(axis = 0) 129 | 130 | two_d = two_d.max(axis = 0) 131 | three_d = three_d.max(axis = 0) 132 | 133 | features['2d'] = two_d 134 | features['3d'] = three_d 135 | features['caption'] = subtitle 136 | features['id'] = present_vid.replace('.mp4', '') 137 | features['st'] = st 138 | features['et'] = et 139 | 140 | features['vid_duration'] = et - st 141 | 142 | features['course_name'] = course_name 143 | 144 | data.append(features) 145 | count_match += 1 146 | 147 | print("Count = ", count) 148 | total_count += count 149 | 150 | print("Count match = ", count_match) 151 | print("Total count = ", total_count) 152 | 153 | with open(join(base_dir, 'seg_10s15s_2d3dprevnext.pkl'), 'wb') as handle: 154 | pkl.dump(data, handle, protocol=pkl.HIGHEST_PROTOCOL) -------------------------------------------------------------------------------- /code/lecture_aware_embds/video_feature_extractor/create_pickle_seg2.py: -------------------------------------------------------------------------------- 1 | import pickle as pkl 2 | import os.path 3 | import os 4 | import argparse 5 | import json 6 | 7 | from tqdm import tqdm 8 | 9 | from os.path import join, isfile 10 | from glob import glob 11 | 12 | # from sentence_transformers import SentenceTransformer, util 13 | 14 | import numpy as np 15 | import subprocess 16 | 17 | # The default path of DataSubset is '/ssd_scratch/cvit/AVLectures/DataSubset' (also referred to as base_dir) 18 | # base_dir should also contain a directory inside it with the name "features" which inturn contains two directories called "2d" and "3d" which contains 2d and 3d video features respectively. 19 | # If you want to change the default path, then you can do it using the optional '--base_dir' argument 20 | # After executing this code, a pickle file called 'avl.pkl' would be created inside the base_dir. 21 | 22 | parser = argparse.ArgumentParser() 23 | parser.add_argument("-b", "--base_dir", type=str, required=False, help="Path of DataSubset", default = '/ssd_scratch/cvit/AVLectures/DataSubset') 24 | args = parser.parse_args() 25 | 26 | base_dir = args.base_dir 27 | print("Base Directory:") 28 | print(base_dir) 29 | 30 | delimiter = "@#@" 31 | 32 | ocr_dir = '/ssd_scratch/cvit/darshan/OCR' 33 | 34 | def get_length(filename): 35 | result = subprocess.run(["ffprobe", "-v", "error", "-show_entries", 36 | "format=duration", "-of", 37 | "default=noprint_wrappers=1:nokey=1", filename], 38 | stdout=subprocess.PIPE, 39 | stderr=subprocess.STDOUT) 40 | return float(result.stdout) 41 | 42 | def toFFMPEGtime(t): 43 | ss, ms = divmod(t*1000, 1000) 44 | mm, ss = divmod(ss, 60) 45 | hh, mm = divmod(mm, 60) 46 | 47 | return "{:02d}:{:02d}:{:02d},{:03d}".format(int(hh), int(mm), int(ss), int(ms)) 48 | 49 | def getTime(t): 50 | h, m, sms = t.split(":") 51 | if ',' in sms: # Example t = '00:00:03,980' 52 | s, ms = sms.split(",") 53 | elif '.' 
in sms: # Example t = '00:00:03.980' 54 | s, ms = sms.split(".") 55 | else: # Example t = '00:00:03' 56 | s = sms 57 | ms = '0' 58 | tm = 3600 * int(h) + 60 * int(m) + int(s) + int(ms.ljust(3, '0'))/1000 59 | return tm 60 | 61 | def getOCR(course_name, vid_name, st, et): 62 | lec_name = "-".join(vid_name.split('-')[:-1]) 63 | split_num = int(vid_name.split('-')[-1].replace('.mp4', '')) 64 | 65 | # json_data = glob(join(ocr_dir, course_name, lec_name, '*.json'))[0] 66 | # json_data = open(json_data, 'r') 67 | # json_data = json.load(json_data) 68 | # fps = round(json_data['frame_metadata']['True FPS'], 2) 69 | 70 | ocr_text = "" 71 | ocr_frame_num = None 72 | ocr_frame_ts = None 73 | ocr_lec_name = lec_name 74 | 75 | for fl in glob(join(ocr_dir, course_name, lec_name, '*.json')): 76 | json_data_fl_ref = open(fl, 'r') 77 | json_data_fl = json.load(json_data_fl_ref) 78 | json_data_fl_ref.close() 79 | fps = json_data_fl['frame_metadata']['True FPS'] 80 | frame_num = json_data_fl['frame_metadata']['Frame number'] 81 | 82 | if frame_num == 1: 83 | continue 84 | 85 | frame_ts = round(frame_num / fps, 3) 86 | 87 | if st <= frame_ts and frame_ts <= et: 88 | if 'fullTextAnnotation' in json_data_fl: 89 | ocr_text = json_data_fl['fullTextAnnotation']['text'] 90 | ocr_frame_num = frame_num 91 | ocr_frame_ts = frame_ts 92 | break 93 | 94 | return ocr_text, ocr_frame_num, ocr_frame_ts, ocr_lec_name 95 | 96 | data = [] 97 | 98 | folder_list = [] 99 | 100 | for fl in glob(join(base_dir, '*')): 101 | if 'mit' in fl: 102 | folder_list.append(fl) 103 | 104 | folder_list.sort() 105 | print(folder_list) 106 | 107 | # model_qa = SentenceTransformer('multi-qa-mpnet-base-dot-v1') 108 | # model_ss = SentenceTransformer('all-mpnet-base-v2') 109 | 110 | count_match = 0 111 | total_count = 0 112 | 113 | for folder in folder_list: 114 | count = 0 115 | 116 | # print("Inside - ", folder) 117 | course_name = folder.split('/')[-1] 118 | print("Inside course -", course_name) 119 | 120 | with open(join(folder, 'combined.txt'), 'r') as text_file: 121 | 122 | lines = text_file.readlines() 123 | 124 | line_lst = [] 125 | 126 | # for line in lines: 127 | # l = line.strip() 128 | # line_lst.append(l) 129 | 130 | for line in tqdm(lines): 131 | l = line.strip() 132 | count += 1 133 | features = {} 134 | vid_name = l.split(delimiter)[0] 135 | subtitle = l.split(delimiter)[1] 136 | subtitle = " ".join(subtitle.split()) 137 | st = getTime(l.split(delimiter)[2]) 138 | et = getTime(l.split(delimiter)[3]) 139 | features_name = vid_name.replace('.mp4', '.npy') 140 | 141 | # if isfile(join(base_dir, 'features', '2d', features_name)): 142 | if isfile(join(base_dir, 'features', '2d', features_name)) and isfile(join(base_dir, 'features', '3d', features_name)): 143 | 144 | two_d = np.load(join(base_dir, 'features', '2d', features_name)) 145 | three_d = np.load(join(base_dir, 'features', '3d', features_name)) 146 | 147 | #if two_d.shape == (0, 2048): 148 | if two_d.shape == (0, 2048) or three_d.shape == (0, 2048): 149 | print("True") 150 | continue 151 | 152 | # if mean-pooling 153 | # two_d = two_d.mean(axis = 0) 154 | # three_d = three_d.mean(axis = 0) 155 | 156 | # if max-pooling 157 | two_d = two_d.max(axis = 0) 158 | three_d = three_d.max(axis = 0) 159 | 160 | features['2d'] = two_d 161 | features['3d'] = three_d 162 | features['caption'] = subtitle 163 | features['id'] = vid_name.replace('.mp4', '') 164 | # features['vid_duration'] = get_length(join(folder, 'splits_vid', vid_name)) 165 | features['vid_duration'] = et - st 166 | 
features['st'] = st 167 | features['et'] = et 168 | features['course_name'] = course_name 169 | 170 | # features['emb_qa'] = model_qa.encode(subtitle) 171 | # features['emb_ss'] = model_ss.encode(subtitle) 172 | 173 | # retrieving OCR data 174 | lec_name = "-".join(vid_name.split('-')[:-1]) 175 | 176 | ocr_data = getOCR(course_name, vid_name, float(st), float(et)) 177 | 178 | features['ocr_text'] = " ".join(ocr_data[0].split()) 179 | features['ocr_frame_num'] = ocr_data[1] 180 | features['ocr_frame_ts'] = ocr_data[2] 181 | features['ocr_lec_name'] = ocr_data[3] 182 | 183 | data.append(features) 184 | count_match += 1 185 | 186 | print("Count = ", count) 187 | total_count += count 188 | 189 | print("Count match = ", count_match) 190 | print("Total count = ", total_count) 191 | 192 | with open(join(base_dir, 'v1_1.pkl'), 'wb') as handle: 193 | pkl.dump(data, handle, protocol=pkl.HIGHEST_PROTOCOL) 194 | -------------------------------------------------------------------------------- /code/lecture_aware_embds/video_feature_extractor/create_pickle_seg2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH -A darshan.singh 3 | #SBATCH -n 10 4 | #SBATCH --gres=gpu:1 5 | #SBATCH --mem-per-cpu=2G 6 | #SBATCH --time=INFINITE 7 | #SBATCH --mail-user=darshans012@gmail.com 8 | #SBATCH --mail-type=ALL 9 | 10 | module load ffmpeg/4.4.1 11 | 12 | echo "Pickling started" 13 | 14 | python create_pickle_seg2.py --base_dir='/ssd_scratch/cvit/darshan/segmentation_dataset_v1_10s15s' 15 | 16 | echo "Pickling ended successfully" 17 | -------------------------------------------------------------------------------- /code/lecture_aware_embds/video_feature_extractor/create_pickle_seg2_55.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH -A darshan.singh 3 | #SBATCH -n 10 4 | #SBATCH --gres=gpu:1 5 | #SBATCH --mem-per-cpu=2G 6 | #SBATCH --time=INFINITE 7 | #SBATCH --mail-user=darshans012@gmail.com 8 | #SBATCH --mail-type=ALL 9 | 10 | module load ffmpeg/4.4.1 11 | 12 | echo "Pickling started" 13 | 14 | python create_pickle_seg2.py --base_dir='/ssd_scratch/cvit/darshan/segmentation_dataset_v1_10s15s' 15 | 16 | echo "Pickling ended successfully" 17 | -------------------------------------------------------------------------------- /code/lecture_aware_embds/video_feature_extractor/create_pickle_seg2_mp.py: -------------------------------------------------------------------------------- 1 | import pickle as pkl 2 | import os.path 3 | import os 4 | import argparse 5 | import json 6 | 7 | from concurrent.futures import ThreadPoolExecutor, as_completed, ProcessPoolExecutor 8 | 9 | from tqdm import tqdm 10 | 11 | from os.path import join, isfile 12 | from glob import glob 13 | 14 | # from sentence_transformers import SentenceTransformer, util 15 | 16 | import numpy as np 17 | import subprocess 18 | 19 | # The default path of DataSubset is '/ssd_scratch/cvit/AVLectures/DataSubset' (also referred to as base_dir) 20 | # base_dir should also contain a directory inside it with the name "features" which inturn contains two directories called "2d" and "3d" which contains 2d and 3d video features respectively. 21 | # If you want to change the default path, then you can do it using the optional '--base_dir' argument 22 | # After executing this code, a pickle file called 'avl.pkl' would be created inside the base_dir. 
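# Note (added for clarity): this multiprocessing variant does not rebuild the
# features from scratch. It loads the 'v1_1.pkl' produced by
# create_pickle_seg2.py (hard-coded path below) and enriches each record with
# OCR fields in parallel via ProcessPoolExecutor. Records whose course has no
# OCR directory come back from do_job() as None and are only filtered out
# later (see the `if f is not None` check in merge_and_bert.py). Also note
# that the --file_name argument is overwritten by the hard-coded "20_25" a few
# lines below, so the output file is always 'v1_2d3dOCR_20_25.pkl'.
#
# Illustrative helper (not part of the original script) to drop the None
# entries right after the parallel gather, if desired:
def _drop_missing_ocr(records):
    kept = [r for r in records if r is not None]
    print('kept {} of {} records with OCR directories'.format(len(kept), len(records)))
    return kept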
23 | 24 | parser = argparse.ArgumentParser() 25 | parser.add_argument("-b", "--base_dir", type=str, required=False, help="Path of DataSubset", default = '/ssd_scratch/cvit/AVLectures/DataSubset') 26 | parser.add_argument("-f", "--file_name", type=str, required=False, help="", default = 'm') 27 | args = parser.parse_args() 28 | 29 | base_dir = args.base_dir 30 | print("Base Directory:") 31 | print(base_dir) 32 | 33 | f_name = args.file_name 34 | f_name = "20_25" 35 | print(f_name) 36 | 37 | delimiter = "@#@" 38 | 39 | ocr_dir = '/ssd_scratch/cvit/darshan/OCR/dataset_MITOCW_v1' 40 | 41 | base_pkl = pkl.load(open('/ssd_scratch/cvit/darshan/dataset_MITOCW_v1_20s25s/v1_1.pkl', 'rb')) 42 | 43 | def get_length(filename): 44 | result = subprocess.run(["ffprobe", "-v", "error", "-show_entries", 45 | "format=duration", "-of", 46 | "default=noprint_wrappers=1:nokey=1", filename], 47 | stdout=subprocess.PIPE, 48 | stderr=subprocess.STDOUT) 49 | return float(result.stdout) 50 | 51 | def toFFMPEGtime(t): 52 | ss, ms = divmod(t*1000, 1000) 53 | mm, ss = divmod(ss, 60) 54 | hh, mm = divmod(mm, 60) 55 | 56 | return "{:02d}:{:02d}:{:02d},{:03d}".format(int(hh), int(mm), int(ss), int(ms)) 57 | 58 | def getTime(t): 59 | h, m, sms = t.split(":") 60 | if ',' in sms: # Example t = '00:00:03,980' 61 | s, ms = sms.split(",") 62 | elif '.' in sms: # Example t = '00:00:03.980' 63 | s, ms = sms.split(".") 64 | else: # Example t = '00:00:03' 65 | s = sms 66 | ms = '0' 67 | tm = 3600 * int(h) + 60 * int(m) + int(s) + int(ms.ljust(3, '0'))/1000 68 | return tm 69 | 70 | def getOCR(course_name, vid_name, st, et): 71 | lec_name = "-".join(vid_name.split('-')[:-1]) 72 | split_num = int(vid_name.split('-')[-1].replace('.mp4', '')) 73 | 74 | # json_data = glob(join(ocr_dir, course_name, lec_name, '*.json'))[0] 75 | # json_data = open(json_data, 'r') 76 | # json_data = json.load(json_data) 77 | # fps = round(json_data['frame_metadata']['True FPS'], 2) 78 | 79 | ocr_text = "" 80 | ocr_frame_num = None 81 | ocr_frame_ts = None 82 | ocr_lec_name = lec_name 83 | 84 | for fl in glob(join(ocr_dir, course_name, lec_name, '*.json')): 85 | json_data_fl_ref = open(fl, 'r') 86 | json_data_fl = json.load(json_data_fl_ref) 87 | json_data_fl_ref.close() 88 | fps = json_data_fl['frame_metadata']['True FPS'] 89 | frame_num = json_data_fl['frame_metadata']['Frame number'] 90 | 91 | if frame_num == 1: 92 | continue 93 | 94 | frame_ts = round(frame_num / fps, 3) 95 | 96 | if st <= frame_ts and frame_ts <= et: 97 | if 'fullTextAnnotation' in json_data_fl: 98 | ocr_text = json_data_fl['fullTextAnnotation']['text'] 99 | ocr_frame_num = frame_num 100 | ocr_frame_ts = frame_ts 101 | break 102 | 103 | return ocr_text, ocr_frame_num, ocr_frame_ts, ocr_lec_name 104 | 105 | 106 | def do_job(l): 107 | 108 | course_name = l['course_name'] 109 | 110 | if not os.path.isdir(join(ocr_dir, course_name)): 111 | return 112 | 113 | st = l['st'] 114 | et = l['et'] 115 | vid_name = l['id'] + '.mp4' 116 | 117 | lec_name = "-".join(vid_name.split('-')[:-1]) 118 | 119 | ocr_data = getOCR(course_name, vid_name, float(st), float(et)) 120 | 121 | l['ocr_text'] = " ".join(ocr_data[0].split()) 122 | l['ocr_frame_num'] = ocr_data[1] 123 | l['ocr_frame_ts'] = ocr_data[2] 124 | l['ocr_lec_name'] = ocr_data[3] 125 | 126 | return l 127 | 128 | 129 | count_match = 0 130 | total_count = 0 131 | # p = ThreadPoolExecutor(20) 132 | p = ProcessPoolExecutor(19) 133 | 134 | futures = [p.submit(do_job, li) for li in base_pkl] 135 | x = [r.result() for r in tqdm(as_completed(futures), 
total=len(futures))] 136 | 137 | # print(d_ocr[0]) 138 | # print(x) 139 | 140 | with open(join(base_dir, 'v1_2d3dOCR_{}.pkl'.format(f_name)), 'wb') as handle: 141 | pkl.dump(x, handle, protocol=pkl.HIGHEST_PROTOCOL) 142 | -------------------------------------------------------------------------------- /code/lecture_aware_embds/video_feature_extractor/create_pickle_seg2_mp_55.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH -A darshan.singh 3 | #SBATCH -n 20 4 | #SBATCH --gres=gpu:1 5 | #SBATCH --mem-per-cpu=2G 6 | #SBATCH --time=INFINITE 7 | #SBATCH --mail-user=darshans012@gmail.com 8 | #SBATCH --mail-type=ALL 9 | 10 | 11 | echo "Pickling started" 12 | 13 | python create_pickle_seg2_mp.py --base_dir='/ssd_scratch/cvit/darshan/dataset_MITOCW_v1' --file_name='m050_m156' 14 | 15 | echo "Pickling ended successfully" 16 | -------------------------------------------------------------------------------- /code/lecture_aware_embds/video_feature_extractor/create_pickle_seg2_mp_92.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH -A darshan.singh 3 | #SBATCH -n 20 4 | #SBATCH --gres=gpu:2 5 | #SBATCH --mem-per-cpu=2G 6 | #SBATCH --time=INFINITE 7 | #SBATCH --mail-user=darshans012@gmail.com 8 | #SBATCH --mail-type=ALL 9 | 10 | module load ffmpeg/4.4.1 11 | 12 | echo "Pickling started" 13 | 14 | python create_pickle_seg2_mp.py --base_dir='/ssd_scratch/cvit/darshan/dataset_MITOCW_v1' --file_name='m080_mit103' 15 | 16 | echo "Pickling ended successfully" 17 | -------------------------------------------------------------------------------- /code/lecture_aware_embds/video_feature_extractor/create_pickle_segmentation.py: -------------------------------------------------------------------------------- 1 | import pickle as pkl 2 | import os.path 3 | import os 4 | import argparse 5 | 6 | from os.path import join, isfile 7 | from glob import glob 8 | 9 | import numpy as np 10 | 11 | # The default path of DataSubset is '/ssd_scratch/cvit/AVLectures/DataSubset' (also referred to as base_dir) 12 | # base_dir should also contain a directory inside it with the name "features" which inturn contains two directories called "2d" and "3d" which contains 2d and 3d video features respectively. 13 | # If you want to change the default path, then you can do it using the optional '--base_dir' argument 14 | # After executing this code, a pickle file called 'avl.pkl' would be created inside the base_dir. 
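# Note (added for clarity): despite the header above, this variant writes
# 'seg.pkl' (see the bottom of this script), splits combined.txt on '|' rather
# than '@#@', and keeps only the 2d features: every row of a clip's (n, 2048)
# 2d matrix becomes its own entry of the form
#   {'2d': <2048-d row>, 'caption': 'frame <k>', 'id': 'frame <k+1>'}
# Because row_num is incremented between setting 'caption' and 'id', the two
# frame indices differ by one for every entry; downstream code should rely on
# only one of them.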
15 | 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument("-b", "--base_dir", type=str, required=False, help="Path of DataSubset", default = '/ssd_scratch/cvit/AVLectures/DataSubset') 18 | args = parser.parse_args() 19 | 20 | base_dir = args.base_dir 21 | print("Base Directory:") 22 | print(base_dir) 23 | 24 | data = [] 25 | 26 | folder_list = [] 27 | 28 | for fl in glob(join(base_dir, '*')): 29 | if 'mit' in fl: 30 | folder_list.append(fl) 31 | 32 | folder_list.sort() 33 | print(folder_list) 34 | 35 | count_match = 0 36 | total_count = 0 37 | row_num = 0 38 | 39 | for folder in folder_list: 40 | count = 0 41 | 42 | print("Inside - ", folder) 43 | 44 | with open(join(folder, 'combined.txt'), 'r') as text_file: 45 | 46 | lines = text_file.readlines() 47 | 48 | for line in lines: 49 | count += 1 50 | 51 | vid_name = line.split('|')[0] 52 | subtitle = line.split('|')[1] 53 | features_name = vid_name.replace('.mp4', '.npy') 54 | 55 | #if isfile(join(base_dir, 'features', '2d', features_name)) and isfile(join(base_dir, 'features', '3d', features_name)): 56 | 57 | two_d = np.load(join(base_dir, 'features', '2d', features_name)) 58 | # three_d = np.load(join(base_dir, 'features', '3d', features_name)) 59 | 60 | if two_d.shape == (0, 2048): 61 | print("True") 62 | continue 63 | 64 | # two_d = two_d.mean(axis = 0) 65 | # three_d = three_d.mean(axis = 0) 66 | 67 | # two_d = two_d.max(axis = 0) 68 | # three_d = three_d.max(axis = 0) 69 | 70 | for row in two_d: 71 | features = {} 72 | features['2d'] = row 73 | features['caption'] = 'frame ' + str(row_num) 74 | row_num += 1 75 | features['id'] = 'frame ' + str(row_num) 76 | data.append(features) 77 | count_match += 1 78 | 79 | 80 | # features['2d'] = two_d 81 | # features['3d'] = three_d 82 | # features['caption'] = subtitle 83 | # features['id'] = vid_name.replace('.mp4', '') 84 | 85 | # data.append(features) 86 | # count_match += 1 87 | 88 | print("Count = ", count) 89 | total_count += count 90 | 91 | print("Count match = ", count_match) 92 | print("Total count = ", total_count) 93 | 94 | with open(join(base_dir, 'seg.pkl'), 'wb') as handle: 95 | pkl.dump(data, handle, protocol=pkl.HIGHEST_PROTOCOL) 96 | -------------------------------------------------------------------------------- /code/lecture_aware_embds/video_feature_extractor/create_pkl_tst.py: -------------------------------------------------------------------------------- 1 | import pickle as pkl 2 | import os.path 3 | import os 4 | import argparse 5 | 6 | from os.path import join, isfile 7 | from glob import glob 8 | 9 | import numpy as np 10 | 11 | # The default path of DataSubset is '/ssd_scratch/cvit/AVLectures/DataSubset' (also referred to as base_dir) 12 | # base_dir should also contain a directory inside it with the name "features" which inturn contains two directories called "2d" and "3d" which contains 2d and 3d video features respectively. 13 | # If you want to change the default path, then you can do it using the optional '--base_dir' argument 14 | # After executing this code, a pickle file called 'avl.pkl' would be created inside the base_dir. 
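# Note (added for clarity): this test variant departs from the header above in
# several ways: it only processes folders matching 'mit006', it splits
# combined.txt on '|' instead of '@#@', it stores no feature vectors, and the
# single dict it pickles maps each segment id to its cleaned subtitle string,
# written as 'm6_pysd_subs.pkl' rather than 'avl.pkl'. Illustrative usage of
# the resulting file (the segment id shown is hypothetical):
#   subs = pkl.load(open(join(base_dir, 'm6_pysd_subs.pkl'), 'rb'))
#   print(subs['MIT6_042JF10_lec17_300k-00000'])   # -> one subtitle string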
15 | 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument("-b", "--base_dir", type=str, required=False, help="Path of DataSubset", default = '/ssd_scratch/cvit/AVLectures/DataSubset') 18 | args = parser.parse_args() 19 | 20 | base_dir = args.base_dir 21 | print("Base Directory:") 22 | print(base_dir) 23 | 24 | data = [] 25 | 26 | folder_list = [] 27 | 28 | for fl in glob(join(base_dir, '*')): 29 | # if ('mit011' in fl) or ('mit012' in fl): 30 | # continue 31 | # if 'mit' in fl: 32 | # folder_list.append(fl) 33 | 34 | if ('mit006' in fl): 35 | folder_list.append(fl) 36 | 37 | folder_list.sort() 38 | print(folder_list) 39 | 40 | count_match = 0 41 | total_count = 0 42 | 43 | features = {} 44 | 45 | for folder in folder_list: 46 | count = 0 47 | 48 | print("Inside - ", folder) 49 | 50 | with open(join(folder, 'combined.txt'), 'r') as text_file: 51 | 52 | lines = text_file.readlines() 53 | 54 | for line in lines: 55 | count += 1 56 | 57 | vid_name = line.split('|')[0] 58 | subtitle = line.split('|')[1] 59 | subtitle = " ".join(subtitle.split()) 60 | 61 | features_name = vid_name.replace('.mp4', '.npy') 62 | 63 | if isfile(join(base_dir, 'features', '2d', features_name)) and isfile(join(base_dir, 'features', '3d', features_name)): 64 | 65 | two_d = np.load(join(base_dir, 'features', '2d', features_name)) 66 | three_d = np.load(join(base_dir, 'features', '3d', features_name)) 67 | 68 | if two_d.shape == (0, 2048) or three_d.shape == (0, 2048) or len(subtitle) == 0: 69 | print("True") 70 | continue 71 | 72 | # two_d = two_d.mean(axis = 0) 73 | # three_d = three_d.mean(axis = 0) 74 | 75 | two_d = two_d.max(axis = 0) 76 | three_d = three_d.max(axis = 0) 77 | 78 | # features['2d'] = two_d 79 | # features['3d'] = three_d 80 | # features['caption'] = subtitle 81 | features[features_name.replace('.npy', '')] = subtitle 82 | 83 | # data.append(features) 84 | count_match += 1 85 | 86 | print("Count = ", count) 87 | total_count += count 88 | 89 | print("Count match = ", count_match) 90 | print("Total count = ", total_count) 91 | 92 | with open(join(base_dir, 'm6_pysd_subs.pkl'), 'wb') as handle: 93 | pkl.dump(features, handle, protocol=pkl.HIGHEST_PROTOCOL) 94 | -------------------------------------------------------------------------------- /code/lecture_aware_embds/video_feature_extractor/extract.py: -------------------------------------------------------------------------------- 1 | import torch as th 2 | import math 3 | import numpy as np 4 | from video_loader import VideoLoader 5 | from torch.utils.data import DataLoader 6 | import argparse 7 | from model import get_model 8 | from preprocessing import Preprocessing 9 | from random_sequence_shuffler import RandomSequenceSampler 10 | import torch.nn.functional as F 11 | import os 12 | 13 | parser = argparse.ArgumentParser(description='Easy video feature extractor') 14 | 15 | parser.add_argument( 16 | '--csv', 17 | type=str, 18 | help='input csv with video input path') 19 | parser.add_argument('--batch_size', type=int, default=64, 20 | help='batch size') 21 | parser.add_argument('--type', type=str, default='2d', 22 | help='CNN type') 23 | parser.add_argument('--half_precision', type=int, default=1, 24 | help='output half precision float') 25 | parser.add_argument('--num_decoding_thread', type=int, default=4, 26 | help='Num parallel thread for video decoding') 27 | parser.add_argument('--l2_normalize', type=int, default=1, 28 | help='l2 normalize feature') 29 | parser.add_argument('--resnext101_model_path', type=str, 
default='model/resnext101.pth', 30 | help='Resnext model path') 31 | args = parser.parse_args() 32 | 33 | dataset = VideoLoader( 34 | args.csv, 35 | framerate=1 if args.type == '2d' else 24, 36 | size=224 if args.type == '2d' else 112, 37 | centercrop=(args.type == '3d'), 38 | ) 39 | n_dataset = len(dataset) 40 | sampler = RandomSequenceSampler(n_dataset, 10) 41 | loader = DataLoader( 42 | dataset, 43 | batch_size=1, 44 | shuffle=False, 45 | num_workers=args.num_decoding_thread, 46 | sampler=sampler if n_dataset > 10 else None, 47 | ) 48 | preprocess = Preprocessing(args.type) 49 | model = get_model(args) 50 | 51 | with th.no_grad(): 52 | for k, data in enumerate(loader): 53 | input_file = data['input'][0] 54 | output_file = data['output'][0] 55 | base_dir = "/".join(output_file.split('/')[:-1]) 56 | os.makedirs(base_dir, exist_ok=True) 57 | if len(data['video'].shape) > 3: 58 | print('Computing features of video {}/{}: {}'.format( 59 | k + 1, n_dataset, input_file)) 60 | video = data['video'].squeeze() 61 | if len(video.shape) == 4: 62 | video = preprocess(video) 63 | n_chunk = len(video) 64 | features = th.cuda.FloatTensor(n_chunk, 2048).fill_(0) 65 | n_iter = int(math.ceil(n_chunk / float(args.batch_size))) 66 | for i in range(n_iter): 67 | min_ind = i * args.batch_size 68 | max_ind = (i + 1) * args.batch_size 69 | video_batch = video[min_ind:max_ind].cuda() 70 | batch_features = model(video_batch) 71 | if args.l2_normalize: 72 | batch_features = F.normalize(batch_features, dim=1) 73 | features[min_ind:max_ind] = batch_features 74 | features = features.cpu().numpy() 75 | if args.half_precision: 76 | features = features.astype('float16') 77 | np.save(output_file, features) 78 | else: 79 | print('Video {} already processed.'.format(input_file)) 80 | -------------------------------------------------------------------------------- /code/lecture_aware_embds/video_feature_extractor/extract_features_2d_indi.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH -A darshan.singh 3 | #SBATCH -n 10 4 | #SBATCH --gres=gpu:1 5 | #SBATCH --mem-per-cpu=2G 6 | #SBATCH --time=04-00:00:00 7 | #SBATCH --mail-user=darshans012@gmail.com 8 | #SBATCH --mail-type=ALL 9 | 10 | echo "Started" 11 | 12 | python extract.py --csv=/ssd_scratch/cvit/AVL/data_subset_50s_60s/input_2d_m011_m012.csv --type=2d --batch_size=64 --num_decoding_thread=8 13 | 14 | echo "Done successfully" 15 | -------------------------------------------------------------------------------- /code/lecture_aware_embds/video_feature_extractor/extract_features_3d_indi.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH -A darshan.singh 3 | #SBATCH -n 10 4 | #SBATCH --gres=gpu:1 5 | #SBATCH --mem-per-cpu=2G 6 | #SBATCH --time=04-00:00:00 7 | #SBATCH --mail-user=darshans012@gmail.com 8 | #SBATCH --mail-type=ALL 9 | 10 | echo "Started" 11 | 12 | python extract.py --csv=/ssd_scratch/cvit/AVL/data_subset_50s_60s/input_3d_m011_m012.csv --type=3d --batch_size=64 --num_decoding_thread=8 13 | 14 | echo "Done successfully" 15 | -------------------------------------------------------------------------------- /code/lecture_aware_embds/video_feature_extractor/helper_pkl.py: -------------------------------------------------------------------------------- 1 | import pickle as pkl 2 | import os.path 3 | import os 4 | import argparse 5 | import json 6 | 7 | from tqdm import tqdm 8 | 9 | from os.path import join, isfile 10 | from glob import glob 11 
| 12 | import numpy as np 13 | import subprocess 14 | 15 | base_dir = '/ssd_scratch/cvit/darshan/dataset_MITOCW_v1_20s25s' 16 | 17 | delimiter = "@#@" 18 | 19 | def toFFMPEGtime(t): 20 | ss, ms = divmod(t*1000, 1000) 21 | mm, ss = divmod(ss, 60) 22 | hh, mm = divmod(mm, 60) 23 | 24 | return "{:02d}:{:02d}:{:02d},{:03d}".format(int(hh), int(mm), int(ss), int(ms)) 25 | 26 | def getTime(t): 27 | h, m, sms = t.split(":") 28 | if ',' in sms: # Example t = '00:00:03,980' 29 | s, ms = sms.split(",") 30 | elif '.' in sms: # Example t = '00:00:03.980' 31 | s, ms = sms.split(".") 32 | else: # Example t = '00:00:03' 33 | s = sms 34 | ms = '0' 35 | tm = 3600 * int(h) + 60 * int(m) + int(s) + int(ms.ljust(3, '0'))/1000 36 | return tm 37 | 38 | 39 | base_pkl = pkl.load(open(join(base_dir, 'dataset_v1_20s25s_2d3dOCRBERT.pkl'), 'rb')) 40 | 41 | 42 | d = {} 43 | 44 | for f in tqdm(base_pkl): 45 | vid_name = "-".join(f['id'].split('-')[:-1]) 46 | if vid_name not in d: 47 | d[vid_name] = [] 48 | d[vid_name].append(f) 49 | 50 | 51 | with open(join(base_dir, 'dataset_v1_helper_20s25s.pkl'), 'wb') as handle: 52 | pkl.dump(d, handle, protocol=pkl.HIGHEST_PROTOCOL) 53 | -------------------------------------------------------------------------------- /code/lecture_aware_embds/video_feature_extractor/lec_list.py: -------------------------------------------------------------------------------- 1 | import pickle as pkl 2 | import os.path 3 | import os 4 | import argparse 5 | import json 6 | 7 | from tqdm import tqdm 8 | 9 | from os.path import join, isfile 10 | from glob import glob 11 | 12 | import numpy as np 13 | import subprocess 14 | 15 | base_dir = '/ssd_scratch/cvit/darshan/dataset_MITOCW_v1_20s25s' 16 | 17 | delimiter = "@#@" 18 | 19 | def toFFMPEGtime(t): 20 | ss, ms = divmod(t*1000, 1000) 21 | mm, ss = divmod(ss, 60) 22 | hh, mm = divmod(mm, 60) 23 | 24 | return "{:02d}:{:02d}:{:02d},{:03d}".format(int(hh), int(mm), int(ss), int(ms)) 25 | 26 | def getTime(t): 27 | h, m, sms = t.split(":") 28 | if ',' in sms: # Example t = '00:00:03,980' 29 | s, ms = sms.split(",") 30 | elif '.' 
in sms: # Example t = '00:00:03.980' 31 | s, ms = sms.split(".") 32 | else: # Example t = '00:00:03' 33 | s = sms 34 | ms = '0' 35 | tm = 3600 * int(h) + 60 * int(m) + int(s) + int(ms.ljust(3, '0'))/1000 36 | return tm 37 | 38 | 39 | base_pkl = pkl.load(open(join(base_dir, 'dataset_v1_20s25s_2d3dOCRBERT.pkl'), 'rb')) 40 | 41 | 42 | d = [] 43 | 44 | for f in tqdm(base_pkl): 45 | vid_name = "-".join(f['id'].split('-')[:-1]) 46 | if vid_name not in d: 47 | d.append(vid_name) 48 | 49 | with open(join(base_dir, 'dataset_v1_leclist_20s25s.pkl'), 'wb') as handle: 50 | pkl.dump(d, handle, protocol=pkl.HIGHEST_PROTOCOL) 51 | 52 | print(len(d)) 53 | -------------------------------------------------------------------------------- /code/lecture_aware_embds/video_feature_extractor/merge_and_bert.py: -------------------------------------------------------------------------------- 1 | import pickle as pkl 2 | import os.path 3 | import os 4 | import argparse 5 | import json 6 | 7 | from tqdm import tqdm 8 | 9 | from os.path import join, isfile 10 | from glob import glob 11 | 12 | from sentence_transformers import SentenceTransformer, util 13 | 14 | import numpy as np 15 | import subprocess 16 | import copy 17 | 18 | base_dir = '/ssd_scratch/cvit/darshan/dataset_MITOCW_v1_20s25s' 19 | 20 | print("Loading BERT models") 21 | 22 | # model_qa = SentenceTransformer('multi-qa-mpnet-base-dot-v1') 23 | model_ss = SentenceTransformer('all-mpnet-base-v2') 24 | 25 | print("Done model loading") 26 | 27 | p_list = ['/ssd_scratch/cvit/darshan/dataset_MITOCW_v1_20s25s/v1_2d3dOCR_20_25_1.pkl', 28 | '/ssd_scratch/cvit/darshan/dataset_MITOCW_v1_20s25s/v1_2d3dOCR_20_25_2.pkl'] 29 | 30 | # for p in glob(join(base_dir, '*')): 31 | # p_list.append(p) 32 | 33 | p_list.sort() 34 | 35 | # p_list = [join(base_dir, 'v1_2d3dOCR_4_8.pkl')] 36 | 37 | print(p_list) 38 | 39 | data = [] 40 | 41 | for pkl_fl in p_list: 42 | b = pkl.load(open(pkl_fl, 'rb')) 43 | 44 | for f in tqdm(b): 45 | if f is not None: 46 | features = f.copy() 47 | 48 | caption = features['caption'] 49 | 50 | # features['emb_qa'] = model_qa.encode(caption) 51 | features['emb_ss'] = model_ss.encode(caption) 52 | 53 | ocr_text = f['ocr_text'] 54 | # features['ocr_emb_qa'] = model_qa.encode(ocr_text) 55 | features['ocr_emb_ss'] = model_ss.encode(ocr_text) 56 | 57 | data.append(features) 58 | 59 | 60 | print(len(data)) 61 | 62 | with open(join(base_dir, 'dataset_v1_20s25s_2d3dOCRBERT.pkl'), 'wb') as handle: 63 | pkl.dump(data, handle, protocol=pkl.HIGHEST_PROTOCOL) 64 | -------------------------------------------------------------------------------- /code/lecture_aware_embds/video_feature_extractor/merge_and_bert.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH -A darshan.singh 3 | #SBATCH -n 20 4 | #SBATCH --gres=gpu:2 5 | #SBATCH --mem-per-cpu=2G 6 | #SBATCH --time=INFINITE 7 | #SBATCH --mail-user=darshans012@gmail.com 8 | #SBATCH --mail-type=ALL 9 | 10 | echo "started" 11 | 12 | python merge_and_bert.py 13 | 14 | echo "Finished successfully" 15 | -------------------------------------------------------------------------------- /code/lecture_aware_embds/video_feature_extractor/merge_and_bert_mp.py: -------------------------------------------------------------------------------- 1 | import pickle as pkl 2 | import os.path 3 | import os 4 | import argparse 5 | import json 6 | 7 | from tqdm import tqdm 8 | 9 | from os.path import join, isfile 10 | from glob import glob 11 | 12 | from sentence_transformers import 
SentenceTransformer, util 13 | 14 | from concurrent.futures import ThreadPoolExecutor, as_completed, ProcessPoolExecutor 15 | 16 | import numpy as np 17 | import subprocess 18 | import copy 19 | 20 | base_dir = '/ssd_scratch/cvit/darshan/dataset_MITOCW_v1_4s8s' 21 | 22 | print("Loading BERT models") 23 | 24 | # model_qa = SentenceTransformer('multi-qa-mpnet-base-dot-v1') 25 | model_ss = SentenceTransformer('all-mpnet-base-v2') 26 | 27 | print("Done model loading") 28 | 29 | p_list = [] 30 | 31 | for p in glob(join(base_dir, '*')): 32 | p_list.append(p) 33 | 34 | p_list.sort() 35 | 36 | # p_list = [join(base_dir, 'v1_2d3dOCR_4_8.pkl')] 37 | 38 | print(p_list) 39 | 40 | data = [] 41 | 42 | def do_job(f): 43 | features = f.copy() 44 | 45 | caption = features['caption'] 46 | 47 | # features['emb_qa'] = model_qa.encode(caption) 48 | features['emb_ss'] = model_ss.encode(caption) 49 | 50 | ocr_text = f['ocr_text'] 51 | # features['ocr_emb_qa'] = model_qa.encode(ocr_text) 52 | features['ocr_emb_ss'] = model_ss.encode(ocr_text) 53 | 54 | return features 55 | 56 | for pkl_fl in p_list: 57 | b = pkl.load(open(pkl_fl, 'rb')) 58 | 59 | p = ProcessPoolExecutor(30) 60 | 61 | futures = [p.submit(do_job, li) for li in b] 62 | x = [r.result() for r in tqdm(as_completed(futures), total=len(futures))] 63 | data.extend(x) 64 | 65 | # for f in tqdm(b): 66 | # features = f.copy() 67 | 68 | # caption = features['caption'] 69 | 70 | # # features['emb_qa'] = model_qa.encode(caption) 71 | # features['emb_ss'] = model_ss.encode(caption) 72 | 73 | # ocr_text = f['ocr_text'] 74 | # # features['ocr_emb_qa'] = model_qa.encode(ocr_text) 75 | # features['ocr_emb_ss'] = model_ss.encode(ocr_text) 76 | 77 | # data.append(features) 78 | 79 | 80 | print(len(data)) 81 | 82 | with open(join(base_dir, 'datasetv1_4s8s_2d3dOCRBERT.pkl'), 'wb') as handle: 83 | pkl.dump(data, handle, protocol=pkl.HIGHEST_PROTOCOL) -------------------------------------------------------------------------------- /code/lecture_aware_embds/video_feature_extractor/model.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import torch as th 3 | import torchvision.models as models 4 | from videocnn.models import resnext 5 | from torch import nn 6 | 7 | 8 | class GlobalAvgPool(nn.Module): 9 | def __init__(self): 10 | super(GlobalAvgPool, self).__init__() 11 | 12 | def forward(self, x): 13 | return th.mean(x, dim=[-2, -1]) 14 | 15 | 16 | def get_model(args): 17 | assert args.type in ['2d', '3d'] 18 | if args.type == '2d': 19 | print('Loading 2D-ResNet-152 ...') 20 | model = models.resnet152(pretrained=True) 21 | model = nn.Sequential(*list(model.children())[:-2], GlobalAvgPool()) 22 | model = model.cuda() 23 | else: 24 | print('Loading 3D-ResneXt-101 ...') 25 | model = resnext.resnet101( 26 | num_classes=400, 27 | shortcut_type='B', 28 | cardinality=32, 29 | sample_size=112, 30 | sample_duration=16, 31 | last_fc=False) 32 | model = model.cuda() 33 | model_data = th.load(args.resnext101_model_path) 34 | model.load_state_dict(model_data) 35 | 36 | model.eval() 37 | print('loaded') 38 | return model 39 | -------------------------------------------------------------------------------- /code/lecture_aware_embds/video_feature_extractor/ocr_bert_pickle.py: -------------------------------------------------------------------------------- 1 | import pickle as pkl 2 | import os.path 3 | import os 4 | import argparse 5 | import json 6 | 7 | from tqdm import tqdm 8 | 9 | from os.path import join, isfile 10 | from glob 
import glob 11 | 12 | from sentence_transformers import SentenceTransformer, util 13 | 14 | import numpy as np 15 | import subprocess 16 | 17 | 18 | base_dir = '/ssd_scratch/cvit/darshan/segmentation_dataset_v1_10s15s' 19 | 20 | 21 | delimiter = "@#@" 22 | 23 | def toFFMPEGtime(t): 24 | ss, ms = divmod(t*1000, 1000) 25 | mm, ss = divmod(ss, 60) 26 | hh, mm = divmod(mm, 60) 27 | 28 | return "{:02d}:{:02d}:{:02d},{:03d}".format(int(hh), int(mm), int(ss), int(ms)) 29 | 30 | def getTime(t): 31 | h, m, sms = t.split(":") 32 | if ',' in sms: # Example t = '00:00:03,980' 33 | s, ms = sms.split(",") 34 | elif '.' in sms: # Example t = '00:00:03.980' 35 | s, ms = sms.split(".") 36 | else: # Example t = '00:00:03' 37 | s = sms 38 | ms = '0' 39 | tm = 3600 * int(h) + 60 * int(m) + int(s) + int(ms.ljust(3, '0'))/1000 40 | return tm 41 | 42 | 43 | model_qa = SentenceTransformer('multi-qa-mpnet-base-dot-v1') 44 | model_ss = SentenceTransformer('all-mpnet-base-v2') 45 | 46 | base_pkl = pkl.load(open(join(base_dir, 'old_seg_10s15s_2d3dBERTOCR.pkl'), 'rb')) 47 | 48 | data = [] 49 | 50 | for f in tqdm(base_pkl): 51 | features = f.copy() 52 | ocr_text = f['ocr_text'] 53 | features['ocr_emb_qa'] = model_qa.encode(ocr_text) 54 | features['ocr_emb_ss'] = model_ss.encode(ocr_text) 55 | 56 | 57 | data.append(features) 58 | 59 | 60 | with open(join(base_dir, 'seg_10s15s_2d3dOCRBERT.pkl'), 'wb') as handle: 61 | pkl.dump(data, handle, protocol=pkl.HIGHEST_PROTOCOL) -------------------------------------------------------------------------------- /code/lecture_aware_embds/video_feature_extractor/preprocessing.py: -------------------------------------------------------------------------------- 1 | import torch as th 2 | 3 | class Normalize(object): 4 | 5 | def __init__(self, mean, std): 6 | self.mean = th.FloatTensor(mean).view(1, 3, 1, 1) 7 | self.std = th.FloatTensor(std).view(1, 3, 1, 1) 8 | 9 | def __call__(self, tensor): 10 | tensor = (tensor - self.mean) / (self.std + 1e-8) 11 | return tensor 12 | 13 | class Preprocessing(object): 14 | 15 | def __init__(self, type): 16 | self.type = type 17 | if type == '2d': 18 | self.norm = Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) 19 | elif type == '3d': 20 | self.norm = Normalize(mean=[110.6, 103.2, 96.3], std=[1.0, 1.0, 1.0]) 21 | 22 | def _zero_pad(self, tensor, size): 23 | n = size - len(tensor) % size 24 | if n == size: 25 | return tensor 26 | else: 27 | z = th.zeros(n, tensor.shape[1], tensor.shape[2], tensor.shape[3]) 28 | return th.cat((tensor, z), 0) 29 | 30 | def __call__(self, tensor): 31 | if self.type == '2d': 32 | tensor = tensor / 255.0 33 | tensor = self.norm(tensor) 34 | elif self.type == '3d': 35 | tensor = self._zero_pad(tensor, 16) 36 | tensor = self.norm(tensor) 37 | tensor = tensor.view(-1, 16, 3, 112, 112) 38 | tensor = tensor.transpose(1, 2) 39 | return tensor 40 | -------------------------------------------------------------------------------- /code/lecture_aware_embds/video_feature_extractor/random_sequence_shuffler.py: -------------------------------------------------------------------------------- 1 | import torch as th 2 | from torch.utils.data.sampler import Sampler 3 | import numpy as np 4 | 5 | class RandomSequenceSampler(Sampler): 6 | 7 | def __init__(self, n_sample, seq_len): 8 | self.n_sample = n_sample 9 | self.seq_len = seq_len 10 | 11 | def _pad_ind(self, ind): 12 | zeros = np.zeros(self.seq_len - self.n_sample % self.seq_len) 13 | ind = np.concatenate((ind, zeros)) 14 | return ind 15 | 16 | def __iter__(self): 17 | 
idx = np.arange(self.n_sample) 18 | if self.n_sample % self.seq_len != 0: 19 | idx = self._pad_ind(idx) 20 | idx = np.reshape(idx, (-1, self.seq_len)) 21 | np.random.shuffle(idx) 22 | idx = np.reshape(idx, (-1)) 23 | return iter(idx.astype(int)) 24 | 25 | def __len__(self): 26 | return self.n_sample + (self.seq_len - self.n_sample % self.seq_len) 27 | -------------------------------------------------------------------------------- /code/lecture_aware_embds/video_feature_extractor/readme.txt: -------------------------------------------------------------------------------- 1 | Step 1: 2 | Execute the "create_feature_csv.py" program. To specify the path of the Datasubset use the '--base_dir' optional argument. The default path of the DataSubset is '/ssd_scratch/cvit/AVLectures/DataSubset'. After executing this program the following will be created inside base_dir. 3 | a. input_2d.csv 4 | b. input_3d.csv 5 | c. Also empty directories called "features", "features/2d/", "features/3d/" will be created. 6 | 7 | Step 2: 8 | Once we have the 2d, 3d CSV files and empty directories to store 2d & 3d features, our next task is to extract the 2d and 3d features from the videos using the "extract.py" program. 9 | First extract the 2d features using the following command: 10 | $ python extract.py --csv=input_2d.csv --type=2d --batch_size=64 --num_decoding_thread=4 11 | Then download the 3D ResNext-101 model as follows (for 3d feature extraction): 12 | $ mkdir model 13 | $ cd model 14 | $ wget https://www.rocq.inria.fr/cluster-willow/amiech/howto100m/models/resnext101.pth 15 | Now extract the 3d features using the following command: 16 | $ python extract.py --csv=input_3d.csv --type=3d --batch_size=64 --num_decoding_thread=4 17 | 18 | Step 3: 19 | Now it is time to create the pickle file of our data. To do this execute the "create_pickle.py" program. To specify the path of the Datasubset use the '--base_dir' optional argument. The default path of the DataSubset is '/ssd_scratch/cvit/AVLectures/DataSubset'. After executing this program a pickle file called 'avl.pkl' will be created inside base_dir. 
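Note (added for clarity): the CSV files produced in Step 1 are consumed by video_loader.py, which expects two columns named 'video_path' and 'feature_path', one row per video clip. An illustrative layout (the paths shown are examples only):
    video_path,feature_path
    /ssd_scratch/cvit/AVLectures/DataSubset/mit001/splits_vid/MIT6_042JF10_lec17_300k-00000.mp4,/ssd_scratch/cvit/AVLectures/DataSubset/features/2d/MIT6_042JF10_lec17_300k-00000.npy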
20 | -------------------------------------------------------------------------------- /code/lecture_aware_embds/video_feature_extractor/video_loader.py: -------------------------------------------------------------------------------- 1 | import torch as th 2 | from torch.utils.data import Dataset 3 | import pandas as pd 4 | import os 5 | import numpy as np 6 | import ffmpeg 7 | 8 | 9 | class VideoLoader(Dataset): 10 | """Pytorch video loader.""" 11 | 12 | def __init__( 13 | self, 14 | csv, 15 | framerate=1, 16 | size=112, 17 | centercrop=False, 18 | ): 19 | """ 20 | Args: 21 | """ 22 | self.csv = pd.read_csv(csv) 23 | self.centercrop = centercrop 24 | self.size = size 25 | self.framerate = framerate 26 | 27 | def __len__(self): 28 | return len(self.csv) 29 | 30 | def _get_video_dim(self, video_path): 31 | probe = ffmpeg.probe(video_path) 32 | video_stream = next((stream for stream in probe['streams'] 33 | if stream['codec_type'] == 'video'), None) 34 | width = int(video_stream['width']) 35 | height = int(video_stream['height']) 36 | return height, width 37 | 38 | def _get_output_dim(self, h, w): 39 | if isinstance(self.size, tuple) and len(self.size) == 2: 40 | return self.size 41 | elif h >= w: 42 | return int(h * self.size / w), self.size 43 | else: 44 | return self.size, int(w * self.size / h) 45 | 46 | def __getitem__(self, idx): 47 | video_path = self.csv['video_path'].values[idx] 48 | output_file = self.csv['feature_path'].values[idx] 49 | 50 | if not(os.path.isfile(output_file)) and os.path.isfile(video_path): 51 | print('Decoding video: {}'.format(video_path)) 52 | try: 53 | h, w = self._get_video_dim(video_path) 54 | except: 55 | print('ffprobe failed at: {}'.format(video_path)) 56 | return {'video': th.zeros(1), 'input': video_path, 57 | 'output': output_file} 58 | height, width = self._get_output_dim(h, w) 59 | cmd = ( 60 | ffmpeg 61 | .input(video_path) 62 | .filter('fps', fps=self.framerate) 63 | .filter('scale', width, height) 64 | ) 65 | if self.centercrop: 66 | x = int((width - self.size) / 2.0) 67 | y = int((height - self.size) / 2.0) 68 | cmd = cmd.crop(x, y, self.size, self.size) 69 | out, _ = ( 70 | cmd.output('pipe:', format='rawvideo', pix_fmt='rgb24') 71 | .run(capture_stdout=True, quiet=True) 72 | ) 73 | if self.centercrop and isinstance(self.size, int): 74 | height, width = self.size, self.size 75 | video = np.frombuffer(out, np.uint8).reshape([-1, height, width, 3]) 76 | video = th.from_numpy(video.astype('float32')) 77 | video = video.permute(0, 3, 1, 2) 78 | else: 79 | video = th.zeros(1) 80 | 81 | return {'video': video, 'input': video_path, 'output': output_file} 82 | -------------------------------------------------------------------------------- /code/lecture_aware_embds/video_feature_extractor/videocnn/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | 103 | .DS_Store -------------------------------------------------------------------------------- /code/lecture_aware_embds/video_feature_extractor/videocnn/.opts.py.swp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Darshansingh11/AVLectures/d5452d90d29961f28a89c5d1ff7bef88c3f66ca0/code/lecture_aware_embds/video_feature_extractor/videocnn/.opts.py.swp -------------------------------------------------------------------------------- /code/lecture_aware_embds/video_feature_extractor/videocnn/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Kensho Hara 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /code/lecture_aware_embds/video_feature_extractor/videocnn/README.md: -------------------------------------------------------------------------------- 1 | # Video Classification Using 3D ResNet 2 | This is a pytorch code for video (action) classification using 3D ResNet trained by [this code](https://github.com/kenshohara/3D-ResNets-PyTorch). 3 | The 3D ResNet is trained on the Kinetics dataset, which includes 400 action classes. 4 | This code uses videos as inputs and outputs class names and predicted class scores for each 16 frames in the score mode. 
5 | In the feature mode, this code outputs features of 512 dims (after global average pooling) for each 16 frames. 6 | 7 | **Torch (Lua) version of this code is available [here](https://github.com/kenshohara/video-classification-3d-cnn).** 8 | 9 | ## Requirements 10 | * [PyTorch](http://pytorch.org/) 11 | ``` 12 | conda install pytorch torchvision cuda80 -c soumith 13 | ``` 14 | * FFmpeg, FFprobe 15 | ``` 16 | wget http://johnvansickle.com/ffmpeg/releases/ffmpeg-release-64bit-static.tar.xz 17 | tar xvf ffmpeg-release-64bit-static.tar.xz 18 | cd ./ffmpeg-3.3.3-64bit-static/; sudo cp ffmpeg ffprobe /usr/local/bin; 19 | ``` 20 | * Python 3 21 | 22 | ## Preparation 23 | * Download this code. 24 | * Download the [pretrained model](https://drive.google.com/drive/folders/1zvl89AgFAApbH0At-gMuZSeQB_LpNP-M?usp=sharing). 25 | * ResNeXt-101 achieved the best performance in our experiments. (See [paper](https://arxiv.org/abs/1711.09577) in details.) 26 | 27 | ## Usage 28 | Assume input video files are located in ```./videos```. 29 | 30 | To calculate class scores for each 16 frames, use ```--mode score```. 31 | ``` 32 | python main.py --input ./input --video_root ./videos --output ./output.json --model ./resnet-34-kinetics.pth --mode score 33 | ``` 34 | To visualize the classification results, use ```generate_result_video/generate_result_video.py```. 35 | 36 | To calculate video features for each 16 frames, use ```--mode feature```. 37 | ``` 38 | python main.py --input ./input --video_root ./videos --output ./output.json --model ./resnet-34-kinetics.pth --mode feature 39 | ``` 40 | 41 | 42 | ## Citation 43 | If you use this code, please cite the following: 44 | ``` 45 | @article{hara3dcnns, 46 | author={Kensho Hara and Hirokatsu Kataoka and Yutaka Satoh}, 47 | title={Can Spatiotemporal 3D CNNs Retrace the History of 2D CNNs and ImageNet?}, 48 | journal={arXiv preprint}, 49 | volume={arXiv:1711.09577}, 50 | year={2017}, 51 | } 52 | ``` 53 | -------------------------------------------------------------------------------- /code/lecture_aware_embds/video_feature_extractor/videocnn/classify.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.autograd import Variable 3 | 4 | from dataset import Video 5 | from spatial_transforms import (Compose, Normalize, Scale, CenterCrop, ToTensor) 6 | from temporal_transforms import LoopPadding 7 | 8 | def classify_video(video_dir, video_name, class_names, model, opt): 9 | assert opt.mode in ['score', 'feature'] 10 | 11 | spatial_transform = Compose([Scale(opt.sample_size), 12 | CenterCrop(opt.sample_size), 13 | ToTensor(), 14 | Normalize(opt.mean, [1, 1, 1])]) 15 | temporal_transform = LoopPadding(opt.sample_duration) 16 | data = Video(video_dir, spatial_transform=spatial_transform, 17 | temporal_transform=temporal_transform, 18 | sample_duration=opt.sample_duration) 19 | data_loader = torch.utils.data.DataLoader(data, batch_size=opt.batch_size, 20 | shuffle=False, num_workers=opt.n_threads, pin_memory=True) 21 | 22 | video_outputs = [] 23 | video_segments = [] 24 | for i, (inputs, segments) in enumerate(data_loader): 25 | inputs = Variable(inputs, volatile=True) 26 | outputs = model(inputs) 27 | 28 | video_outputs.append(outputs.cpu().data) 29 | video_segments.append(segments) 30 | 31 | video_outputs = torch.cat(video_outputs) 32 | video_segments = torch.cat(video_segments) 33 | results = { 34 | 'video': video_name, 35 | 'clips': [] 36 | } 37 | 38 | _, max_indices = video_outputs.max(dim=1) 39 | for i in 
range(video_outputs.size(0)): 40 | clip_results = { 41 | 'segment': video_segments[i].tolist(), 42 | } 43 | 44 | if opt.mode == 'score': 45 | clip_results['label'] = class_names[max_indices[i]] 46 | clip_results['scores'] = video_outputs[i].tolist() 47 | elif opt.mode == 'feature': 48 | clip_results['features'] = video_outputs[i].tolist() 49 | 50 | results['clips'].append(clip_results) 51 | 52 | return results 53 | -------------------------------------------------------------------------------- /code/lecture_aware_embds/video_feature_extractor/videocnn/dataset.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.utils.data as data 3 | from PIL import Image 4 | import os 5 | import math 6 | import functools 7 | import copy 8 | 9 | 10 | def pil_loader(path): 11 | # open path as file to avoid ResourceWarning (https://github.com/python-pillow/Pillow/issues/835) 12 | with open(path, 'rb') as f: 13 | with Image.open(f) as img: 14 | return img.convert('RGB') 15 | 16 | 17 | def accimage_loader(path): 18 | try: 19 | return accimage.Image(path) 20 | except IOError: 21 | # Potentially a decoding problem, fall back to PIL.Image 22 | return pil_loader(path) 23 | 24 | 25 | def get_default_image_loader(): 26 | from torchvision import get_image_backend 27 | if get_image_backend() == 'accimage': 28 | import accimage 29 | return accimage_loader 30 | else: 31 | return pil_loader 32 | 33 | 34 | def video_loader(video_dir_path, frame_indices, image_loader): 35 | video = [] 36 | for i in frame_indices: 37 | image_path = os.path.join(video_dir_path, 'image_{:05d}.jpg'.format(i)) 38 | if os.path.exists(image_path): 39 | video.append(image_loader(image_path)) 40 | else: 41 | return video 42 | 43 | return video 44 | 45 | 46 | def get_default_video_loader(): 47 | image_loader = get_default_image_loader() 48 | return functools.partial(video_loader, image_loader=image_loader) 49 | 50 | 51 | def load_annotation_data(data_file_path): 52 | with open(data_file_path, 'r') as data_file: 53 | return json.load(data_file) 54 | 55 | 56 | def get_class_labels(data): 57 | class_labels_map = {} 58 | index = 0 59 | for class_label in data['labels']: 60 | class_labels_map[class_label] = index 61 | index += 1 62 | return class_labels_map 63 | 64 | 65 | def get_video_names_and_annotations(data, subset): 66 | video_names = [] 67 | annotations = [] 68 | 69 | for key, value in data['database'].items(): 70 | this_subset = value['subset'] 71 | if this_subset == subset: 72 | if subset == 'testing': 73 | video_names.append('test/{}'.format(key)) 74 | else: 75 | label = value['annotations']['label'] 76 | video_names.append('{}/{}'.format(label, key)) 77 | annotations.append(value['annotations']) 78 | 79 | return video_names, annotations 80 | 81 | 82 | def make_dataset(video_path, sample_duration): 83 | dataset = [] 84 | 85 | n_frames = len(os.listdir(video_path)) 86 | 87 | begin_t = 1 88 | end_t = n_frames 89 | sample = { 90 | 'video': video_path, 91 | 'segment': [begin_t, end_t], 92 | 'n_frames': n_frames, 93 | } 94 | 95 | step = sample_duration 96 | for i in range(1, (n_frames - sample_duration + 1), step): 97 | sample_i = copy.deepcopy(sample) 98 | sample_i['frame_indices'] = list(range(i, i + sample_duration)) 99 | sample_i['segment'] = torch.IntTensor([i, i + sample_duration - 1]) 100 | dataset.append(sample_i) 101 | 102 | return dataset 103 | 104 | 105 | class Video(data.Dataset): 106 | def __init__(self, video_path, 107 | spatial_transform=None, temporal_transform=None, 
108 | sample_duration=16, get_loader=get_default_video_loader): 109 | self.data = make_dataset(video_path, sample_duration) 110 | 111 | self.spatial_transform = spatial_transform 112 | self.temporal_transform = temporal_transform 113 | self.loader = get_loader() 114 | 115 | def __getitem__(self, index): 116 | """ 117 | Args: 118 | index (int): Index 119 | Returns: 120 | tuple: (image, target) where target is class_index of the target class. 121 | """ 122 | path = self.data[index]['video'] 123 | 124 | frame_indices = self.data[index]['frame_indices'] 125 | if self.temporal_transform is not None: 126 | frame_indices = self.temporal_transform(frame_indices) 127 | clip = self.loader(path, frame_indices) 128 | if self.spatial_transform is not None: 129 | clip = [self.spatial_transform(img) for img in clip] 130 | clip = torch.stack(clip, 0).permute(1, 0, 2, 3) 131 | 132 | target = self.data[index]['segment'] 133 | 134 | return clip, target 135 | 136 | def __len__(self): 137 | return len(self.data) 138 | -------------------------------------------------------------------------------- /code/lecture_aware_embds/video_feature_extractor/videocnn/generate_result_video/README.md: -------------------------------------------------------------------------------- 1 | # Result Video Generation 2 | This is a code for generating videos of classification results. 3 | It uses both ```output.json``` and videos as inputs and draw predicted class names in each frame. 4 | 5 | ## Requirements 6 | * Python 3 7 | * Pillow 8 | * ffmpeg, ffprobe 9 | 10 | ## Usage 11 | To generate videos based on ```../output.json```, execute the following. 12 | ``` 13 | python generate_result_video.py ../output.json ../videos ./videos_pred ../class_names_list 5 14 | ``` 15 | The 2nd parameter (```../videos```) is the root directory of videos. 16 | The 3rd parameter (```./videos_pred```) is the directory path of output videos. 17 | The 5th parameter is a size of temporal unit. 18 | The CNN predicts class scores for a 16 frame clip. 19 | The code averages the scores over each unit. 20 | The size 5 means that it averages the scores over 5 clips (i.e. 16x5 frames). 21 | If you use the size as 0, the scores are averaged over all clips of a video. 
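For example, on a 30 fps source video a unit size of 5 therefore averages over 16 x 5 = 80 frames, i.e. roughly 2.7 seconds (illustrative; the exact duration depends on the video's frame rate).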
22 | -------------------------------------------------------------------------------- /code/lecture_aware_embds/video_feature_extractor/videocnn/generate_result_video/SourceSansPro-Regular.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Darshansingh11/AVLectures/d5452d90d29961f28a89c5d1ff7bef88c3f66ca0/code/lecture_aware_embds/video_feature_extractor/videocnn/generate_result_video/SourceSansPro-Regular.ttf -------------------------------------------------------------------------------- /code/lecture_aware_embds/video_feature_extractor/videocnn/generate_result_video/generate_result_video.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import json 4 | import subprocess 5 | import numpy as np 6 | from PIL import Image, ImageDraw, ImageFont 7 | 8 | 9 | def get_fps(video_file_path, frames_directory_path): 10 | p = subprocess.Popen('ffprobe {}'.format(video_file_path), 11 | shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) 12 | _, res = p.communicate() 13 | res = res.decode('utf-8') 14 | 15 | duration_index = res.find('Duration:') 16 | duration_str = res[(duration_index + 10):(duration_index + 21)] 17 | hour = float(duration_str[0:2]) 18 | minute = float(duration_str[3:5]) 19 | sec = float(duration_str[6:10]) 20 | total_sec = hour * 3600 + minute * 60 + sec 21 | 22 | n_frames = len(os.listdir(frames_directory_path)) 23 | fps = round(n_frames / total_sec, 2) 24 | return fps 25 | 26 | 27 | if __name__ == '__main__': 28 | result_json_path = sys.argv[1] 29 | video_root_path = sys.argv[2] 30 | dst_directory_path = sys.argv[3] 31 | if not os.path.exists(dst_directory_path): 32 | subprocess.call('mkdir -p {}'.format(dst_directory_path), shell=True) 33 | class_name_path = sys.argv[4] 34 | temporal_unit = int(sys.argv[5]) 35 | 36 | with open(result_json_path, 'r') as f: 37 | results = json.load(f) 38 | 39 | with open(class_name_path, 'r') as f: 40 | class_names = [] 41 | for row in f: 42 | class_names.append(row[:-1]) 43 | 44 | for index in range(len(results)): 45 | video_path = os.path.join(video_root_path, results[index]['video']) 46 | print(video_path) 47 | 48 | clips = results[index]['clips'] 49 | unit_classes = [] 50 | unit_segments = [] 51 | if temporal_unit == 0: 52 | unit = len(clips) 53 | else: 54 | unit = temporal_unit 55 | for i in range(0, len(clips), unit): 56 | n_elements = min(unit, len(clips) - i) 57 | scores = np.array(clips[i]['scores'])  # running sum of the clip scores in this unit 58 | for j in range(i + 1, min(i + unit, len(clips))): 59 | scores += np.array(clips[j]['scores']) 60 | scores /= n_elements 61 | unit_classes.append(class_names[np.argmax(scores)]) 62 | unit_segments.append([clips[i]['segment'][0], 63 | clips[i + n_elements - 1]['segment'][1]]) 64 | 65 | if os.path.exists('tmp'): 66 | subprocess.call('rm -rf tmp', shell=True) 67 | subprocess.call('mkdir tmp', shell=True) 68 | 69 | subprocess.call('ffmpeg -i {} tmp/image_%05d.jpg'.format(video_path), shell=True) 70 | 71 | fps = get_fps(video_path, 'tmp') 72 | 73 | for i in range(len(unit_classes)): 74 | for j in range(unit_segments[i][0], unit_segments[i][1] + 1): 75 | image = Image.open('tmp/image_{:05}.jpg'.format(j)).convert('RGB') 76 | min_length = min(image.size) 77 | font_size = int(min_length * 0.05) 78 | font = ImageFont.truetype(os.path.join(os.path.dirname(__file__), 79 | 'SourceSansPro-Regular.ttf'), 80 | font_size) 81 | d = ImageDraw.Draw(image) 82 | textsize = d.textsize(unit_classes[i], font=font) 83 | x = 
int(font_size * 0.5) 84 | y = int(font_size * 0.25) 85 | x_offset = x 86 | y_offset = y 87 | rect_position = (x, y, x + textsize[0] + x_offset * 2, 88 | y + textsize[1] + y_offset * 2) 89 | d.rectangle(rect_position, fill=(30, 30, 30)) 90 | d.text((x + x_offset, y + y_offset), unit_classes[i], 91 | font=font, fill=(235, 235, 235)) 92 | image.save('tmp/image_{:05}_pred.jpg'.format(j)) 93 | 94 | dst_file_path = os.path.join(dst_directory_path, video_path.split('/')[-1]) 95 | subprocess.call('ffmpeg -y -r {} -i tmp/image_%05d_pred.jpg -b:v 1000k {}'.format(fps, dst_file_path), 96 | shell=True) 97 | 98 | if os.path.exists('tmp'): 99 | subprocess.call('rm -rf tmp', shell=True) 100 | -------------------------------------------------------------------------------- /code/lecture_aware_embds/video_feature_extractor/videocnn/input: -------------------------------------------------------------------------------- 1 | video1.mp4 2 | video2.mp4 3 | -------------------------------------------------------------------------------- /code/lecture_aware_embds/video_feature_extractor/videocnn/main.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import json 4 | import subprocess 5 | import numpy as np 6 | import torch 7 | from torch import nn 8 | 9 | from opts import parse_opts 10 | from model import generate_model 11 | from mean import get_mean 12 | from classify import classify_video 13 | 14 | if __name__=="__main__": 15 | opt = parse_opts() 16 | opt.mean = get_mean() 17 | opt.arch = '{}-{}'.format(opt.model_name, opt.model_depth) 18 | opt.sample_size = 112 19 | opt.sample_duration = 16 20 | opt.n_classes = 400 21 | 22 | model = generate_model(opt) 23 | print('loading model {}'.format(opt.model)) 24 | model_data = torch.load(opt.model) 25 | assert opt.arch == model_data['arch'] 26 | model.load_state_dict(model_data['state_dict']) 27 | model.eval() 28 | if opt.verbose: 29 | print(model) 30 | 31 | input_files = [] 32 | with open(opt.input, 'r') as f: 33 | for row in f: 34 | input_files.append(row[:-1]) 35 | 36 | class_names = [] 37 | with open('class_names_list') as f: 38 | for row in f: 39 | class_names.append(row[:-1]) 40 | 41 | ffmpeg_loglevel = 'quiet' 42 | if opt.verbose: 43 | ffmpeg_loglevel = 'info' 44 | 45 | if os.path.exists('tmp'): 46 | subprocess.call('rm -rf tmp', shell=True) 47 | 48 | outputs = [] 49 | for input_file in input_files: 50 | video_path = os.path.join(opt.video_root, input_file) 51 | if os.path.exists(video_path): 52 | print(video_path) 53 | subprocess.call('mkdir tmp', shell=True) 54 | subprocess.call('ffmpeg -i {} tmp/image_%05d.jpg'.format(video_path), 55 | shell=True) 56 | 57 | result = classify_video('tmp', input_file, class_names, model, opt) 58 | outputs.append(result) 59 | 60 | subprocess.call('rm -rf tmp', shell=True) 61 | else: 62 | print('{} does not exist'.format(input_file)) 63 | 64 | if os.path.exists('tmp'): 65 | subprocess.call('rm -rf tmp', shell=True) 66 | 67 | with open(opt.output, 'w') as f: 68 | json.dump(outputs, f) 69 | -------------------------------------------------------------------------------- /code/lecture_aware_embds/video_feature_extractor/videocnn/mean.py: -------------------------------------------------------------------------------- 1 | def get_mean(): 2 | return [114.7748, 107.7354, 99.4750] 3 | -------------------------------------------------------------------------------- /code/lecture_aware_embds/video_feature_extractor/videocnn/models/densenet.py: 
-------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from collections import OrderedDict 5 | import math 6 | 7 | __all__ = ['DenseNet', 'densenet121', 'densenet169', 'densenet201', 'densenet264'] 8 | 9 | 10 | def densenet121(**kwargs): 11 | model = DenseNet(num_init_features=64, growth_rate=32, block_config=(6, 12, 24, 16), 12 | **kwargs) 13 | return model 14 | 15 | 16 | def densenet169(**kwargs): 17 | model = DenseNet(num_init_features=64, growth_rate=32, block_config=(6, 12, 32, 32), 18 | **kwargs) 19 | return model 20 | 21 | 22 | def densenet201(**kwargs): 23 | model = DenseNet(num_init_features=64, growth_rate=32, block_config=(6, 12, 48, 32), 24 | **kwargs) 25 | return model 26 | 27 | 28 | def densenet264(**kwargs): 29 | model = DenseNet(num_init_features=64, growth_rate=32, block_config=(6, 12, 64, 48), 30 | **kwargs) 31 | return model 32 | 33 | 34 | def get_fine_tuning_parameters(model, ft_begin_index): 35 | if ft_begin_index == 0: 36 | return model.parameters() 37 | 38 | ft_module_names = [] 39 | for i in range(ft_begin_index, 5): 40 | ft_module_names.append('denseblock{}'.format(ft_begin_index)) 41 | ft_module_names.append('transition{}'.format(ft_begin_index)) 42 | ft_module_names.append('norm5') 43 | ft_module_names.append('classifier') 44 | 45 | parameters = [] 46 | for k, v in model.named_parameters(): 47 | for ft_module in ft_module_names: 48 | if ft_module in k: 49 | parameters.append({'params': v}) 50 | break 51 | else: 52 | parameters.append({'params': v, 'lr': 0.0}) 53 | 54 | return parameters 55 | 56 | 57 | class _DenseLayer(nn.Sequential): 58 | def __init__(self, num_input_features, growth_rate, bn_size, drop_rate): 59 | super(_DenseLayer, self).__init__() 60 | self.add_module('norm.1', nn.BatchNorm3d(num_input_features)) 61 | self.add_module('relu.1', nn.ReLU(inplace=True)) 62 | self.add_module('conv.1', nn.Conv3d(num_input_features, bn_size * growth_rate, 63 | kernel_size=1, stride=1, bias=False)) 64 | self.add_module('norm.2', nn.BatchNorm3d(bn_size * growth_rate)) 65 | self.add_module('relu.2', nn.ReLU(inplace=True)) 66 | self.add_module('conv.2', nn.Conv3d(bn_size * growth_rate, growth_rate, 67 | kernel_size=3, stride=1, padding=1, bias=False)) 68 | self.drop_rate = drop_rate 69 | 70 | def forward(self, x): 71 | new_features = super(_DenseLayer, self).forward(x) 72 | if self.drop_rate > 0: 73 | new_features = F.dropout(new_features, p=self.drop_rate, training=self.training) 74 | return torch.cat([x, new_features], 1) 75 | 76 | 77 | class _DenseBlock(nn.Sequential): 78 | def __init__(self, num_layers, num_input_features, bn_size, growth_rate, drop_rate): 79 | super(_DenseBlock, self).__init__() 80 | for i in range(num_layers): 81 | layer = _DenseLayer(num_input_features + i * growth_rate, growth_rate, bn_size, drop_rate) 82 | self.add_module('denselayer%d' % (i + 1), layer) 83 | 84 | 85 | class _Transition(nn.Sequential): 86 | def __init__(self, num_input_features, num_output_features): 87 | super(_Transition, self).__init__() 88 | self.add_module('norm', nn.BatchNorm3d(num_input_features)) 89 | self.add_module('relu', nn.ReLU(inplace=True)) 90 | self.add_module('conv', nn.Conv3d(num_input_features, num_output_features, 91 | kernel_size=1, stride=1, bias=False)) 92 | self.add_module('pool', nn.AvgPool3d(kernel_size=2, stride=2)) 93 | 94 | 95 | class DenseNet(nn.Module): 96 | """Densenet-BC model class 97 | Args: 98 | growth_rate (int) - how many filters to 
add each layer (k in paper) 99 | block_config (list of 4 ints) - how many layers in each pooling block 100 | num_init_features (int) - the number of filters to learn in the first convolution layer 101 | bn_size (int) - multiplicative factor for number of bottle neck layers 102 | (i.e. bn_size * k features in the bottleneck layer) 103 | drop_rate (float) - dropout rate after each dense layer 104 | num_classes (int) - number of classification classes 105 | """ 106 | def __init__(self, sample_size, sample_duration, growth_rate=32, block_config=(6, 12, 24, 16), 107 | num_init_features=64, bn_size=4, drop_rate=0, num_classes=1000, last_fc=True): 108 | 109 | super(DenseNet, self).__init__() 110 | 111 | self.last_fc = last_fc 112 | 113 | self.sample_size = sample_size 114 | self.sample_duration = sample_duration 115 | 116 | # First convolution 117 | self.features = nn.Sequential(OrderedDict([ 118 | ('conv0', nn.Conv3d(3, num_init_features, kernel_size=7, 119 | stride=(1, 2, 2), padding=(3, 3, 3), bias=False)), 120 | ('norm0', nn.BatchNorm3d(num_init_features)), 121 | ('relu0', nn.ReLU(inplace=True)), 122 | ('pool0', nn.MaxPool3d(kernel_size=3, stride=2, padding=1)), 123 | ])) 124 | 125 | # Each denseblock 126 | num_features = num_init_features 127 | for i, num_layers in enumerate(block_config): 128 | block = _DenseBlock(num_layers=num_layers, num_input_features=num_features, 129 | bn_size=bn_size, growth_rate=growth_rate, drop_rate=drop_rate) 130 | self.features.add_module('denseblock%d' % (i + 1), block) 131 | num_features = num_features + num_layers * growth_rate 132 | if i != len(block_config) - 1: 133 | trans = _Transition(num_input_features=num_features, num_output_features=num_features // 2) 134 | self.features.add_module('transition%d' % (i + 1), trans) 135 | num_features = num_features // 2 136 | 137 | # Final batch norm 138 | self.features.add_module('norm5', nn.BatchNorm2d(num_features)) 139 | 140 | # Linear layer 141 | self.classifier = nn.Linear(num_features, num_classes) 142 | 143 | def forward(self, x): 144 | features = self.features(x) 145 | out = F.relu(features, inplace=True) 146 | last_duration = math.ceil(self.sample_duration / 16) 147 | last_size = math.floor(self.sample_size / 32) 148 | out = F.avg_pool3d(out, kernel_size=(last_duration, last_size, last_size)).view(features.size(0), -1) 149 | if self.last_fc: 150 | out = self.classifier(out) 151 | return out 152 | -------------------------------------------------------------------------------- /code/lecture_aware_embds/video_feature_extractor/videocnn/models/resnext.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.autograd import Variable 5 | import math 6 | from functools import partial 7 | 8 | __all__ = ['ResNeXt', 'resnet50', 'resnet101'] 9 | 10 | 11 | def conv3x3x3(in_planes, out_planes, stride=1): 12 | # 3x3x3 convolution with padding 13 | return nn.Conv3d(in_planes, out_planes, kernel_size=3, 14 | stride=stride, padding=1, bias=False) 15 | 16 | 17 | def downsample_basic_block(x, planes, stride): 18 | out = F.avg_pool3d(x, kernel_size=1, stride=stride) 19 | zero_pads = torch.Tensor(out.size(0), planes - out.size(1), 20 | out.size(2), out.size(3), 21 | out.size(4)).zero_() 22 | if isinstance(out.data, torch.cuda.FloatTensor): 23 | zero_pads = zero_pads.cuda() 24 | 25 | out = Variable(torch.cat([out.data, zero_pads], dim=1)) 26 | 27 | return out 28 | 29 | 30 | class ResNeXtBottleneck(nn.Module): 
31 | expansion = 2 32 | 33 | def __init__(self, inplanes, planes, cardinality, stride=1, downsample=None): 34 | super(ResNeXtBottleneck, self).__init__() 35 | mid_planes = cardinality * int(planes / 32) 36 | self.conv1 = nn.Conv3d(inplanes, mid_planes, kernel_size=1, bias=False) 37 | self.bn1 = nn.BatchNorm3d(mid_planes) 38 | self.conv2 = nn.Conv3d(mid_planes, mid_planes, kernel_size=3, stride=stride, 39 | padding=1, groups=cardinality, bias=False) 40 | self.bn2 = nn.BatchNorm3d(mid_planes) 41 | self.conv3 = nn.Conv3d(mid_planes, planes * self.expansion, kernel_size=1, bias=False) 42 | self.bn3 = nn.BatchNorm3d(planes * self.expansion) 43 | self.relu = nn.ReLU(inplace=True) 44 | self.downsample = downsample 45 | self.stride = stride 46 | 47 | def forward(self, x): 48 | residual = x 49 | 50 | out = self.conv1(x) 51 | out = self.bn1(out) 52 | out = self.relu(out) 53 | 54 | out = self.conv2(out) 55 | out = self.bn2(out) 56 | out = self.relu(out) 57 | 58 | out = self.conv3(out) 59 | out = self.bn3(out) 60 | 61 | if self.downsample is not None: 62 | residual = self.downsample(x) 63 | 64 | out += residual 65 | out = self.relu(out) 66 | 67 | return out 68 | 69 | 70 | class ResNeXt(nn.Module): 71 | 72 | def __init__(self, block, layers, sample_size, sample_duration, shortcut_type='B', cardinality=32, num_classes=400, last_fc=True): 73 | self.last_fc = last_fc 74 | 75 | self.inplanes = 64 76 | super(ResNeXt, self).__init__() 77 | self.conv1 = nn.Conv3d(3, 64, kernel_size=7, stride=(1, 2, 2), 78 | padding=(3, 3, 3), bias=False) 79 | self.bn1 = nn.BatchNorm3d(64) 80 | self.relu = nn.ReLU(inplace=True) 81 | self.maxpool = nn.MaxPool3d(kernel_size=(3, 3, 3), stride=2, padding=1) 82 | self.layer1 = self._make_layer(block, 128, layers[0], shortcut_type, cardinality) 83 | self.layer2 = self._make_layer(block, 256, layers[1], shortcut_type, cardinality, stride=2) 84 | self.layer3 = self._make_layer(block, 512, layers[2], shortcut_type, cardinality, stride=2) 85 | self.layer4 = self._make_layer(block, 1024, layers[3], shortcut_type, cardinality, stride=2) 86 | last_duration = math.ceil(sample_duration / 16) 87 | last_size = math.ceil(sample_size / 32) 88 | self.avgpool = nn.AvgPool3d((last_duration, last_size, last_size), stride=1) 89 | self.fc = nn.Linear(cardinality * 32 * block.expansion, num_classes) 90 | 91 | for m in self.modules(): 92 | if isinstance(m, nn.Conv3d): 93 | n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels 94 | m.weight.data.normal_(0, math.sqrt(2. 
/ n)) 95 | elif isinstance(m, nn.BatchNorm3d): 96 | m.weight.data.fill_(1) 97 | m.bias.data.zero_() 98 | 99 | def _make_layer(self, block, planes, blocks, shortcut_type, cardinality, stride=1): 100 | downsample = None 101 | if stride != 1 or self.inplanes != planes * block.expansion: 102 | if shortcut_type == 'A': 103 | downsample = partial(downsample_basic_block, 104 | planes=planes * block.expansion, 105 | stride=stride) 106 | else: 107 | downsample = nn.Sequential( 108 | nn.Conv3d(self.inplanes, planes * block.expansion, 109 | kernel_size=1, stride=stride, bias=False), 110 | nn.BatchNorm3d(planes * block.expansion) 111 | ) 112 | 113 | layers = [] 114 | layers.append(block(self.inplanes, planes, cardinality, stride, downsample)) 115 | self.inplanes = planes * block.expansion 116 | for i in range(1, blocks): 117 | layers.append(block(self.inplanes, planes, cardinality)) 118 | 119 | return nn.Sequential(*layers) 120 | 121 | def forward(self, x): 122 | x = self.conv1(x) 123 | x = self.bn1(x) 124 | x = self.relu(x) 125 | x = self.maxpool(x) 126 | 127 | x = self.layer1(x) 128 | x = self.layer2(x) 129 | x = self.layer3(x) 130 | x = self.layer4(x) 131 | 132 | x = self.avgpool(x) 133 | 134 | x = x.view(x.size(0), -1) 135 | if self.last_fc: 136 | x = self.fc(x) 137 | 138 | return x 139 | 140 | def get_fine_tuning_parameters(model, ft_begin_index): 141 | if ft_begin_index == 0: 142 | return model.parameters() 143 | 144 | ft_module_names = [] 145 | for i in range(ft_begin_index, 5): 146 | ft_module_names.append('layer{}'.format(ft_begin_index)) 147 | ft_module_names.append('fc') 148 | 149 | parameters = [] 150 | for k, v in model.named_parameters(): 151 | for ft_module in ft_module_names: 152 | if ft_module in k: 153 | parameters.append({'params': v}) 154 | break 155 | else: 156 | parameters.append({'params': v, 'lr': 0.0}) 157 | 158 | return parameters 159 | 160 | def resnet50(**kwargs): 161 | """Constructs a ResNet-50 model. 162 | """ 163 | model = ResNeXt(ResNeXtBottleneck, [3, 4, 6, 3], **kwargs) 164 | return model 165 | 166 | def resnet101(**kwargs): 167 | """Constructs a ResNet-101 model. 168 | """ 169 | model = ResNeXt(ResNeXtBottleneck, [3, 4, 23, 3], **kwargs) 170 | return model 171 | 172 | def resnet152(**kwargs): 173 | """Constructs a ResNet-101 model. 
174 | """ 175 | model = ResNeXt(ResNeXtBottleneck, [3, 8, 36, 3], **kwargs) 176 | return model 177 | -------------------------------------------------------------------------------- /code/lecture_aware_embds/video_feature_extractor/videocnn/models/wide_resnet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.autograd import Variable 5 | import math 6 | from functools import partial 7 | 8 | __all__ = ['WideResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101'] 9 | 10 | 11 | def conv3x3x3(in_planes, out_planes, stride=1): 12 | # 3x3x3 convolution with padding 13 | return nn.Conv3d(in_planes, out_planes, kernel_size=3, 14 | stride=stride, padding=1, bias=False) 15 | 16 | 17 | def downsample_basic_block(x, planes, stride): 18 | out = F.avg_pool3d(x, kernel_size=1, stride=stride) 19 | zero_pads = torch.Tensor(out.size(0), planes - out.size(1), 20 | out.size(2), out.size(3), 21 | out.size(4)).zero_() 22 | if isinstance(out.data, torch.cuda.FloatTensor): 23 | zero_pads = zero_pads.cuda() 24 | 25 | out = Variable(torch.cat([out.data, zero_pads], dim=1)) 26 | 27 | return out 28 | 29 | 30 | class WideBottleneck(nn.Module): 31 | expansion = 2 32 | 33 | def __init__(self, inplanes, planes, stride=1, downsample=None): 34 | super(WideBottleneck, self).__init__() 35 | self.conv1 = nn.Conv3d(inplanes, planes, kernel_size=1, bias=False) 36 | self.bn1 = nn.BatchNorm3d(planes) 37 | self.conv2 = nn.Conv3d(planes, planes, kernel_size=3, stride=stride, 38 | padding=1, bias=False) 39 | self.bn2 = nn.BatchNorm3d(planes) 40 | self.conv3 = nn.Conv3d(planes, planes * self.expansion, kernel_size=1, bias=False) 41 | self.bn3 = nn.BatchNorm3d(planes * self.expansion) 42 | self.relu = nn.ReLU(inplace=True) 43 | self.downsample = downsample 44 | self.stride = stride 45 | 46 | def forward(self, x): 47 | residual = x 48 | 49 | out = self.conv1(x) 50 | out = self.bn1(out) 51 | out = self.relu(out) 52 | 53 | out = self.conv2(out) 54 | out = self.bn2(out) 55 | out = self.relu(out) 56 | 57 | out = self.conv3(out) 58 | out = self.bn3(out) 59 | 60 | if self.downsample is not None: 61 | residual = self.downsample(x) 62 | 63 | out += residual 64 | out = self.relu(out) 65 | 66 | return out 67 | 68 | 69 | class WideResNet(nn.Module): 70 | 71 | def __init__(self, block, layers, sample_size, sample_duration, k=1, shortcut_type='B', num_classes=400, last_fc=True): 72 | self.last_fc = last_fc 73 | 74 | self.inplanes = 64 75 | super(WideResNet, self).__init__() 76 | self.conv1 = nn.Conv3d(3, 64, kernel_size=7, stride=(1, 2, 2), 77 | padding=(3, 3, 3), bias=False) 78 | self.bn1 = nn.BatchNorm3d(64) 79 | self.relu = nn.ReLU(inplace=True) 80 | self.maxpool = nn.MaxPool3d(kernel_size=(3, 3, 3), stride=2, padding=1) 81 | self.layer1 = self._make_layer(block, 64 * k, layers[0], shortcut_type) 82 | self.layer2 = self._make_layer(block, 128 * k, layers[1], shortcut_type, stride=2) 83 | self.layer3 = self._make_layer(block, 256 * k, layers[2], shortcut_type, stride=2) 84 | self.layer4 = self._make_layer(block, 512 * k, layers[3], shortcut_type, stride=2) 85 | last_duration = math.ceil(sample_duration / 16) 86 | last_size = math.ceil(sample_size / 32) 87 | self.avgpool = nn.AvgPool3d((last_duration, last_size, last_size), stride=1) 88 | self.fc = nn.Linear(512 * k * block.expansion, num_classes) 89 | 90 | for m in self.modules(): 91 | if isinstance(m, nn.Conv3d): 92 | n = m.kernel_size[0] * m.kernel_size[1] * 
m.out_channels 93 | m.weight.data.normal_(0, math.sqrt(2. / n)) 94 | elif isinstance(m, nn.BatchNorm3d): 95 | m.weight.data.fill_(1) 96 | m.bias.data.zero_() 97 | 98 | def _make_layer(self, block, planes, blocks, shortcut_type, stride=1): 99 | downsample = None 100 | if stride != 1 or self.inplanes != planes * block.expansion: 101 | if shortcut_type == 'A': 102 | downsample = partial(downsample_basic_block, 103 | planes=planes * block.expansion, 104 | stride=stride) 105 | else: 106 | downsample = nn.Sequential( 107 | nn.Conv3d(self.inplanes, planes * block.expansion, 108 | kernel_size=1, stride=stride, bias=False), 109 | nn.BatchNorm3d(planes * block.expansion) 110 | ) 111 | 112 | layers = [] 113 | layers.append(block(self.inplanes, planes, stride, downsample)) 114 | self.inplanes = planes * block.expansion 115 | for i in range(1, blocks): 116 | layers.append(block(self.inplanes, planes)) 117 | 118 | return nn.Sequential(*layers) 119 | 120 | def forward(self, x): 121 | x = self.conv1(x) 122 | x = self.bn1(x) 123 | x = self.relu(x) 124 | x = self.maxpool(x) 125 | 126 | x = self.layer1(x) 127 | x = self.layer2(x) 128 | x = self.layer3(x) 129 | x = self.layer4(x) 130 | 131 | x = self.avgpool(x) 132 | 133 | x = x.view(x.size(0), -1) 134 | if self.last_fc: 135 | x = self.fc(x) 136 | 137 | return x 138 | 139 | def get_fine_tuning_parameters(model, ft_begin_index): 140 | if ft_begin_index == 0: 141 | return model.parameters() 142 | 143 | ft_module_names = [] 144 | for i in range(ft_begin_index, 5): 145 | ft_module_names.append('layer{}'.format(ft_begin_index)) 146 | ft_module_names.append('fc') 147 | 148 | parameters = [] 149 | for k, v in model.named_parameters(): 150 | for ft_module in ft_module_names: 151 | if ft_module in k: 152 | parameters.append({'params': v}) 153 | break 154 | else: 155 | parameters.append({'params': v, 'lr': 0.0}) 156 | 157 | return parameters 158 | 159 | def resnet50(**kwargs): 160 | """Constructs a ResNet-50 model. 161 | """ 162 | model = WideResNet(WideBottleneck, [3, 4, 6, 3], **kwargs) 163 | return model 164 | -------------------------------------------------------------------------------- /code/lecture_aware_embds/video_feature_extractor/videocnn/opts.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | def parse_opts(): 4 | parser = argparse.ArgumentParser() 5 | parser.add_argument('--input', default='input', type=str, help='Input file path') 6 | parser.add_argument('--video_root', default='', type=str, help='Root path of input videos') 7 | parser.add_argument('--model', default='', type=str, help='Model file path') 8 | parser.add_argument('--output', default='output.json', type=str, help='Output file path') 9 | parser.add_argument('--mode', default='score', type=str, help='Mode (score | feature). score outputs class scores. 
feature outputs features (after global average pooling).') 10 | parser.add_argument('--batch_size', default=32, type=int, help='Batch Size') 11 | parser.add_argument('--n_threads', default=4, type=int, help='Number of threads for multi-thread loading') 12 | parser.add_argument('--model_name', default='resnet', type=str, help='Currently only support resnet') 13 | parser.add_argument('--model_depth', default=34, type=int, help='Depth of resnet (10 | 18 | 34 | 50 | 101)') 14 | parser.add_argument('--resnet_shortcut', default='A', type=str, help='Shortcut type of resnet (A | B)') 15 | parser.add_argument('--wide_resnet_k', default=2, type=int, help='Wide resnet k') 16 | parser.add_argument('--resnext_cardinality', default=32, type=int, help='ResNeXt cardinality') 17 | parser.add_argument('--no_cuda', action='store_true', help='If true, cuda is not used.') 18 | parser.set_defaults(verbose=False) 19 | parser.add_argument('--verbose', action='store_true', help='') 20 | parser.set_defaults(verbose=False) 21 | 22 | args = parser.parse_args() 23 | 24 | return args 25 | -------------------------------------------------------------------------------- /code/lecture_aware_embds/video_feature_extractor/videocnn/spatial_transforms.py: -------------------------------------------------------------------------------- 1 | import random 2 | import math 3 | import numbers 4 | import collections 5 | import numpy as np 6 | import torch 7 | from PIL import Image, ImageOps 8 | try: 9 | import accimage 10 | except ImportError: 11 | accimage = None 12 | 13 | 14 | class Compose(object): 15 | """Composes several transforms together. 16 | Args: 17 | transforms (list of ``Transform`` objects): list of transforms to compose. 18 | Example: 19 | >>> transforms.Compose([ 20 | >>> transforms.CenterCrop(10), 21 | >>> transforms.ToTensor(), 22 | >>> ]) 23 | """ 24 | 25 | def __init__(self, transforms): 26 | self.transforms = transforms 27 | 28 | def __call__(self, img): 29 | for t in self.transforms: 30 | img = t(img) 31 | return img 32 | 33 | 34 | class ToTensor(object): 35 | """Convert a ``PIL.Image`` or ``numpy.ndarray`` to tensor. 36 | Converts a PIL.Image or numpy.ndarray (H x W x C) in the range 37 | [0, 255] to a torch.FloatTensor of shape (C x H x W) in the range [0.0, 1.0]. 38 | """ 39 | 40 | def __call__(self, pic): 41 | """ 42 | Args: 43 | pic (PIL.Image or numpy.ndarray): Image to be converted to tensor. 44 | Returns: 45 | Tensor: Converted image. 
46 | """ 47 | if isinstance(pic, np.ndarray): 48 | # handle numpy array 49 | img = torch.from_numpy(pic.transpose((2, 0, 1))) 50 | # backward compatibility 51 | return img.float() 52 | 53 | if accimage is not None and isinstance(pic, accimage.Image): 54 | nppic = np.zeros([pic.channels, pic.height, pic.width], dtype=np.float32) 55 | pic.copyto(nppic) 56 | return torch.from_numpy(nppic) 57 | 58 | # handle PIL Image 59 | if pic.mode == 'I': 60 | img = torch.from_numpy(np.array(pic, np.int32, copy=False)) 61 | elif pic.mode == 'I;16': 62 | img = torch.from_numpy(np.array(pic, np.int16, copy=False)) 63 | else: 64 | img = torch.ByteTensor(torch.ByteStorage.from_buffer(pic.tobytes())) 65 | # PIL image mode: 1, L, P, I, F, RGB, YCbCr, RGBA, CMYK 66 | if pic.mode == 'YCbCr': 67 | nchannel = 3 68 | elif pic.mode == 'I;16': 69 | nchannel = 1 70 | else: 71 | nchannel = len(pic.mode) 72 | img = img.view(pic.size[1], pic.size[0], nchannel) 73 | # put it from HWC to CHW format 74 | # yikes, this transpose takes 80% of the loading time/CPU 75 | img = img.transpose(0, 1).transpose(0, 2).contiguous() 76 | if isinstance(img, torch.ByteTensor): 77 | return img.float() 78 | else: 79 | return img 80 | 81 | 82 | class Normalize(object): 83 | """Normalize an tensor image with mean and standard deviation. 84 | Given mean: (R, G, B) and std: (R, G, B), 85 | will normalize each channel of the torch.*Tensor, i.e. 86 | channel = (channel - mean) / std 87 | Args: 88 | mean (sequence): Sequence of means for R, G, B channels respecitvely. 89 | std (sequence): Sequence of standard deviations for R, G, B channels 90 | respecitvely. 91 | """ 92 | 93 | def __init__(self, mean, std): 94 | self.mean = mean 95 | self.std = std 96 | 97 | def __call__(self, tensor): 98 | """ 99 | Args: 100 | tensor (Tensor): Tensor image of size (C, H, W) to be normalized. 101 | Returns: 102 | Tensor: Normalized image. 103 | """ 104 | # TODO: make efficient 105 | for t, m, s in zip(tensor, self.mean, self.std): 106 | t.sub_(m).div_(s) 107 | return tensor 108 | 109 | 110 | class Scale(object): 111 | """Rescale the input PIL.Image to the given size. 112 | Args: 113 | size (sequence or int): Desired output size. If size is a sequence like 114 | (w, h), output size will be matched to this. If size is an int, 115 | smaller edge of the image will be matched to this number. 116 | i.e, if height > width, then image will be rescaled to 117 | (size * height / width, size) 118 | interpolation (int, optional): Desired interpolation. Default is 119 | ``PIL.Image.BILINEAR`` 120 | """ 121 | 122 | def __init__(self, size, interpolation=Image.BILINEAR): 123 | assert isinstance(size, int) or (isinstance(size, collections.Iterable) and len(size) == 2) 124 | self.size = size 125 | self.interpolation = interpolation 126 | 127 | def __call__(self, img): 128 | """ 129 | Args: 130 | img (PIL.Image): Image to be scaled. 131 | Returns: 132 | PIL.Image: Rescaled image. 133 | """ 134 | if isinstance(self.size, int): 135 | w, h = img.size 136 | if (w <= h and w == self.size) or (h <= w and h == self.size): 137 | return img 138 | if w < h: 139 | ow = self.size 140 | oh = int(self.size * h / w) 141 | return img.resize((ow, oh), self.interpolation) 142 | else: 143 | oh = self.size 144 | ow = int(self.size * w / h) 145 | return img.resize((ow, oh), self.interpolation) 146 | else: 147 | return img.resize(self.size, self.interpolation) 148 | 149 | 150 | class CenterCrop(object): 151 | """Crops the given PIL.Image at the center. 
152 | Args: 153 | size (sequence or int): Desired output size of the crop. If size is an 154 | int instead of sequence like (h, w), a square crop (size, size) is 155 | made. 156 | """ 157 | 158 | def __init__(self, size): 159 | if isinstance(size, numbers.Number): 160 | self.size = (int(size), int(size)) 161 | else: 162 | self.size = size 163 | 164 | def __call__(self, img): 165 | """ 166 | Args: 167 | img (PIL.Image): Image to be cropped. 168 | Returns: 169 | PIL.Image: Cropped image. 170 | """ 171 | w, h = img.size 172 | th, tw = self.size 173 | x1 = int(round((w - tw) / 2.)) 174 | y1 = int(round((h - th) / 2.)) 175 | return img.crop((x1, y1, x1 + tw, y1 + th)) 176 | -------------------------------------------------------------------------------- /code/lecture_aware_embds/video_feature_extractor/videocnn/temporal_transforms.py: -------------------------------------------------------------------------------- 1 | import random 2 | import math 3 | 4 | 5 | class LoopPadding(object): 6 | def __init__(self, size): 7 | self.size = size 8 | 9 | def __call__(self, frame_indices): 10 | out = frame_indices 11 | 12 | for index in out: 13 | if len(out) >= self.size: 14 | break 15 | out.append(index) 16 | 17 | return out 18 | 19 | 20 | class TemporalCenterCrop(object): 21 | """Temporally crop the given frame indices at a center. 22 | 23 | If the number of frames is less than the size, 24 | loop the indices as many times as necessary to satisfy the size. 25 | 26 | Args: 27 | size (int): Desired output size of the crop. 28 | """ 29 | 30 | def __init__(self, size): 31 | self.size = size 32 | 33 | def __call__(self, frame_indices): 34 | """ 35 | Args: 36 | frame_indices (list): frame indices to be cropped. 37 | Returns: 38 | list: Cropped frame indices. 39 | """ 40 | 41 | center_index = len(frame_indices) // 2 42 | begin_index = max(0, center_index - (self.size // 2)) 43 | end_index = min(begin_index + self.size, len(frame_indices)) 44 | 45 | out = frame_indices[begin_index:end_index] 46 | 47 | for index in out: 48 | if len(out) >= self.size: 49 | break 50 | out.append(index) 51 | 52 | return out 53 | -------------------------------------------------------------------------------- /code/lecture_aware_embds/video_feature_extractor/videocnn/test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.autograd import Variable 3 | import time 4 | import os 5 | import sys 6 | import json 7 | 8 | from utils import AverageMeter 9 | 10 | 11 | def calculate_video_results(output_buffer, video_id, test_results, class_names): 12 | video_outputs = torch.stack(output_buffer) 13 | average_scores = torch.mean(video_outputs, dim=0) 14 | sorted_scores, locs = torch.topk(average_scores, k=10) 15 | 16 | video_results = [] 17 | for i in range(sorted_scores.size(0)): 18 | video_results.append({'label': class_names[locs[i]], 'score': sorted_scores[i]}) 19 | 20 | test_results['results'][video_id] = video_results 21 | 22 | 23 | def test(data_loader, model, opt, class_names): 24 | print('test') 25 | 26 | model.eval() 27 | 28 | batch_time = AverageMeter() 29 | data_time = AverageMeter() 30 | 31 | end_time = time.time() 32 | output_buffer = [] 33 | previous_video_id = '' 34 | test_results = {'results': {}} 35 | for i, (inputs, targets) in enumerate(data_loader): 36 | data_time.update(time.time() - end_time) 37 | 38 | inputs = Variable(inputs, volatile=True) 39 | outputs = model(inputs) 40 | 41 | for j in range(outputs.size(0)): 42 | if not (i == 0 and j == 0) and 
targets[j] != previous_video_id: 43 | calculate_video_results(output_buffer, previous_video_id, 44 | test_results, class_names) 45 | output_buffer = [] 46 | output_buffer.append(outputs[j].data.cpu()) 47 | previous_video_id = targets[j] 48 | 49 | if (i % 100) == 0: 50 | with open(os.path.join(opt.result_path, 51 | '{}.json'.format(opt.test_subset)), 52 | 'w') as f: 53 | json.dump(test_results, f) 54 | 55 | batch_time.update(time.time() - end_time) 56 | end_time = time.time() 57 | 58 | print('[{}/{}]\t' 59 | 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' 60 | 'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'.format( 61 | i + 1, len(data_loader), batch_time=batch_time, data_time=data_time)) 62 | with open(os.path.join(opt.result_path, 63 | '{}.json'.format(opt.test_subset)), 64 | 'w') as f: 65 | json.dump(test_results, f) 66 | -------------------------------------------------------------------------------- /code/lecture_aware_embds/video_feature_extractor/videocnn/train.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.autograd import Variable 3 | import time 4 | import os 5 | import sys 6 | 7 | from utils import AverageMeter, calculate_accuracy 8 | 9 | 10 | def train_epoch(epoch, data_loader, model, criterion, optimizer, opt, 11 | epoch_logger, batch_logger): 12 | print('train at epoch {}'.format(epoch)) 13 | 14 | model.train() 15 | 16 | batch_time = AverageMeter() 17 | data_time = AverageMeter() 18 | losses = AverageMeter() 19 | accuracies = AverageMeter() 20 | 21 | end_time = time.time() 22 | for i, (inputs, targets) in enumerate(data_loader): 23 | data_time.update(time.time() - end_time) 24 | 25 | if not opt.no_cuda: 26 | targets = targets.cuda(async=True) 27 | inputs = Variable(inputs) 28 | targets = Variable(targets) 29 | outputs = model(inputs) 30 | loss = criterion(outputs, targets) 31 | acc = calculate_accuracy(outputs, targets) 32 | 33 | losses.update(loss.data[0], inputs.size(0)) 34 | accuracies.update(acc, inputs.size(0)) 35 | 36 | optimizer.zero_grad() 37 | loss.backward() 38 | optimizer.step() 39 | 40 | batch_time.update(time.time() - end_time) 41 | end_time = time.time() 42 | 43 | batch_logger.log({ 44 | 'epoch': epoch, 45 | 'batch': i + 1, 46 | 'iter': (epoch - 1) * len(data_loader) + (i + 1), 47 | 'loss': losses.val, 48 | 'acc': accuracies.val, 49 | 'lr': optimizer.param_groups[0]['lr'] 50 | }) 51 | 52 | print('Epoch: [{0}][{1}/{2}]\t' 53 | 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' 54 | 'Data {data_time.val:.3f} ({data_time.avg:.3f})\t' 55 | 'Loss {loss.val:.4f} ({loss.avg:.4f})\t' 56 | 'Acc {acc.val:.3f} ({acc.avg:.3f})'.format( 57 | epoch, i + 1, len(data_loader), batch_time=batch_time, 58 | data_time=data_time, loss=losses, acc=accuracies)) 59 | 60 | epoch_logger.log({ 61 | 'epoch': epoch, 62 | 'loss': losses.avg, 63 | 'acc': accuracies.avg, 64 | 'lr': optimizer.param_groups[0]['lr'] 65 | }) 66 | 67 | if epoch % opt.checkpoint == 0: 68 | save_file_path = os.path.join(opt.result_path, 'save_{}.pth'.format(epoch)) 69 | states = { 70 | 'epoch': epoch + 1, 71 | 'arch': opt.arch, 72 | 'state_dict': model.state_dict(), 73 | 'optimizer' : optimizer.state_dict(), 74 | } 75 | torch.save(states, save_file_path) 76 | -------------------------------------------------------------------------------- /code/lecture_aware_embds/video_feature_extractor/videocnn/validation.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.autograd import 
Variable 3 | import time 4 | import sys 5 | 6 | from utils import AverageMeter, calculate_accuracy 7 | 8 | 9 | def val_epoch(epoch, data_loader, model, criterion, opt, logger): 10 | print('validation at epoch {}'.format(epoch)) 11 | 12 | model.eval() 13 | 14 | batch_time = AverageMeter() 15 | data_time = AverageMeter() 16 | losses = AverageMeter() 17 | accuracies = AverageMeter() 18 | 19 | end_time = time.time() 20 | for i, (inputs, targets) in enumerate(data_loader): 21 | data_time.update(time.time() - end_time) 22 | 23 | if not opt.no_cuda: 24 | targets = targets.cuda(async=True) 25 | inputs = Variable(inputs, volatile=True) 26 | targets = Variable(targets, volatile=True) 27 | outputs = model(inputs) 28 | loss = criterion(outputs, targets) 29 | acc = calculate_accuracy(outputs, targets) 30 | 31 | losses.update(loss.data[0], inputs.size(0)) 32 | accuracies.update(acc, inputs.size(0)) 33 | 34 | batch_time.update(time.time() - end_time) 35 | end_time = time.time() 36 | 37 | print('Epoch: [{0}][{1}/{2}]\t' 38 | 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' 39 | 'Data {data_time.val:.3f} ({data_time.avg:.3f})\t' 40 | 'Loss {loss.val:.4f} ({loss.avg:.4f})\t' 41 | 'Acc {acc.val:.3f} ({acc.avg:.3f})'.format( 42 | epoch, i + 1, len(data_loader), batch_time=batch_time, 43 | data_time=data_time, loss=losses, acc=accuracies)) 44 | 45 | logger.log({ 46 | 'epoch': epoch, 47 | 'loss': losses.avg, 48 | 'acc': accuracies.avg 49 | }) 50 | 51 | return losses.avg 52 | -------------------------------------------------------------------------------- /code/lecture_aware_embds/we_embd.py: -------------------------------------------------------------------------------- 1 | from gensim.models.keyedvectors import KeyedVectors 2 | 3 | import torch as th 4 | from torch.utils.data import Dataset 5 | import pickle 6 | import torch.nn.functional as F 7 | import numpy as np 8 | import re 9 | from collections import defaultdict 10 | from torch.utils.data.dataloader import default_collate 11 | 12 | from stop_words import ENGLISH_STOP_WORDS 13 | 14 | we_dim = 300 15 | max_words = 50 16 | 17 | we = KeyedVectors.load_word2vec_format('/ssd_scratch/cvit/darshan/data/GoogleNews-vectors-negative300.bin', binary=True) 18 | 19 | 20 | def _zero_pad_tensor(tensor, size): 21 | if len(tensor) >= size: 22 | return tensor[:size] 23 | else: 24 | zero = np.zeros((size - len(tensor), we_dim), dtype=np.float32) 25 | return np.concatenate((tensor, zero), axis=0) 26 | 27 | 28 | def _tokenize_text(sentence): 29 | w = re.findall(r"[\w']+", str(sentence)) 30 | return w 31 | 32 | def _words_to_we(words): 33 | # words = [word for word in words if word in self.we.vocab] 34 | words = list(map(lambda word: word.lower(), words)) 35 | words = [word for word in words if (word in we.vocab) and (word not in ENGLISH_STOP_WORDS)] 36 | if words: 37 | we_t = _zero_pad_tensor(we[words], max_words) 38 | return th.from_numpy(we_t) 39 | else: 40 | return th.zeros(max_words, we_dim) 41 | 42 | 43 | cap = "Time is suspect Train arrives at 7 o'clock Two simultaneous events Do two different observer agree? NO! 
Relativity of simultaneity" 44 | 45 | caption = _words_to_we(_tokenize_text(cap)) 46 | 47 | print(cap, caption.shape) -------------------------------------------------------------------------------- /figures/AVLectures_stats.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Darshansingh11/AVLectures/d5452d90d29961f28a89c5d1ff7bef88c3f66ca0/figures/AVLectures_stats.jpg --------------------------------------------------------------------------------