├── README.md
├── c3d_model.py
├── crop_mean.npy
├── list
│   └── classInd.txt
├── master
├── real_time_c3d.py
├── real_time_input_data.py
├── resources
│   ├── test_1.jpg
│   ├── test_2_gif.gif
│   └── test_gif.gif
└── test_video
    ├── v_PlayingGuitar_g16_c02.avi
    └── v_PushUps_g02_c03.avi

/README.md:
--------------------------------------------------------------------------------
# real-time action recognition

## example

### On CPU:
![](https://github.com/FingerRec/real_time_video_action_recognition/raw/master/resources/test_gif.gif)
![](https://github.com/FingerRec/real_time_video_action_recognition/raw/master/resources/test_2_gif.gif)

**When run on a GPU there is no delay.**

The full video can't be shown here; below is a captured frame.

![](https://github.com/FingerRec/real_time_video_action_recognition/raw/master/resources/test_1.jpg)

## requirements
* tensorflow 1.2+
* opencv 3.x
* pillow
* scipy
* python 3+

## run

```bash
python real_time_c3d.py
```

Two test videos are provided in the directory test_video/. Videos can be merged for free [here](https://www.aconvert.com/cn/video/merge/).

The code runs directly on a CPU, but with some delay. With a **GPU** it performs real-time recognition smoothly.
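For reference, a typical session looks like this (illustrative; the script prompts for a video path and opens an OpenCV window named `video`):

```bash
$ python real_time_c3d.py
please input the video path to be classified: test_video/v_PlayingGuitar_g16_c02.avi
# An OpenCV window opens and overlays the current top-5 class scores
# on each frame; press Esc to close it.
```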
21 | """ 22 | 23 | import tensorflow as tf 24 | 25 | # The UCF-101 dataset has 101 classes 26 | NUM_CLASSES = 101 27 | 28 | # Images are cropped to (CROP_SIZE, CROP_SIZE) 29 | CROP_SIZE = 112 30 | CHANNELS = 3 31 | 32 | # Number of frames per video clip 33 | NUM_FRAMES_PER_CLIP = 16 34 | 35 | "-----------------------------------------------------------------------------------------------------------------------" 36 | 37 | def conv3d(name, l_input, w, b): 38 | return tf.nn.bias_add( 39 | tf.nn.conv3d(l_input, w, strides=[1, 1, 1, 1, 1], padding='SAME'), 40 | b 41 | ) 42 | 43 | def max_pool(name, l_input, k): 44 | return tf.nn.max_pool3d(l_input, ksize=[1, k, 2, 2, 1], strides=[1, k, 2, 2, 1], padding='SAME', name=name) 45 | 46 | def inference_c3d(_X, _dropout, batch_size, _weights, _biases): 47 | 48 | # Convolution Layer 49 | conv1 = conv3d('conv1', _X, _weights['wc1'], _biases['bc1']) 50 | conv1 = tf.nn.relu(conv1, 'relu1') 51 | pool1 = max_pool('pool1', conv1, k=1) 52 | 53 | # Convolution Layer 54 | conv2 = conv3d('conv2', pool1, _weights['wc2'], _biases['bc2']) 55 | conv2 = tf.nn.relu(conv2, 'relu2') 56 | pool2 = max_pool('pool2', conv2, k=2) 57 | 58 | # Convolution Layer 59 | conv3 = conv3d('conv3a', pool2, _weights['wc3a'], _biases['bc3a']) 60 | conv3 = tf.nn.relu(conv3, 'relu3a') 61 | conv3 = conv3d('conv3b', conv3, _weights['wc3b'], _biases['bc3b']) 62 | conv3 = tf.nn.relu(conv3, 'relu3b') 63 | pool3 = max_pool('pool3', conv3, k=2) 64 | 65 | # Convolution Layer 66 | conv4 = conv3d('conv4a', pool3, _weights['wc4a'], _biases['bc4a']) 67 | conv4 = tf.nn.relu(conv4, 'relu4a') 68 | conv4 = conv3d('conv4b', conv4, _weights['wc4b'], _biases['bc4b']) 69 | conv4 = tf.nn.relu(conv4, 'relu4b') 70 | pool4 = max_pool('pool4', conv4, k=2) 71 | 72 | # Convolution Layer 73 | conv5 = conv3d('conv5a', pool4, _weights['wc5a'], _biases['bc5a']) 74 | conv5 = tf.nn.relu(conv5, 'relu5a') 75 | conv5 = conv3d('conv5b', conv5, _weights['wc5b'], _biases['bc5b']) 76 | conv5 = tf.nn.relu(conv5, 'relu5b') 77 | pool5 = max_pool('pool5', conv5, k=2) 78 | 79 | # Fully connected layer 80 | pool5 = tf.transpose(pool5, perm=[0,1,4,2,3]) 81 | dense1 = tf.reshape(pool5, [batch_size, _weights['wd1'].get_shape().as_list()[0]]) # Reshape conv3 output to fit dense layer input 82 | dense1 = tf.matmul(dense1, _weights['wd1']) + _biases['bd1'] 83 | 84 | dense1 = tf.nn.relu(dense1, name='fc1') # Relu activation 85 | dense1 = tf.nn.dropout(dense1, _dropout) 86 | 87 | dense2 = tf.nn.relu(tf.matmul(dense1, _weights['wd2']) + _biases['bd2'], name='fc2') # Relu activation 88 | dense2 = tf.nn.dropout(dense2, _dropout) 89 | 90 | # Output: class prediction 91 | out = tf.matmul(dense2, _weights['out']) + _biases['out'] 92 | 93 | return out 94 | -------------------------------------------------------------------------------- /crop_mean.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FingerRec/real_time_video_action_recognition/f6fab30d39c4f7bb29f81e4d299b95949e36ec5f/crop_mean.npy -------------------------------------------------------------------------------- /list/classInd.txt: -------------------------------------------------------------------------------- 1 | 1 ApplyEyeMakeup 2 | 2 ApplyLipstick 3 | 3 Archery 4 | 4 BabyCrawling 5 | 5 BalanceBeam 6 | 6 BandMarching 7 | 7 BaseballPitch 8 | 8 Basketball 9 | 9 BasketballDunk 10 | 10 BenchPress 11 | 11 Biking 12 | 12 Billiards 13 | 13 BlowDryHair 14 | 14 BlowingCandles 15 | 15 BodyWeightSquats 16 | 16 Bowling 17 | 17 
/crop_mean.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/FingerRec/real_time_video_action_recognition/f6fab30d39c4f7bb29f81e4d299b95949e36ec5f/crop_mean.npy
--------------------------------------------------------------------------------
/list/classInd.txt:
--------------------------------------------------------------------------------
1 ApplyEyeMakeup
2 ApplyLipstick
3 Archery
4 BabyCrawling
5 BalanceBeam
6 BandMarching
7 BaseballPitch
8 Basketball
9 BasketballDunk
10 BenchPress
11 Biking
12 Billiards
13 BlowDryHair
14 BlowingCandles
15 BodyWeightSquats
16 Bowling
17 BoxingPunchingBag
18 BoxingSpeedBag
19 BreastStroke
20 BrushingTeeth
21 CleanAndJerk
22 CliffDiving
23 CricketBowling
24 CricketShot
25 CuttingInKitchen
26 Diving
27 Drumming
28 Fencing
29 FieldHockeyPenalty
30 FloorGymnastics
31 FrisbeeCatch
32 FrontCrawl
33 GolfSwing
34 Haircut
35 Hammering
36 HammerThrow
37 HandstandPushups
38 HandstandWalking
39 HeadMassage
40 HighJump
41 HorseRace
42 HorseRiding
43 HulaHoop
44 IceDancing
45 JavelinThrow
46 JugglingBalls
47 JumpingJack
48 JumpRope
49 Kayaking
50 Knitting
51 LongJump
52 Lunges
53 MilitaryParade
54 Mixing
55 MoppingFloor
56 Nunchucks
57 ParallelBars
58 PizzaTossing
59 PlayingCello
60 PlayingDaf
61 PlayingDhol
62 PlayingFlute
63 PlayingGuitar
64 PlayingPiano
65 PlayingSitar
66 PlayingTabla
67 PlayingViolin
68 PoleVault
69 PommelHorse
70 PullUps
71 Punch
72 PushUps
73 Rafting
74 RockClimbingIndoor
75 RopeClimbing
76 Rowing
77 SalsaSpin
78 ShavingBeard
79 Shotput
80 SkateBoarding
81 Skiing
82 Skijet
83 SkyDiving
84 SoccerJuggling
85 SoccerPenalty
86 StillRings
87 SumoWrestling
88 Surfing
89 Swing
90 TableTennisShot
91 TaiChi
92 TennisSwing
93 ThrowDiscus
94 TrampolineJumping
95 Typing
96 UnevenBars
97 VolleyballSpiking
98 WalkingWithDog
99 WallPushups
100 WritingOnBoard
101 YoYo
--------------------------------------------------------------------------------
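Note that the table above is 1-indexed while the network's argmax output is 0-based; real_time_c3d.py bridges the two by looking up `predicted_label + 1`. A minimal sketch of the lookup (the label value is hypothetical):

```python
# Parse list/classInd.txt into {index string: class name}.
classes = {}
with open('./list/classInd.txt', 'r') as f:
    for line in f:
        index, name = line.strip('\r\n').split(' ')
        classes[index] = name

predicted_label = 62                      # hypothetical 0-based argmax output
print(classes[str(predicted_label + 1)])  # -> PlayingGuitar (entry 63 above)
```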
/master:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/FingerRec/real_time_video_action_recognition/f6fab30d39c4f7bb29f81e4d299b95949e36ec5f/master
--------------------------------------------------------------------------------
/real_time_c3d.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
# @Time    : 2018/6/8 21:11
# @Author  : Awiny
# @Site    :
# @Project : C3D-tensorflow
# @File    : real_time_c3d.py
# @Software: PyCharm
# @Github  : https://github.com/FingerRec
# @Blog    : http://fingerrec.github.io
"""
import os

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'  # silence TensorFlow's build warnings

import time
import random
import heapq
from six.moves import xrange  # pylint: disable=redefined-builtin
import tensorflow as tf
import numpy as np
import cv2
import c3d_model
from real_time_input_data import *

# Basic model parameters as external flags.
flags = tf.app.flags
gpu_num = 1
flags.DEFINE_integer('batch_size', 1, 'Batch size.')
FLAGS = flags.FLAGS

images_placeholder = tf.placeholder(tf.float32, shape=(1, 16, 112, 112, 3))
labels_placeholder = tf.placeholder(tf.int64, shape=1)


def placeholder_inputs(batch_size):
    """Generate placeholder variables to represent the input tensors.
    These placeholders are used as inputs by the rest of the model building
    code and will be fed from the downloaded data in the .run() loop, below.
    Args:
        batch_size: The batch size will be baked into both placeholders.
    Returns:
        images_placeholder: Images placeholder.
        labels_placeholder: Labels placeholder.
    """
    # Note that the shapes of the placeholders match the shapes of the full
    # image and label tensors, except the first dimension is now batch_size
    # rather than the full size of the train or test data sets.
    images_placeholder = tf.placeholder(tf.float32, shape=(1, 16, 112, 112, 3))
    labels_placeholder = tf.placeholder(tf.int64, shape=1)
    return images_placeholder, labels_placeholder


def _variable_on_cpu(name, shape, initializer):
    # Variables live on the CPU so they can be shared across GPU towers.
    with tf.device('/cpu:0'):
        var = tf.get_variable(name, shape, initializer=initializer)
    return var


def _variable_with_weight_decay(name, shape, stddev, wd):
    var = _variable_on_cpu(name, shape, tf.truncated_normal_initializer(stddev=stddev))
    if wd is not None:
        weight_decay = tf.nn.l2_loss(var) * wd
        tf.add_to_collection('losses', weight_decay)
    return var


def run_one_sample(norm_score, sess, video_imgs):
    """
    Run one 16-frame clip through the network and return the classification result.
    :param norm_score: softmax output tensor
    :param sess: TensorFlow session holding the restored model
    :param video_imgs: preprocessed clip of shape (1, 16, 112, 112, 3)
    :return: top-1 label, top-5 labels, top-5 scores
    """
    # video_imgs = np.random.rand(1, 16, 112, 112, 3).astype(np.float32)
    predict_score = norm_score.eval(
        session=sess,
        feed_dict={images_placeholder: video_imgs}
    )
    top1_predicted_label = np.argmax(predict_score)
    predict_score = np.reshape(predict_score, 101)
    top5_predicted_value = heapq.nlargest(5, predict_score)
    top5_predicted_label = predict_score.argsort()[-5:][::-1]
    return top1_predicted_label, top5_predicted_label, top5_predicted_value


def build_c3d_model():
    """
    Build the C3D graph and restore the pretrained weights.
    :return:
        norm_score: softmax output tensor
        sess: session holding the restored model
    """
    # model_name = "pretrained_model/c3d_ucf101_finetune_whole_iter_20000_TF.model.mdlp"
    # model_name = "pretrained_model/conv3d_deepnetA_sport1m_iter_1900000_TF.model"
    model_name = "pretrained_model/sports1m_finetuning_ucf101.model"
    with tf.variable_scope('var_name') as var_scope:
        weights = {
            'wc1': _variable_with_weight_decay('wc1', [3, 3, 3, 3, 64], 0.04, 0.00),
            'wc2': _variable_with_weight_decay('wc2', [3, 3, 3, 64, 128], 0.04, 0.00),
            'wc3a': _variable_with_weight_decay('wc3a', [3, 3, 3, 128, 256], 0.04, 0.00),
            'wc3b': _variable_with_weight_decay('wc3b', [3, 3, 3, 256, 256], 0.04, 0.00),
            'wc4a': _variable_with_weight_decay('wc4a', [3, 3, 3, 256, 512], 0.04, 0.00),
            'wc4b': _variable_with_weight_decay('wc4b', [3, 3, 3, 512, 512], 0.04, 0.00),
            'wc5a': _variable_with_weight_decay('wc5a', [3, 3, 3, 512, 512], 0.04, 0.00),
            'wc5b': _variable_with_weight_decay('wc5b', [3, 3, 3, 512, 512], 0.04, 0.00),
            'wd1': _variable_with_weight_decay('wd1', [8192, 4096], 0.04, 0.001),
            'wd2': _variable_with_weight_decay('wd2', [4096, 4096], 0.04, 0.002),
            'out': _variable_with_weight_decay('wout', [4096, c3d_model.NUM_CLASSES], 0.04, 0.005)
        }
        biases = {
            'bc1': _variable_with_weight_decay('bc1', [64], 0.04, 0.0),
            'bc2': _variable_with_weight_decay('bc2', [128], 0.04, 0.0),
            'bc3a': _variable_with_weight_decay('bc3a', [256], 0.04, 0.0),
            'bc3b': _variable_with_weight_decay('bc3b', [256], 0.04, 0.0),
            'bc4a': _variable_with_weight_decay('bc4a', [512], 0.04, 0.0),
            'bc4b': _variable_with_weight_decay('bc4b', [512], 0.04, 0.0),
            'bc5a': _variable_with_weight_decay('bc5a', [512], 0.04, 0.0),
            'bc5b': _variable_with_weight_decay('bc5b', [512], 0.04, 0.0),
            'bd1': _variable_with_weight_decay('bd1', [4096], 0.04, 0.0),
            'bd2': _variable_with_weight_decay('bd2', [4096], 0.04, 0.0),
            'out': _variable_with_weight_decay('bout', [c3d_model.NUM_CLASSES], 0.04, 0.0),
        }
    logits = []
    for gpu_index in range(0, gpu_num):
        with tf.device('/gpu:%d' % gpu_index):
            logit = c3d_model.inference_c3d(
                images_placeholder[0 * FLAGS.batch_size:(0 + 1) * FLAGS.batch_size, :, :, :, :], 0.6,
                FLAGS.batch_size, weights, biases)
            logits.append(logit)
    logits = tf.concat(logits, 0)
    norm_score = tf.nn.softmax(logits)
    saver = tf.train.Saver()
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
    init = tf.global_variables_initializer()
    sess.run(init)
    # Restore the pretrained checkpoint.
    saver.restore(sess, model_name)
    return norm_score, sess


def real_time_recognition(video_path):
    """
    Real-time video classification.
    :param video_path: path of the video to classify
    :return:
    """
    norm_score, sess = build_c3d_model()
    cap = cv2.VideoCapture(video_path)
    count = 0
    video_imgs = []
    predicted_label_top5 = []
    top5_predicted_value = []
    classes = {}
    flag = False
    # Map the 1-based indices in classInd.txt to class names.
    with open('./list/classInd.txt', 'r') as f:
        for line in f:
            content = line.strip('\r\n').split(' ')
            classes[content[0]] = content[1]
    while True:
        ret, img = cap.read()
        if not ret:
            break
        video_imgs.append(img.astype(np.float32))
        count += 1
        if count == 16:
            # Classify each non-overlapping 16-frame clip; the overlay is
            # refreshed once per clip.
            video_imgs_tensor = clip_images_to_tensor(video_imgs, 16, 112)
            predicted_label, predicted_label_top5, top5_predicted_value = \
                run_one_sample(norm_score, sess, video_imgs_tensor)
            count = 0
            video_imgs = []
            flag = True
        if flag:
            # Draw the latest top-5 scores and class names (classInd.txt is
            # 1-based, hence the +1) in random colors.
            for i in range(5):
                cv2.putText(img, str(top5_predicted_value[i]) + ' : ' + classes[str(predicted_label_top5[i] + 1)],
                            (10, 15 * (i + 1)), cv2.FONT_HERSHEY_TRIPLEX, 0.5,
                            (random.randint(0, 255), random.randint(0, 255), random.randint(0, 255)),
                            1, False)

        cv2.imshow('video', img)

        if cv2.waitKey(33) == 27:  # Esc quits
            break

    cv2.destroyAllWindows()


def main(_):
    video_path = input("please input the video path to be classified: ")
    real_time_recognition(video_path)


if __name__ == '__main__':
    tf.app.run()
--------------------------------------------------------------------------------
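A minimal smoke test for the two core functions above, feeding a random clip exactly as the commented-out line in run_one_sample suggests (illustrative; it still needs the pretrained checkpoint under pretrained_model/):

```python
import numpy as np

norm_score, sess = build_c3d_model()           # builds the graph and restores weights
clip = np.random.rand(1, 16, 112, 112, 3).astype(np.float32)
top1, top5_labels, top5_values = run_one_sample(norm_score, sess, clip)
print(top1, top5_labels, top5_values)          # labels are 0-based; add 1 for classInd.txt
```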
/real_time_input_data.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
# @Time    : 2018/6/9 15:34
# @Author  : Awiny
# @Site    :
# @Project : C3D-tensorflow
# @File    : real_time_input_data.py
# @Software: PyCharm
# @Github  : https://github.com/FingerRec
# @Blog    : http://fingerrec.github.io
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'  # silence TensorFlow's build warnings

from six.moves import xrange  # pylint: disable=redefined-builtin
import PIL.Image as Image
import random
import numpy as np
import cv2


def clip_images_to_tensor(video_imgs, num_frames_per_clip=16, crop_size=112):
    """Resize, center-crop and mean-subtract a list of frames, returning a
    float32 array of shape (1, num_frames_per_clip, crop_size, crop_size, 3)."""
    data = []
    np_mean = np.load('crop_mean.npy').reshape([num_frames_per_clip, crop_size, crop_size, 3])
    img_datas = []
    if len(video_imgs) != 0:
        for j in xrange(len(video_imgs)):
            img = Image.fromarray(video_imgs[j].astype(np.uint8))
            # Scale the short side to crop_size, preserving the aspect ratio.
            if img.width > img.height:
                scale = float(crop_size) / float(img.height)
                img = np.array(cv2.resize(np.array(img), (int(img.width * scale + 1), crop_size))).astype(np.float32)
            else:
                scale = float(crop_size) / float(img.width)
                img = np.array(cv2.resize(np.array(img), (crop_size, int(img.height * scale + 1)))).astype(np.float32)
            # Center-crop to crop_size x crop_size and subtract the per-frame mean.
            crop_x = int((img.shape[0] - crop_size) / 2)
            crop_y = int((img.shape[1] - crop_size) / 2)
            img = img[crop_x:crop_x + crop_size, crop_y:crop_y + crop_size, :] - np_mean[j]
            img_datas.append(img)
    data.append(img_datas)

    np_arr_data = np.array(data).astype(np.float32)

    return np_arr_data
--------------------------------------------------------------------------------
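To illustrate the preprocessing contract of clip_images_to_tensor, here is a sketch with synthetic frames (it assumes crop_mean.npy sits in the working directory):

```python
import numpy as np

# 16 synthetic 240x320 BGR frames, shaped like the output of cap.read().
frames = [np.random.randint(0, 256, (240, 320, 3)).astype(np.float32)
          for _ in range(16)]
clip = clip_images_to_tensor(frames, num_frames_per_clip=16, crop_size=112)
print(clip.shape)  # (1, 16, 112, 112, 3), ready to feed images_placeholder
```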
/resources/test_1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/FingerRec/real_time_video_action_recognition/f6fab30d39c4f7bb29f81e4d299b95949e36ec5f/resources/test_1.jpg
--------------------------------------------------------------------------------
/resources/test_2_gif.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/FingerRec/real_time_video_action_recognition/f6fab30d39c4f7bb29f81e4d299b95949e36ec5f/resources/test_2_gif.gif
--------------------------------------------------------------------------------
/resources/test_gif.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/FingerRec/real_time_video_action_recognition/f6fab30d39c4f7bb29f81e4d299b95949e36ec5f/resources/test_gif.gif
--------------------------------------------------------------------------------
/test_video/v_PlayingGuitar_g16_c02.avi:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/FingerRec/real_time_video_action_recognition/f6fab30d39c4f7bb29f81e4d299b95949e36ec5f/test_video/v_PlayingGuitar_g16_c02.avi
--------------------------------------------------------------------------------
/test_video/v_PushUps_g02_c03.avi:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/FingerRec/real_time_video_action_recognition/f6fab30d39c4f7bb29f81e4d299b95949e36ec5f/test_video/v_PushUps_g02_c03.avi
--------------------------------------------------------------------------------