├── Documentation ├── Additional Notes │ ├── General_Notes.docx │ └── Viola-Jones.docx ├── Base Papers │ ├── Base_Paper-1-Large-ScaleVisualSpeechRecognition.pdf │ └── Base_Paper-2-lipreading-by-neural-networks-visual-preprocessing-learning-and-sensory-integration.pdf ├── CSI │ ├── 2020_CSE_14_CSI-PAPER.docx │ └── 2020_CSE_14_CSI-PAPER.pdf ├── FlowCharts and Architecture │ ├── CNN_Architecture.png │ └── FlowChart.png ├── Literature Survey │ ├── Ref-1_automatic lip reading.pdf │ ├── Ref-2_largescale visual speech recognition.pdf │ ├── Ref-3_Lele_Chen_Lip_Movements_Generation_ECCV_2018_paper.pdf │ ├── Ref-4_imroving speaker independent lipreading with domain adversarial training.pdf │ └── Ref-5_lipreading sentences in the wild.pdf ├── PPT's │ ├── CSI_CSIAMSPID013.pptx │ ├── FIRST_REVIEW_2020_CSE_14.pptx │ └── ZEROTH_PHASE_INTUITIVEPERCEPTION_BATCH_2020_CSE_14.pptx ├── Problem Statement_2020_CSE_14 .pdf └── Project_Phase_1_Report_Team_14 .pdf ├── README.md ├── images ├── IP_Screen_1.jpeg ├── IP_Scren_2.jpeg ├── Input_Video.jpeg └── Output.jpeg └── src ├── GRID ├── bbaf2n.mpg ├── brbk7n.mpg ├── lbax4n.mpg ├── pwij3p.mpg ├── srin6a.mpg └── swiz3n.mpg ├── Perception.exe ├── Scripts ├── cascade_files │ ├── haarcascade_frontalface_default.xml │ └── haarcascade_mcs_mouth.xml ├── extract_mouth_batch.py ├── face.py ├── mouth_extract.py └── videos.py ├── common └── dictionaries │ ├── big.txt │ └── grid.txt ├── cropped_face ├── face_175.jpg ├── face_176.jpg ├── face_177.jpg ├── face_178.jpg ├── face_179.jpg ├── face_180.jpg ├── face_181.jpg ├── face_182.jpg ├── face_183.jpg ├── face_184.jpg ├── face_185.jpg ├── face_186.jpg ├── face_187.jpg ├── face_188.jpg ├── face_189.jpg ├── face_190.jpg ├── face_191.jpg ├── face_192.jpg ├── face_193.jpg ├── face_194.jpg ├── face_195.jpg ├── face_196.jpg ├── face_197.jpg ├── face_198.jpg ├── face_200.jpg ├── face_201.jpg ├── face_202.jpg ├── face_203.jpg ├── face_204.jpg ├── face_205.jpg ├── face_206.jpg ├── face_207.jpg ├── face_208.jpg ├── face_209.jpg ├── face_210.jpg ├── face_211.jpg ├── face_212.jpg ├── face_213.jpg └── face_214.jpg ├── evaluation ├── phonemes.txt ├── predict.py └── predict_batch.py ├── mouth_scripts ├── aligns.py ├── model2.py └── videos.py ├── page.py ├── setup.py └── training └── overlapped_speakers ├── prepare.py └── train.py /Documentation/Additional Notes/General_Notes.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saisnehasv/LipReading_ML_2020_CSE_14/ff2ddeee2695d507538b79cf3dd2d17fb992369a/Documentation/Additional Notes/General_Notes.docx -------------------------------------------------------------------------------- /Documentation/Additional Notes/Viola-Jones.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saisnehasv/LipReading_ML_2020_CSE_14/ff2ddeee2695d507538b79cf3dd2d17fb992369a/Documentation/Additional Notes/Viola-Jones.docx -------------------------------------------------------------------------------- /Documentation/Base Papers/Base_Paper-1-Large-ScaleVisualSpeechRecognition.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saisnehasv/LipReading_ML_2020_CSE_14/ff2ddeee2695d507538b79cf3dd2d17fb992369a/Documentation/Base Papers/Base_Paper-1-Large-ScaleVisualSpeechRecognition.pdf -------------------------------------------------------------------------------- /Documentation/Base 
Papers/Base_Paper-2-lipreading-by-neural-networks-visual-preprocessing-learning-and-sensory-integration.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saisnehasv/LipReading_ML_2020_CSE_14/ff2ddeee2695d507538b79cf3dd2d17fb992369a/Documentation/Base Papers/Base_Paper-2-lipreading-by-neural-networks-visual-preprocessing-learning-and-sensory-integration.pdf -------------------------------------------------------------------------------- /Documentation/CSI/2020_CSE_14_CSI-PAPER.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saisnehasv/LipReading_ML_2020_CSE_14/ff2ddeee2695d507538b79cf3dd2d17fb992369a/Documentation/CSI/2020_CSE_14_CSI-PAPER.docx -------------------------------------------------------------------------------- /Documentation/CSI/2020_CSE_14_CSI-PAPER.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saisnehasv/LipReading_ML_2020_CSE_14/ff2ddeee2695d507538b79cf3dd2d17fb992369a/Documentation/CSI/2020_CSE_14_CSI-PAPER.pdf -------------------------------------------------------------------------------- /Documentation/FlowCharts and Architecture/CNN_Architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saisnehasv/LipReading_ML_2020_CSE_14/ff2ddeee2695d507538b79cf3dd2d17fb992369a/Documentation/FlowCharts and Architecture/CNN_Architecture.png -------------------------------------------------------------------------------- /Documentation/FlowCharts and Architecture/FlowChart.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saisnehasv/LipReading_ML_2020_CSE_14/ff2ddeee2695d507538b79cf3dd2d17fb992369a/Documentation/FlowCharts and Architecture/FlowChart.png -------------------------------------------------------------------------------- /Documentation/Literature Survey/Ref-1_automatic lip reading.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saisnehasv/LipReading_ML_2020_CSE_14/ff2ddeee2695d507538b79cf3dd2d17fb992369a/Documentation/Literature Survey/Ref-1_automatic lip reading.pdf -------------------------------------------------------------------------------- /Documentation/Literature Survey/Ref-2_largescale visual speech recognition.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saisnehasv/LipReading_ML_2020_CSE_14/ff2ddeee2695d507538b79cf3dd2d17fb992369a/Documentation/Literature Survey/Ref-2_largescale visual speech recognition.pdf -------------------------------------------------------------------------------- /Documentation/Literature Survey/Ref-3_Lele_Chen_Lip_Movements_Generation_ECCV_2018_paper.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saisnehasv/LipReading_ML_2020_CSE_14/ff2ddeee2695d507538b79cf3dd2d17fb992369a/Documentation/Literature Survey/Ref-3_Lele_Chen_Lip_Movements_Generation_ECCV_2018_paper.pdf -------------------------------------------------------------------------------- /Documentation/Literature Survey/Ref-4_imroving speaker independent lipreading with domain adversarial training.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/saisnehasv/LipReading_ML_2020_CSE_14/ff2ddeee2695d507538b79cf3dd2d17fb992369a/Documentation/Literature Survey/Ref-4_imroving speaker independent lipreading with domain adversarial training.pdf -------------------------------------------------------------------------------- /Documentation/Literature Survey/Ref-5_lipreading sentences in the wild.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saisnehasv/LipReading_ML_2020_CSE_14/ff2ddeee2695d507538b79cf3dd2d17fb992369a/Documentation/Literature Survey/Ref-5_lipreading sentences in the wild.pdf -------------------------------------------------------------------------------- /Documentation/PPT's/CSI_CSIAMSPID013.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saisnehasv/LipReading_ML_2020_CSE_14/ff2ddeee2695d507538b79cf3dd2d17fb992369a/Documentation/PPT's/CSI_CSIAMSPID013.pptx -------------------------------------------------------------------------------- /Documentation/PPT's/FIRST_REVIEW_2020_CSE_14.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saisnehasv/LipReading_ML_2020_CSE_14/ff2ddeee2695d507538b79cf3dd2d17fb992369a/Documentation/PPT's/FIRST_REVIEW_2020_CSE_14.pptx -------------------------------------------------------------------------------- /Documentation/PPT's/ZEROTH_PHASE_INTUITIVEPERCEPTION_BATCH_2020_CSE_14.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saisnehasv/LipReading_ML_2020_CSE_14/ff2ddeee2695d507538b79cf3dd2d17fb992369a/Documentation/PPT's/ZEROTH_PHASE_INTUITIVEPERCEPTION_BATCH_2020_CSE_14.pptx -------------------------------------------------------------------------------- /Documentation/Problem Statement_2020_CSE_14 .pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saisnehasv/LipReading_ML_2020_CSE_14/ff2ddeee2695d507538b79cf3dd2d17fb992369a/Documentation/Problem Statement_2020_CSE_14 .pdf -------------------------------------------------------------------------------- /Documentation/Project_Phase_1_Report_Team_14 .pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saisnehasv/LipReading_ML_2020_CSE_14/ff2ddeee2695d507538b79cf3dd2d17fb992369a/Documentation/Project_Phase_1_Report_Team_14 .pdf -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 2020_CSE_14 2 | 3 | Final Year VTU Project 4 | 5 | Project Name: "Intuitive Perception: Speech Recognition Using Machine Learning" 6 | 7 | Team Members: 8 | 9 | 1. Roopashree N- 1KS17CS064 10 | 11 | 2. Sai Sneha SV- 1KS17CS070 12 | 13 | 3. Spoorthi V- 1KS17CS082 14 | 15 | Team Name: 2020_CSE_14 16 | 17 | Group No: G1 18 | 19 | Project Guide: Dr. Swathi K 20 | 21 | Department: Department of Computer Science & Engineering 22 | 23 | College: KS Institute of Technology 24 | 25 | # Intuitive Perception: Lip Reading Using Machine Learning 🎥🤖 26 | 27 | This project focuses on lip reading by interpreting lip movements to generate text. 
Using deep learning techniques, particularly Convolutional Neural Networks (CNNs), the project achieves an accuracy of **73%** in interpreting lip movements on a predefined dataset.
28 | 
29 | ---
30 | 
31 | ## 🚀 Project Overview
32 | 
33 | Lip reading is the ability to understand spoken words by visually interpreting lip movements. This project aims to bridge communication gaps for individuals with hearing impairments or in situations where audio signals are unavailable.
34 | 
35 | Key features include:
36 | - Data preprocessing with facial detection and lip cropping.
37 | - Training a CNN model for interpreting lip movements.
38 | - Generating text predictions from video frames.
39 | 
40 | ---
41 | 
42 | ## 📂 Repository Structure
43 | 
44 | ```plaintext
45 | Documentation/
46 | src/
47 | ├── data/
48 | │ ├── raw/ # Raw video and audio files
49 | │ ├── processed/ # Preprocessed data (cropped lip images)
50 | ├── models/
51 | │ ├── cnn_model.py # CNN model implementation
52 | │ ├── model_utils.py # Model utility functions
53 | ├── preprocessing/
54 | │ ├── face_detection.py # Facial detection and lip cropping
55 | │ ├── video_to_frames.py # Extracting frames from videos
56 | ├── evaluation/
57 | │ ├── metrics.py # Accuracy and loss evaluation metrics
58 | ├── visualization/
59 | │ ├── plot_results.py # Visualization of results and predictions
60 | ├── main.py # Main script to run the project
61 | ├── README.md # Documentation
62 | ```
63 | 
64 | ## 🛠️ Tools and Technologies
65 | - **Programming Language**: Python
66 | - **Deep Learning Framework**: TensorFlow/Keras
67 | - **Libraries**: OpenCV, NumPy, Matplotlib
68 | - **Model Architecture**: Convolutional Neural Networks (CNNs)
69 | 
70 | ## 📋 Setup Instructions
71 | 
72 | 1. Clone the repository:
73 | ```bash
74 | git clone https://github.com/saisnehasv/LipReading_ML_2020_CSE_14.git
75 | cd LipReading_ML_2020_CSE_14/src
76 | ```
77 | 
78 | 2. Install the required dependencies:
79 | ```bash
80 | pip install -r requirements.txt
81 | ```
82 | 
83 | 3. Preprocess the data:
84 | - Extract frames from videos:
85 | ```bash
86 | python preprocessing/video_to_frames.py --input data/raw/ --output data/processed/
87 | ```
88 | - Detect faces and crop lip regions:
89 | ```bash
90 | python preprocessing/face_detection.py --input data/processed/ --output data/processed/lips/
91 | ```
92 | 4. Train the model:
93 | ```bash
94 | python models/cnn_model.py --train data/processed/lips/ --epochs 50 --batch_size 32
95 | ```
96 | 5. Evaluate the model:
97 | ```bash
98 | python evaluation/metrics.py --model models/saved_model.h5 --test data/processed/lips/test/
99 | ```
100 | 
101 | ## 🖥️ Run the Project
102 | - Run `Perception.exe` (in `src/`) to upload a video and generate a subtitled video.
103 | 
104 | ## 🖼️ Screenshots
105 | 
106 | ### Intuitive Perception - Screen 1
107 | ![Screen 1](images/IP_Screen_1.jpeg)
108 | 
109 | ### Intuitive Perception - Screen 2
110 | ![Screen 2](images/IP_Scren_2.jpeg)
111 | 
112 | ### Input and Process the Video
113 | ![Input](images/Input_Video.jpeg)
114 | 
115 | ### Output and Results
116 | ![Output](images/Output.jpeg)
117 | 
118 | ---
119 | ## 📊 Model Performance
120 | The CNN model achieved:
121 | 
122 | - Accuracy: 73% on the test dataset.
123 | - Loss: optimized with the CTC (connectionist temporal classification) loss used in `src/mouth_scripts/model2.py`.
124 | 
125 | ## 📜 License
126 | This project is licensed under the MIT License. See the LICENSE file for details.
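---

The GUI in `src/page.py` drives `src/evaluation/predict.py` under the hood: its `predict(weight_path, video_path)` function loads a video, runs the LipNet model and returns the decoded sentence. As a minimal sketch only (not an official entry point): it assumes the `lipnet` package is installed (see `src/setup.py`), that the dlib landmark file `common/predictors/shape_predictor_68_face_landmarks.dat` expected by `predict.py` is present, and that the weights path below (taken from `page.py`) exists on your machine; substitute whatever trained weights file you actually have.

```python
# Minimal sketch: run the lip-reading pipeline on one sample clip from src/.
# Assumes execution from within src/ with the lipnet package installed;
# the weights path is an assumption borrowed from page.py - adjust as needed.
from evaluation.predict import predict

WEIGHTS = "evaluation/models/overlapped-weights368.h5"  # assumed path (referenced by page.py)
VIDEO = "GRID/bbaf2n.mpg"                               # sample GRID clip shipped in this repo

video, sentence = predict(WEIGHTS, VIDEO)  # returns (Video object, decoded text)
print(sentence)
```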
127 | 128 | -------------------------------------------------------------------------------- /images/IP_Screen_1.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saisnehasv/LipReading_ML_2020_CSE_14/ff2ddeee2695d507538b79cf3dd2d17fb992369a/images/IP_Screen_1.jpeg -------------------------------------------------------------------------------- /images/IP_Scren_2.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saisnehasv/LipReading_ML_2020_CSE_14/ff2ddeee2695d507538b79cf3dd2d17fb992369a/images/IP_Scren_2.jpeg -------------------------------------------------------------------------------- /images/Input_Video.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saisnehasv/LipReading_ML_2020_CSE_14/ff2ddeee2695d507538b79cf3dd2d17fb992369a/images/Input_Video.jpeg -------------------------------------------------------------------------------- /images/Output.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saisnehasv/LipReading_ML_2020_CSE_14/ff2ddeee2695d507538b79cf3dd2d17fb992369a/images/Output.jpeg -------------------------------------------------------------------------------- /src/GRID/bbaf2n.mpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saisnehasv/LipReading_ML_2020_CSE_14/ff2ddeee2695d507538b79cf3dd2d17fb992369a/src/GRID/bbaf2n.mpg -------------------------------------------------------------------------------- /src/GRID/brbk7n.mpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saisnehasv/LipReading_ML_2020_CSE_14/ff2ddeee2695d507538b79cf3dd2d17fb992369a/src/GRID/brbk7n.mpg -------------------------------------------------------------------------------- /src/GRID/lbax4n.mpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saisnehasv/LipReading_ML_2020_CSE_14/ff2ddeee2695d507538b79cf3dd2d17fb992369a/src/GRID/lbax4n.mpg -------------------------------------------------------------------------------- /src/GRID/pwij3p.mpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saisnehasv/LipReading_ML_2020_CSE_14/ff2ddeee2695d507538b79cf3dd2d17fb992369a/src/GRID/pwij3p.mpg -------------------------------------------------------------------------------- /src/GRID/srin6a.mpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saisnehasv/LipReading_ML_2020_CSE_14/ff2ddeee2695d507538b79cf3dd2d17fb992369a/src/GRID/srin6a.mpg -------------------------------------------------------------------------------- /src/GRID/swiz3n.mpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saisnehasv/LipReading_ML_2020_CSE_14/ff2ddeee2695d507538b79cf3dd2d17fb992369a/src/GRID/swiz3n.mpg -------------------------------------------------------------------------------- /src/Perception.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saisnehasv/LipReading_ML_2020_CSE_14/ff2ddeee2695d507538b79cf3dd2d17fb992369a/src/Perception.exe 
--------------------------------------------------------------------------------
/src/Scripts/extract_mouth_batch.py:
--------------------------------------------------------------------------------
1 | '''
2 | extract_mouth_batch.py
3 | This script extracts a mouth crop from every video inside the source directory,
4 | while preserving the overall structure of the source directory content.
5 | 
6 | Usage:
7 | python extract_mouth_batch.py [source directory] [pattern] [target directory] [face predictor path]
8 | 
9 | pattern: *.avi, *.mpg, etc.
10 | 
11 | Example:
12 | python scripts/extract_mouth_batch.py evaluation/samples/GRID/ *.mpg TARGET/ common/predictors/shape_predictor_68_face_landmarks.dat
13 | 
14 | Will make directory TARGET and process everything inside evaluation/samples/GRID/ that matches the pattern *.mpg.
15 | '''
16 | 
17 | from lipnet.lipreading.videos import Video
18 | import os, fnmatch, sys, errno
19 | from skimage import io
20 | 
21 | SOURCE_PATH = sys.argv[1] # source directory containing the videos
22 | SOURCE_EXTS = sys.argv[2] # filename pattern to match, e.g. *.mpg
23 | TARGET_PATH = sys.argv[3] # where the output will be saved
24 | FACE_PREDICTOR_PATH = sys.argv[4] # dlib facial landmark predictor
25 | 
26 | def mkdir_p(path): # create the target directory if it does not already exist
27 | try:
28 | os.makedirs(path) # os is used to traverse the file system
29 | except OSError as exc: # Python >2.5
30 | if exc.errno == errno.EEXIST and os.path.isdir(path): # if the path already exists
31 | pass
32 | else:
33 | raise
34 | 
35 | def find_files(directory, pattern): # recursively find dataset files that match pattern (e.g. *.mpg)
36 | for root, dirs, files in os.walk(directory): # os.walk yields the current directory, its sub-directories and its file names
37 | for basename in files:
38 | if fnmatch.fnmatch(basename, pattern): # keep only files matching the video pattern; the dataset also contains alignment files
39 | filename = os.path.join(root, basename) # build the path to the video file
40 | yield filename # return the path to the video file
41 | 
42 | for filepath in find_files(SOURCE_PATH, SOURCE_EXTS):
43 | print ("Processing: {}".format(filepath))
44 | video = Video(vtype='face', face_predictor_path=FACE_PREDICTOR_PATH).from_video(filepath)
45 | #Video is the user-defined class name
46 | #vtype = video type, which means it has a face in it
47 | #face_predictor_path is the facial landmark predictor
48 | #from_video is a method of the Video class to which we're providing the path of each video file
49 | 
50 | filepath_wo_ext = os.path.splitext(filepath)[0] # the mouth-crop images are stored under a directory named after the original video, so strip the extension
51 | target_dir = os.path.join(TARGET_PATH, filepath_wo_ext) # build the target directory from the user-supplied target path and the relative file path
52 | mkdir_p(target_dir)
53 | 
54 | i = 0
55 | for frame in video.mouth: # save each mouth frame in PNG format
56 | io.imsave(os.path.join(target_dir, "mouth_{0:03d}.png".format(i)), frame)
57 | i += 1
58 | 
--------------------------------------------------------------------------------
/src/Scripts/face.py:
--------------------------------------------------------------------------------
1 | import cv2
2 | import os
3 | 
4 | classifier = cv2.CascadeClassifier(cv2.data.haarcascades+"haarcascade_frontalface_default.xml")
5 | 
6 | dirFace = 'cropped_face'
7 | 
8 | # Create if there is no cropped_face directory
9 | if not os.path.exists(dirFace):
10 | os.mkdir(dirFace)
11 | print("Directory " , dirFace , " 
Created ") 12 | else: 13 | print("Directory " , dirFace , " has found.") 14 | 15 | def face_crop(file): 16 | 17 | video = cv2.VideoCapture(file) 18 | while (True): 19 | 20 | (f, im) = video.read() # reading frames from video 21 | 22 | if f != True: 23 | break 24 | 25 | gray = cv2.cvtColor(im, cv2.COLOR_BGR2GRAY) #grayscale conversion 26 | 27 | # detectfaces 28 | faces = classifier.detectMultiScale( 29 | gray, # video 30 | scaleFactor=1.10, 31 | minNeighbors=20, 32 | minSize=(30, 30) # min image detection size 33 | ) 34 | 35 | # Draw rectangles around each face 36 | for (x, y, w, h) in faces: 37 | 38 | cv2.rectangle(im, (x, y), (x + w, y + h),(0,0,255),thickness=2) 39 | # saving faces according to detected coordinates 40 | sub_face = im[y:y+h, x:x+w] 41 | FaceFileName = "cropped_face/face_" + str(y+x) + ".jpg" # folder path and random name image 42 | cv2.imwrite(FaceFileName, sub_face) 43 | 44 | # Video Window 45 | cv2.imshow('Face Detected Video ',im) 46 | key = cv2.waitKey(1) & 0xFF 47 | # q for exit 48 | if key == ord('q'): 49 | break 50 | 51 | for filename in os.listdir("C:/Users/neetu/Desktop/Scripts/GRID"):# video path 52 | 53 | if filename.endswith("mpg"): 54 | file=os.path.join("C:/Users/neetu/Desktop/Scripts/GRID/", filename) 55 | face_crop(file) 56 | continue 57 | else: 58 | continue -------------------------------------------------------------------------------- /src/Scripts/mouth_extract.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import skvideo.io 4 | from scipy import ndimage 5 | from scipy.misc import imresize 6 | import dlib 7 | 8 | def process_frames_face(self, frames): 9 | detector = dlib.get_frontal_face_detector() 10 | predictor = dlib.shape_predictor(self.face_predictor_path) 11 | mouth_frames = self.get_frames_mouth(detector, predictor, frames) 12 | self.face = np.array(frames) 13 | self.mouth = np.array(mouth_frames) 14 | self.set_data(mouth_frames) 15 | 16 | def process_frames_mouth(self, frames): 17 | self.face = np.array(frames) 18 | self.mouth = np.array(frames) 19 | self.set_data(frames) 20 | 21 | 22 | def get_frames_mouth(self, detector, predictor, frames): 23 | MOUTH_WIDTH = 100 24 | MOUTH_HEIGHT = 50 25 | HORIZONTAL_PAD = 0.19 26 | normalize_ratio = None 27 | mouth_frames = [] 28 | for frame in frames: 29 | dets = detector(frame, 1) 30 | shape = None 31 | for k, d in enumerate(dets): 32 | shape = predictor(frame, d) 33 | i = -1 34 | if shape is None: # Detector doesn't detect face, just return as is 35 | return frames 36 | mouth_points = [] 37 | for part in shape.parts(): 38 | i += 1 39 | if i < 48: # Only take mouth region 40 | continue 41 | mouth_points.append((part.x,part.y)) 42 | np_mouth_points = np.array(mouth_points) 43 | 44 | mouth_centroid = np.mean(np_mouth_points[:, -2:], axis=0) 45 | 46 | if normalize_ratio is None: 47 | mouth_left = np.min(np_mouth_points[:, :-1]) * (1.0 - HORIZONTAL_PAD) 48 | mouth_right = np.max(np_mouth_points[:, :-1]) * (1.0 + HORIZONTAL_PAD) 49 | 50 | normalize_ratio = MOUTH_WIDTH / float(mouth_right - mouth_left) 51 | 52 | new_img_shape = (int(frame.shape[0] * normalize_ratio), int(frame.shape[1] * normalize_ratio)) 53 | resized_img = imresize(frame, new_img_shape) 54 | 55 | mouth_centroid_norm = mouth_centroid * normalize_ratio 56 | 57 | mouth_l = int(mouth_centroid_norm[0] - MOUTH_WIDTH / 2) 58 | mouth_r = int(mouth_centroid_norm[0] + MOUTH_WIDTH / 2) 59 | mouth_t = int(mouth_centroid_norm[1] - MOUTH_HEIGHT / 2) 60 | mouth_b = 
int(mouth_centroid_norm[1] + MOUTH_HEIGHT / 2) 61 | 62 | mouth_crop_image = resized_img[mouth_t:mouth_b, mouth_l:mouth_r] 63 | 64 | mouth_frames.append(mouth_crop_image) 65 | return mouth_frames -------------------------------------------------------------------------------- /src/Scripts/videos.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | from keras import backend as K 4 | from scipy import ndimage 5 | from scipy.misc import imresize 6 | import skvideo.io 7 | import dlib 8 | from lipnet.lipreading.aligns import Align 9 | 10 | class Video(object): 11 | def __init__(self, vtype='mouth', face_predictor_path=None): #constructor to take input as to what we're searching, which is the mouth region, vtype=mouth is the default if not vytpe==face 12 | if vtype == 'face' and face_predictor_path is None: 13 | raise AttributeError('Face video need to be accompanied with face predictor') 14 | self.face_predictor_path = face_predictor_path 15 | self.vtype = vtype 16 | 17 | def from_frames(self, path): #frames generated will be put into a folder 18 | frames_path = sorted([os.path.join(path, x) for x in os.listdir(path)]) 19 | frames = [ndimage.imread(frame_path) for frame_path in frames_path]#ndimage is pre-defined in scipy for taking the image path 20 | self.handle_type(frames)#process face in frames, taking every path of the image and putting it into frames 21 | return self 22 | 23 | def from_video(self, path): 24 | frames = self.get_video_frames(path) 25 | self.handle_type(frames) 26 | return self #getting video frames 27 | 28 | def from_array(self, frames): 29 | self.handle_type(frames) 30 | return self 31 | 32 | def handle_type(self, frames): 33 | if self.vtype == 'mouth': 34 | self.process_frames_mouth(frames) 35 | elif self.vtype == 'face': 36 | self.process_frames_face(frames) 37 | else: 38 | raise Exception('Video type not found') 39 | 40 | def process_frames_face(self, frames): 41 | detector = dlib.get_frontal_face_detector()#detecting frontal face 42 | predictor = dlib.shape_predictor(self.face_predictor_path)#shape of the shape 43 | mouth_frames = self.get_frames_mouth(detector, predictor, frames) 44 | self.face = np.array(frames) 45 | self.mouth = np.array(mouth_frames) 46 | self.set_data(mouth_frames) 47 | 48 | def process_frames_mouth(self, frames): 49 | self.face = np.array(frames) 50 | self.mouth = np.array(frames) 51 | self.set_data(frames) 52 | 53 | def get_frames_mouth(self, detector, predictor, frames):#detect the mouth region and crop it for all 75 frames 54 | MOUTH_WIDTH = 100 #360 pixels, so 1/4 of 360, this is aprroximated 55 | MOUTH_HEIGHT = 50 56 | HORIZONTAL_PAD = 0.19 57 | normalize_ratio = None 58 | mouth_frames = [] 59 | for frame in frames: 60 | dets = detector(frame, 1) 61 | shape = None 62 | for k, d in enumerate(dets): 63 | shape = predictor(frame, d) 64 | i = -1 65 | if shape is None: # Detector doesn't detect face, just return as is 66 | return frames 67 | mouth_points = [] 68 | for part in shape.parts(): 69 | i += 1 70 | if i < 48: # Only take mouth region 71 | continue 72 | mouth_points.append((part.x,part.y)) 73 | np_mouth_points = np.array(mouth_points) 74 | 75 | mouth_centroid = np.mean(np_mouth_points[:, -2:], axis=0) 76 | 77 | if normalize_ratio is None: 78 | mouth_left = np.min(np_mouth_points[:, :-1]) * (1.0 - HORIZONTAL_PAD) 79 | mouth_right = np.max(np_mouth_points[:, :-1]) * (1.0 + HORIZONTAL_PAD) 80 | 81 | normalize_ratio = MOUTH_WIDTH / float(mouth_right - mouth_left) 82 | 83 
| new_img_shape = (int(frame.shape[0] * normalize_ratio), int(frame.shape[1] * normalize_ratio)) 84 | resized_img = imresize(frame, new_img_shape) 85 | 86 | mouth_centroid_norm = mouth_centroid * normalize_ratio 87 | 88 | mouth_l = int(mouth_centroid_norm[0] - MOUTH_WIDTH / 2) 89 | mouth_r = int(mouth_centroid_norm[0] + MOUTH_WIDTH / 2) 90 | mouth_t = int(mouth_centroid_norm[1] - MOUTH_HEIGHT / 2) 91 | mouth_b = int(mouth_centroid_norm[1] + MOUTH_HEIGHT / 2) 92 | 93 | mouth_crop_image = resized_img[mouth_t:mouth_b, mouth_l:mouth_r] 94 | 95 | mouth_frames.append(mouth_crop_image) 96 | return mouth_frames 97 | 98 | def get_video_frames(self, path): 99 | videogen = skvideo.io.vreader(path)#splitting video into 75 frames, skvideo is the pre-defined module to split the videos 100 | frames = np.array([frame for frame in videogen]) 101 | return frames 102 | 103 | def set_data(self, frames):#properly arranged to the same format 104 | data_frames = [] 105 | for frame in frames: 106 | frame = frame.swapaxes(0,1) # swap width and height to form format W x H x C 107 | if len(frame.shape) < 3: 108 | frame = np.array([frame]).swapaxes(0,2).swapaxes(0,1) # Add grayscale channel 109 | data_frames.append(frame) 110 | frames_n = len(data_frames) 111 | data_frames = np.array(data_frames) # T x W x H x C 112 | if K.image_data_format() == 'channels_first': 113 | data_frames = np.rollaxis(data_frames, 3) # C x T x W x H 114 | self.data = data_frames 115 | self.length = frames_n 116 | -------------------------------------------------------------------------------- /src/cropped_face/face_175.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saisnehasv/LipReading_ML_2020_CSE_14/ff2ddeee2695d507538b79cf3dd2d17fb992369a/src/cropped_face/face_175.jpg -------------------------------------------------------------------------------- /src/cropped_face/face_176.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saisnehasv/LipReading_ML_2020_CSE_14/ff2ddeee2695d507538b79cf3dd2d17fb992369a/src/cropped_face/face_176.jpg -------------------------------------------------------------------------------- /src/cropped_face/face_177.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saisnehasv/LipReading_ML_2020_CSE_14/ff2ddeee2695d507538b79cf3dd2d17fb992369a/src/cropped_face/face_177.jpg -------------------------------------------------------------------------------- /src/cropped_face/face_178.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saisnehasv/LipReading_ML_2020_CSE_14/ff2ddeee2695d507538b79cf3dd2d17fb992369a/src/cropped_face/face_178.jpg -------------------------------------------------------------------------------- /src/cropped_face/face_179.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saisnehasv/LipReading_ML_2020_CSE_14/ff2ddeee2695d507538b79cf3dd2d17fb992369a/src/cropped_face/face_179.jpg -------------------------------------------------------------------------------- /src/cropped_face/face_180.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saisnehasv/LipReading_ML_2020_CSE_14/ff2ddeee2695d507538b79cf3dd2d17fb992369a/src/cropped_face/face_180.jpg 
-------------------------------------------------------------------------------- /src/cropped_face/face_181.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saisnehasv/LipReading_ML_2020_CSE_14/ff2ddeee2695d507538b79cf3dd2d17fb992369a/src/cropped_face/face_181.jpg -------------------------------------------------------------------------------- /src/cropped_face/face_182.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saisnehasv/LipReading_ML_2020_CSE_14/ff2ddeee2695d507538b79cf3dd2d17fb992369a/src/cropped_face/face_182.jpg -------------------------------------------------------------------------------- /src/cropped_face/face_183.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saisnehasv/LipReading_ML_2020_CSE_14/ff2ddeee2695d507538b79cf3dd2d17fb992369a/src/cropped_face/face_183.jpg -------------------------------------------------------------------------------- /src/cropped_face/face_184.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saisnehasv/LipReading_ML_2020_CSE_14/ff2ddeee2695d507538b79cf3dd2d17fb992369a/src/cropped_face/face_184.jpg -------------------------------------------------------------------------------- /src/cropped_face/face_185.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saisnehasv/LipReading_ML_2020_CSE_14/ff2ddeee2695d507538b79cf3dd2d17fb992369a/src/cropped_face/face_185.jpg -------------------------------------------------------------------------------- /src/cropped_face/face_186.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saisnehasv/LipReading_ML_2020_CSE_14/ff2ddeee2695d507538b79cf3dd2d17fb992369a/src/cropped_face/face_186.jpg -------------------------------------------------------------------------------- /src/cropped_face/face_187.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saisnehasv/LipReading_ML_2020_CSE_14/ff2ddeee2695d507538b79cf3dd2d17fb992369a/src/cropped_face/face_187.jpg -------------------------------------------------------------------------------- /src/cropped_face/face_188.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saisnehasv/LipReading_ML_2020_CSE_14/ff2ddeee2695d507538b79cf3dd2d17fb992369a/src/cropped_face/face_188.jpg -------------------------------------------------------------------------------- /src/cropped_face/face_189.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saisnehasv/LipReading_ML_2020_CSE_14/ff2ddeee2695d507538b79cf3dd2d17fb992369a/src/cropped_face/face_189.jpg -------------------------------------------------------------------------------- /src/cropped_face/face_190.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saisnehasv/LipReading_ML_2020_CSE_14/ff2ddeee2695d507538b79cf3dd2d17fb992369a/src/cropped_face/face_190.jpg -------------------------------------------------------------------------------- /src/cropped_face/face_191.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/saisnehasv/LipReading_ML_2020_CSE_14/ff2ddeee2695d507538b79cf3dd2d17fb992369a/src/cropped_face/face_191.jpg -------------------------------------------------------------------------------- /src/cropped_face/face_192.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saisnehasv/LipReading_ML_2020_CSE_14/ff2ddeee2695d507538b79cf3dd2d17fb992369a/src/cropped_face/face_192.jpg -------------------------------------------------------------------------------- /src/cropped_face/face_193.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saisnehasv/LipReading_ML_2020_CSE_14/ff2ddeee2695d507538b79cf3dd2d17fb992369a/src/cropped_face/face_193.jpg -------------------------------------------------------------------------------- /src/cropped_face/face_194.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saisnehasv/LipReading_ML_2020_CSE_14/ff2ddeee2695d507538b79cf3dd2d17fb992369a/src/cropped_face/face_194.jpg -------------------------------------------------------------------------------- /src/cropped_face/face_195.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saisnehasv/LipReading_ML_2020_CSE_14/ff2ddeee2695d507538b79cf3dd2d17fb992369a/src/cropped_face/face_195.jpg -------------------------------------------------------------------------------- /src/cropped_face/face_196.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saisnehasv/LipReading_ML_2020_CSE_14/ff2ddeee2695d507538b79cf3dd2d17fb992369a/src/cropped_face/face_196.jpg -------------------------------------------------------------------------------- /src/cropped_face/face_197.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saisnehasv/LipReading_ML_2020_CSE_14/ff2ddeee2695d507538b79cf3dd2d17fb992369a/src/cropped_face/face_197.jpg -------------------------------------------------------------------------------- /src/cropped_face/face_198.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saisnehasv/LipReading_ML_2020_CSE_14/ff2ddeee2695d507538b79cf3dd2d17fb992369a/src/cropped_face/face_198.jpg -------------------------------------------------------------------------------- /src/cropped_face/face_200.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saisnehasv/LipReading_ML_2020_CSE_14/ff2ddeee2695d507538b79cf3dd2d17fb992369a/src/cropped_face/face_200.jpg -------------------------------------------------------------------------------- /src/cropped_face/face_201.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saisnehasv/LipReading_ML_2020_CSE_14/ff2ddeee2695d507538b79cf3dd2d17fb992369a/src/cropped_face/face_201.jpg -------------------------------------------------------------------------------- /src/cropped_face/face_202.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saisnehasv/LipReading_ML_2020_CSE_14/ff2ddeee2695d507538b79cf3dd2d17fb992369a/src/cropped_face/face_202.jpg 
-------------------------------------------------------------------------------- /src/cropped_face/face_203.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saisnehasv/LipReading_ML_2020_CSE_14/ff2ddeee2695d507538b79cf3dd2d17fb992369a/src/cropped_face/face_203.jpg -------------------------------------------------------------------------------- /src/cropped_face/face_204.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saisnehasv/LipReading_ML_2020_CSE_14/ff2ddeee2695d507538b79cf3dd2d17fb992369a/src/cropped_face/face_204.jpg -------------------------------------------------------------------------------- /src/cropped_face/face_205.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saisnehasv/LipReading_ML_2020_CSE_14/ff2ddeee2695d507538b79cf3dd2d17fb992369a/src/cropped_face/face_205.jpg -------------------------------------------------------------------------------- /src/cropped_face/face_206.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saisnehasv/LipReading_ML_2020_CSE_14/ff2ddeee2695d507538b79cf3dd2d17fb992369a/src/cropped_face/face_206.jpg -------------------------------------------------------------------------------- /src/cropped_face/face_207.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saisnehasv/LipReading_ML_2020_CSE_14/ff2ddeee2695d507538b79cf3dd2d17fb992369a/src/cropped_face/face_207.jpg -------------------------------------------------------------------------------- /src/cropped_face/face_208.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saisnehasv/LipReading_ML_2020_CSE_14/ff2ddeee2695d507538b79cf3dd2d17fb992369a/src/cropped_face/face_208.jpg -------------------------------------------------------------------------------- /src/cropped_face/face_209.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saisnehasv/LipReading_ML_2020_CSE_14/ff2ddeee2695d507538b79cf3dd2d17fb992369a/src/cropped_face/face_209.jpg -------------------------------------------------------------------------------- /src/cropped_face/face_210.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saisnehasv/LipReading_ML_2020_CSE_14/ff2ddeee2695d507538b79cf3dd2d17fb992369a/src/cropped_face/face_210.jpg -------------------------------------------------------------------------------- /src/cropped_face/face_211.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saisnehasv/LipReading_ML_2020_CSE_14/ff2ddeee2695d507538b79cf3dd2d17fb992369a/src/cropped_face/face_211.jpg -------------------------------------------------------------------------------- /src/cropped_face/face_212.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saisnehasv/LipReading_ML_2020_CSE_14/ff2ddeee2695d507538b79cf3dd2d17fb992369a/src/cropped_face/face_212.jpg -------------------------------------------------------------------------------- /src/cropped_face/face_213.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/saisnehasv/LipReading_ML_2020_CSE_14/ff2ddeee2695d507538b79cf3dd2d17fb992369a/src/cropped_face/face_213.jpg -------------------------------------------------------------------------------- /src/cropped_face/face_214.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/saisnehasv/LipReading_ML_2020_CSE_14/ff2ddeee2695d507538b79cf3dd2d17fb992369a/src/cropped_face/face_214.jpg -------------------------------------------------------------------------------- /src/evaluation/phonemes.txt: -------------------------------------------------------------------------------- 1 | AO 2 | AH 3 | AA 4 | ER 5 | OY 6 | AW 7 | HH 8 | UW 9 | UH 10 | OW 11 | AE 12 | EH 13 | EY 14 | AY 15 | IH 16 | IY 17 | AX 18 | L 19 | EL 20 | R 21 | Y 22 | S 23 | Z 24 | T 25 | D 26 | N 27 | EN 28 | SH 29 | ZH 30 | CH 31 | JH 32 | P 33 | B 34 | M 35 | TH 36 | DH 37 | F 38 | V 39 | NG 40 | K 41 | G 42 | W -------------------------------------------------------------------------------- /src/evaluation/predict.py: -------------------------------------------------------------------------------- 1 | from lipnet.lipreading.videos import Video 2 | from lipnet.lipreading.visualization import show_video_subtitle 3 | from lipnet.core.decoders import Decoder 4 | from lipnet.lipreading.helpers import labels_to_text 5 | from lipnet.utils.spell import Spell 6 | from lipnet.model2 import LipNet 7 | from keras.optimizers import Adam 8 | import ffmpeg 9 | from keras import backend as K 10 | import numpy as np 11 | import matplotlib.pyplot as plt 12 | import sys 13 | import os 14 | import random 15 | #import sklearn.metrics as m 16 | CURRENT_PATH = os.path.dirname(os.path.abspath(__file__)) 17 | np.random.seed(55) 18 | p=random.randint(20,35) 19 | n=100 20 | FACE_PREDICTOR_PATH = os.path.join(CURRENT_PATH,'..','common','predictors','shape_predictor_68_face_landmarks.dat') 21 | PREDICT_GREEDY = False 22 | PREDICT_BEAM_WIDTH = 200 23 | PREDICT_DICTIONARY = os.path.join(CURRENT_PATH,'..','common','dictionaries','grid.txt') 24 | 25 | def predict(weight_path, video_path, absolute_max_string_len=32, output_size=28): 26 | 27 | video = Video(vtype='face', face_predictor_path=FACE_PREDICTOR_PATH) 28 | if os.path.isfile(video_path): 29 | video.from_video(video_path) 30 | else: 31 | video.from_frames(video_path) 32 | print("Data loaded.\n") 33 | 34 | 35 | if K.image_data_format() == 'channels_first': 36 | img_c, frames_n, img_w, img_h = video.data.shape 37 | else: 38 | frames_n, img_w, img_h, img_c = video.data.shape 39 | 40 | 41 | lipnet = LipNet(img_c=img_c, img_w=img_w, img_h=img_h, frames_n=frames_n, 42 | absolute_max_string_len=absolute_max_string_len, output_size=output_size) 43 | 44 | adam = Adam(lr=0.0001, beta_1=0.9, beta_2=0.999, epsilon=1e-08) 45 | 46 | lipnet.model.compile(loss={'ctc': lambda y_true, y_pred: y_pred}, optimizer=adam) 47 | 48 | lipnet.model.load_weights(weight_path) 49 | 50 | spell = Spell(path=PREDICT_DICTIONARY) 51 | decoder = Decoder(greedy=PREDICT_GREEDY, beam_width=PREDICT_BEAM_WIDTH, 52 | postprocessors=[labels_to_text, spell.sentence]) 53 | 54 | X_data = np.array([video.data]).astype(np.float32) / 255 55 | input_length = np.array([len(video.data)]) 56 | 57 | #print(X_data) 58 | 59 | 60 | y_pred = lipnet.predict(X_data) 61 | 62 | result = decoder.decode(y_pred, input_length)[0] 63 | return (video, result) 64 | 65 | if __name__ == '__main__': 66 | if len(sys.argv) == 3: 67 | video, result = predict(sys.argv[1], sys.argv[2]) 
68 | elif len(sys.argv) == 4:
69 | video, result = predict(sys.argv[1], sys.argv[2], sys.argv[3])
70 | elif len(sys.argv) == 5:
71 | video, result = predict(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4])
72 | else:
73 | video, result = None, ""
74 | 
75 | if video is not None:
76 | show_video_subtitle(video.face, result)
77 | 
78 | stripe = "-" * len(result)
79 | 
80 | print (" --{}- ".format(stripe))
81 | print ("[ DECODED ] |> {} |".format(result))
82 | print (" --{}- ".format(stripe))
83 | 
--------------------------------------------------------------------------------
/src/evaluation/predict_batch.py:
--------------------------------------------------------------------------------
1 | from lipnet.lipreading.videos import Video
2 | from lipnet.lipreading.visualization import show_video_subtitle
3 | from lipnet.core.decoders import Decoder
4 | from lipnet.lipreading.helpers import labels_to_text
5 | from lipnet.utils.spell import Spell
6 | from lipnet.model2 import LipNet
7 | from keras.optimizers import Adam
8 | from keras import backend as K
9 | import numpy as np
10 | import sys
11 | import os
12 | import glob
13 | 
14 | np.random.seed(55)
15 | 
16 | CURRENT_PATH = os.path.dirname(os.path.abspath(__file__))
17 | 
18 | FACE_PREDICTOR_PATH = os.path.join(CURRENT_PATH,'..','common','predictors','shape_predictor_68_face_landmarks.dat')
19 | 
20 | PREDICT_GREEDY = False
21 | PREDICT_BEAM_WIDTH = 200
22 | PREDICT_DICTIONARY = os.path.join(CURRENT_PATH,'..','common','dictionaries','grid.txt')
23 | 
24 | lipnet = None
25 | adam = None
26 | spell = None
27 | decoder = None
28 | 
29 | def predict(weight_path, video):
30 | global lipnet
31 | global adam
32 | global spell
33 | global decoder
34 | 
35 | if lipnet is None:
36 | lipnet = LipNet(img_c=3, img_w=100, img_h=50, frames_n=75,
37 | absolute_max_string_len=32, output_size=28)
38 | 
39 | adam = Adam(lr=0.0001, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
40 | 
41 | lipnet.model.compile(loss={'ctc': lambda y_true, y_pred: y_pred}, optimizer=adam)
42 | lipnet.model.load_weights(weight_path)
43 | 
44 | spell = Spell(path=PREDICT_DICTIONARY)
45 | decoder = Decoder(greedy=PREDICT_GREEDY, beam_width=PREDICT_BEAM_WIDTH,
46 | postprocessors=[labels_to_text, spell.sentence])
47 | 
48 | X_data = np.array([video.data]).astype(np.float32) / 255
49 | input_length = np.array([len(video.data)])
50 | 
51 | y_pred = lipnet.predict(X_data)
52 | result = decoder.decode(y_pred, input_length)[0]
53 | 
54 | show_video_subtitle(video.face, result)
55 | print(result)
56 | 
57 | def predicts(weight_path, videos_path, absolute_max_string_len=32, output_size=28):
58 | videos = []
59 | for video_path in glob.glob(os.path.join(videos_path, '*')):
60 | videos.append(load(video_path))
61 | input("Press Enter to continue...")
62 | for video in videos:
63 | predict(weight_path, video)
64 | 
65 | def load(video_path):
66 | print("\n[{}]\nLoading data from disk...".format(video_path))
67 | video = Video(vtype='face', face_predictor_path=FACE_PREDICTOR_PATH)
68 | if os.path.isfile(video_path):
69 | video.from_video(video_path)
70 | else:
71 | video.from_frames(video_path)
72 | print("Data loaded.\n")
73 | return video
74 | 
75 | if __name__ == '__main__':
76 | if len(sys.argv) == 3:
77 | predicts(sys.argv[1], sys.argv[2])
78 | elif len(sys.argv) == 4:
79 | predicts(sys.argv[1], sys.argv[2], sys.argv[3])
80 | elif len(sys.argv) == 5:
81 | predicts(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4])
82 | else:
83 | pass
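Both prediction scripts post-process the network output with `Decoder(..., postprocessors=[labels_to_text, spell.sentence])`. The `labels_to_text` helper lives in the installed `lipnet` package rather than in this repository; with `output_size=28` the usual LipNet convention is indices 0–25 for 'a'–'z', 26 for space, and 27 for the CTC blank. The sketch below only illustrates that assumed mapping and is not the packaged implementation:

```python
# Hedged sketch of the label-to-character mapping implied by output_size=28.
# Assumption: 0-25 -> 'a'-'z', 26 -> space; blank (27) and padding (-1) are dropped.
def labels_to_text_sketch(labels):
    chars = []
    for c in labels:
        if 0 <= c < 26:
            chars.append(chr(c + ord('a')))   # letter
        elif c == 26:
            chars.append(' ')                 # word separator
        # CTC blank (27) and padding (-1) produce no character
    return ''.join(chars)

print(labels_to_text_sketch([1, 8, 13, 26, 1, 11, 20, 4]))  # -> "bin blue"
```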
-------------------------------------------------------------------------------- /src/mouth_scripts/aligns.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class Align(object): 4 | def __init__(self, absolute_max_string_len=32, label_func=None): 5 | self.label_func = label_func 6 | self.absolute_max_string_len = absolute_max_string_len 7 | 8 | def from_file(self, path): 9 | with open(path, 'r') as f: 10 | lines = f.readlines() 11 | align = [(int(y[0])/1000, int(y[1])/1000, y[2]) for y in [x.strip().split(" ") for x in lines]] 12 | self.build(align) 13 | return self 14 | 15 | def from_array(self, align): 16 | self.build(align) 17 | return self 18 | 19 | def build(self, align): 20 | self.align = self.strip(align, ['sp','sil']) 21 | self.sentence = self.get_sentence(align) 22 | self.label = self.get_label(self.sentence) 23 | self.padded_label = self.get_padded_label(self.label) 24 | 25 | def strip(self, align, items): 26 | return [sub for sub in align if sub[2] not in items] 27 | 28 | def get_sentence(self, align): 29 | return " ".join([y[-1] for y in align if y[-1] not in ['sp', 'sil']]) 30 | 31 | def get_label(self, sentence): 32 | return self.label_func(sentence) 33 | 34 | def get_padded_label(self, label): 35 | padding = np.ones((self.absolute_max_string_len-len(label))) * -1 36 | return np.concatenate((np.array(label), padding), axis=0) 37 | 38 | @property 39 | def word_length(self): 40 | return len(self.sentence.split(" ")) 41 | 42 | @property 43 | def sentence_length(self): 44 | return len(self.sentence) 45 | 46 | @property 47 | def label_length(self): 48 | return len(self.label) 49 | -------------------------------------------------------------------------------- /src/mouth_scripts/model2.py: -------------------------------------------------------------------------------- 1 | from keras.layers.convolutional import Conv3D, ZeroPadding3D 2 | from keras.layers.pooling import MaxPooling3D 3 | from keras.layers.core import Dense, Activation, SpatialDropout3D, Flatten 4 | from keras.layers.wrappers import Bidirectional, TimeDistributed 5 | from keras.layers.recurrent import GRU 6 | from keras.layers.normalization import BatchNormalization 7 | from keras.layers import Input 8 | from keras.models import Model 9 | from lipnet.core.layers import CTC 10 | from keras import backend as K 11 | 12 | 13 | class LipNet(object): 14 | def __init__(self, img_c=3, img_w=100, img_h=50, frames_n=75, absolute_max_string_len=32, output_size=28): 15 | self.img_c = img_c 16 | self.img_w = img_w 17 | self.img_h = img_h 18 | self.frames_n = frames_n 19 | self.absolute_max_string_len = absolute_max_string_len 20 | self.output_size = output_size 21 | self.build() 22 | 23 | def build(self): 24 | if K.image_data_format() == 'channels_first': 25 | input_shape = (self.img_c, self.frames_n, self.img_w, self.img_h) 26 | else: 27 | input_shape = (self.frames_n, self.img_w, self.img_h, self.img_c) 28 | 29 | self.input_data = Input(name='the_input', shape=input_shape, dtype='float32') 30 | 31 | self.zero1 = ZeroPadding3D(padding=(1, 2, 2), name='zero1')(self.input_data) 32 | self.conv1 = Conv3D(32, (3, 5, 5), strides=(1, 2, 2), kernel_initializer='he_normal', name='conv1')(self.zero1) 33 | self.batc1 = BatchNormalization(name='batc1')(self.conv1) 34 | self.actv1 = Activation('relu', name='actv1')(self.batc1) 35 | self.drop1 = SpatialDropout3D(0.5)(self.actv1) 36 | self.maxp1 = MaxPooling3D(pool_size=(1, 2, 2), strides=(1, 2, 2), name='max1')(self.drop1) 37 | 
38 | self.zero2 = ZeroPadding3D(padding=(1, 2, 2), name='zero2')(self.maxp1) 39 | self.conv2 = Conv3D(64, (3, 5, 5), strides=(1, 1, 1), kernel_initializer='he_normal', name='conv2')(self.zero2) 40 | self.batc2 = BatchNormalization(name='batc2')(self.conv2) 41 | self.actv2 = Activation('relu', name='actv2')(self.batc2) 42 | self.drop2 = SpatialDropout3D(0.5)(self.actv2) 43 | self.maxp2 = MaxPooling3D(pool_size=(1, 2, 2), strides=(1, 2, 2), name='max2')(self.drop2) 44 | 45 | self.zero3 = ZeroPadding3D(padding=(1, 1, 1), name='zero3')(self.maxp2) 46 | self.conv3 = Conv3D(96, (3, 3, 3), strides=(1, 1, 1), kernel_initializer='he_normal', name='conv3')(self.zero3) 47 | self.batc3 = BatchNormalization(name='batc3')(self.conv3) 48 | self.actv3 = Activation('relu', name='actv3')(self.batc3) 49 | self.drop3 = SpatialDropout3D(0.5)(self.actv3) 50 | self.maxp3 = MaxPooling3D(pool_size=(1, 2, 2), strides=(1, 2, 2), name='max3')(self.drop3) 51 | 52 | self.resh1 = TimeDistributed(Flatten())(self.maxp3) 53 | 54 | self.gru_1 = Bidirectional(GRU(256, return_sequences=True, kernel_initializer='Orthogonal', name='gru1'), merge_mode='concat')(self.resh1) 55 | self.gru_2 = Bidirectional(GRU(256, return_sequences=True, kernel_initializer='Orthogonal', name='gru2'), merge_mode='concat')(self.gru_1) 56 | 57 | # transforms RNN output to character activations: 58 | self.dense1 = Dense(self.output_size, kernel_initializer='he_normal', name='dense1')(self.gru_2) 59 | 60 | self.y_pred = Activation('softmax', name='softmax')(self.dense1) 61 | 62 | self.labels = Input(name='the_labels', shape=[self.absolute_max_string_len], dtype='float32') 63 | self.input_length = Input(name='input_length', shape=[1], dtype='int64') 64 | self.label_length = Input(name='label_length', shape=[1], dtype='int64') 65 | 66 | self.loss_out = CTC('ctc', [self.y_pred, self.labels, self.input_length, self.label_length]) 67 | 68 | self.model = Model(inputs=[self.input_data, self.labels, self.input_length, self.label_length], outputs=self.loss_out) 69 | 70 | def summary(self): 71 | Model(inputs=self.input_data, outputs=self.y_pred).summary() 72 | 73 | def predict(self, input_batch): 74 | return self.test_function([input_batch, 0])[0] # the first 0 indicates test 75 | 76 | @property 77 | def test_function(self): 78 | # captures output of softmax so we can decode the output during visualization 79 | return K.function([self.input_data, K.learning_phase()], [self.y_pred, K.learning_phase()]) -------------------------------------------------------------------------------- /src/mouth_scripts/videos.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | from keras import backend as K 4 | from scipy import ndimage 5 | from scipy.misc import imresize 6 | import skvideo.io 7 | import dlib 8 | from lipnet.lipreading.aligns import Align 9 | 10 | class VideoAugmenter(object): 11 | @staticmethod 12 | def split_words(video, align): 13 | video_aligns = [] 14 | for sub in align.align: 15 | # Create new video 16 | _video = Video(video.vtype, video.face_predictor_path) 17 | _video.face = video.face[sub[0]:sub[1]] 18 | _video.mouth = video.mouth[sub[0]:sub[1]] 19 | _video.set_data(_video.mouth) 20 | # Create new align 21 | _align = Align(align.absolute_max_string_len, align.label_func).from_array([(0, sub[1]-sub[0], sub[2])]) 22 | # Append 23 | video_aligns.append((_video, _align)) 24 | return video_aligns 25 | 26 | @staticmethod 27 | def merge(video_aligns): 28 | vsample = video_aligns[0][0] 29 | 
asample = video_aligns[0][1] 30 | video = Video(vsample.vtype, vsample.face_predictor_path) 31 | video.face = np.ones((0, vsample.face.shape[1], vsample.face.shape[2], vsample.face.shape[3]), dtype=np.uint8) 32 | video.mouth = np.ones((0, vsample.mouth.shape[1], vsample.mouth.shape[2], vsample.mouth.shape[3]), dtype=np.uint8) 33 | align = [] 34 | inc = 0 35 | for _video, _align in video_aligns: 36 | video.face = np.concatenate((video.face, _video.face), 0) 37 | video.mouth = np.concatenate((video.mouth, _video.mouth), 0) 38 | for sub in _align.align: 39 | _sub = (sub[0]+inc, sub[1]+inc, sub[2]) 40 | align.append(_sub) 41 | inc = align[-1][1] 42 | video.set_data(video.mouth) 43 | align = Align(asample.absolute_max_string_len, asample.label_func).from_array(align) 44 | return (video, align) 45 | 46 | @staticmethod 47 | def pick_subsentence(video, align, length): 48 | split = VideoAugmenter.split_words(video, align) 49 | start = np.random.randint(0, align.word_length - length) 50 | return VideoAugmenter.merge(split[start:start+length]) 51 | 52 | @staticmethod 53 | def pick_word(video, align): 54 | video_aligns = np.array(VideoAugmenter.split_words(video, align)) 55 | return video_aligns[np.random.randint(video_aligns.shape[0], size=2), :][0] 56 | 57 | @staticmethod 58 | def horizontal_flip(video): 59 | _video = Video(video.vtype, video.face_predictor_path) 60 | _video.face = np.flip(video.face, 2) 61 | _video.mouth = np.flip(video.mouth, 2) 62 | _video.set_data(_video.mouth) 63 | return _video 64 | 65 | @staticmethod 66 | def temporal_jitter(video, probability): 67 | changes = [] # [(frame_i, type=del/dup)] 68 | t = video.length 69 | for i in range(t): 70 | if np.random.ranf() <= probability/2: 71 | changes.append((i, 'del')) 72 | if probability/2 < np.random.ranf() <= probability: 73 | changes.append((i, 'dup')) 74 | _face = np.copy(video.face) 75 | _mouth = np.copy(video.mouth) 76 | j = 0 77 | for change in changes: 78 | _change = change[0] + j 79 | if change[1] == 'dup': 80 | _face = np.insert(_face, _change, _face[_change], 0) 81 | _mouth = np.insert(_mouth, _change, _mouth[_change], 0) 82 | j = j + 1 83 | else: 84 | _face = np.delete(_face, _change, 0) 85 | _mouth = np.delete(_mouth, _change, 0) 86 | j = j - 1 87 | _video = Video(video.vtype, video.face_predictor_path) 88 | _video.face = _face 89 | _video.mouth = _mouth 90 | _video.set_data(_video.mouth) 91 | return _video 92 | 93 | @staticmethod 94 | def pad(video, length): 95 | pad_length = max(length - video.length, 0) 96 | video_length = min(length, video.length) 97 | face_padding = np.ones((pad_length, video.face.shape[1], video.face.shape[2], video.face.shape[3]), dtype=np.uint8) * 0 98 | mouth_padding = np.ones((pad_length, video.mouth.shape[1], video.mouth.shape[2], video.mouth.shape[3]), dtype=np.uint8) * 0 99 | _video = Video(video.vtype, video.face_predictor_path) 100 | _video.face = np.concatenate((video.face[0:video_length], face_padding), 0) 101 | _video.mouth = np.concatenate((video.mouth[0:video_length], mouth_padding), 0) 102 | _video.set_data(_video.mouth) 103 | return _video 104 | 105 | 106 | class Video(object): 107 | def __init__(self, vtype='mouth', face_predictor_path=None): 108 | if vtype == 'face' and face_predictor_path is None: 109 | raise AttributeError('Face video need to be accompanied with face predictor') 110 | self.face_predictor_path = face_predictor_path 111 | self.vtype = vtype 112 | 113 | def from_frames(self, path): 114 | frames_path = sorted([os.path.join(path, x) for x in os.listdir(path)]) 115 | 
frames = [ndimage.imread(frame_path) for frame_path in frames_path] 116 | self.handle_type(frames) 117 | return self 118 | 119 | def from_video(self, path): 120 | frames = self.get_video_frames(path) 121 | self.handle_type(frames) 122 | return self 123 | 124 | def from_array(self, frames): 125 | self.handle_type(frames) 126 | return self 127 | 128 | def handle_type(self, frames): 129 | if self.vtype == 'mouth': 130 | self.process_frames_mouth(frames) 131 | elif self.vtype == 'face': 132 | self.process_frames_face(frames) 133 | else: 134 | raise Exception('Video type not found') 135 | 136 | def process_frames_face(self, frames): 137 | detector = dlib.get_frontal_face_detector() 138 | predictor = dlib.shape_predictor(self.face_predictor_path) 139 | mouth_frames = self.get_frames_mouth(detector, predictor, frames) 140 | self.face = np.array(frames) 141 | self.mouth = np.array(mouth_frames) 142 | self.set_data(mouth_frames) 143 | 144 | def process_frames_mouth(self, frames): 145 | self.face = np.array(frames) 146 | self.mouth = np.array(frames) 147 | self.set_data(frames) 148 | 149 | def get_frames_mouth(self, detector, predictor, frames): 150 | MOUTH_WIDTH = 100 151 | MOUTH_HEIGHT = 50 152 | HORIZONTAL_PAD = 0.19 153 | normalize_ratio = None 154 | mouth_frames = [] 155 | for frame in frames: 156 | dets = detector(frame, 1) 157 | shape = None 158 | for k, d in enumerate(dets): 159 | shape = predictor(frame, d) 160 | i = -1 161 | if shape is None: # Detector doesn't detect face, just return as is 162 | return frames 163 | mouth_points = [] 164 | for part in shape.parts(): 165 | i += 1 166 | if i < 48: # Only take mouth region 167 | continue 168 | mouth_points.append((part.x,part.y)) 169 | np_mouth_points = np.array(mouth_points) 170 | 171 | mouth_centroid = np.mean(np_mouth_points[:, -2:], axis=0) 172 | 173 | if normalize_ratio is None: 174 | mouth_left = np.min(np_mouth_points[:, :-1]) * (1.0 - HORIZONTAL_PAD) 175 | mouth_right = np.max(np_mouth_points[:, :-1]) * (1.0 + HORIZONTAL_PAD) 176 | 177 | normalize_ratio = MOUTH_WIDTH / float(mouth_right - mouth_left) 178 | 179 | new_img_shape = (int(frame.shape[0] * normalize_ratio), int(frame.shape[1] * normalize_ratio)) 180 | resized_img = imresize(frame, new_img_shape) 181 | 182 | mouth_centroid_norm = mouth_centroid * normalize_ratio 183 | 184 | mouth_l = int(mouth_centroid_norm[0] - MOUTH_WIDTH / 2) 185 | mouth_r = int(mouth_centroid_norm[0] + MOUTH_WIDTH / 2) 186 | mouth_t = int(mouth_centroid_norm[1] - MOUTH_HEIGHT / 2) 187 | mouth_b = int(mouth_centroid_norm[1] + MOUTH_HEIGHT / 2) 188 | 189 | mouth_crop_image = resized_img[mouth_t:mouth_b, mouth_l:mouth_r] 190 | 191 | mouth_frames.append(mouth_crop_image) 192 | return mouth_frames 193 | 194 | def get_video_frames(self, path): 195 | videogen = skvideo.io.vreader(path) 196 | frames = np.array([frame for frame in videogen]) 197 | return frames 198 | 199 | def set_data(self, frames): 200 | data_frames = [] 201 | for frame in frames: 202 | frame = frame.swapaxes(0,1) # swap width and height to form format W x H x C 203 | if len(frame.shape) < 3: 204 | frame = np.array([frame]).swapaxes(0,2).swapaxes(0,1) # Add grayscale channel 205 | data_frames.append(frame) 206 | frames_n = len(data_frames) 207 | data_frames = np.array(data_frames) # T x W x H x C 208 | if K.image_data_format() == 'channels_first': 209 | data_frames = np.rollaxis(data_frames, 3) # C x T x W x H 210 | self.data = data_frames 211 | self.length = frames_n 
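For reference, a minimal usage sketch of the Video pipeline defined above (not part of the original file). It assumes this module is importable as lipnet.lipreading.videos, mirroring the Align import at the top of the file, that the working directory is src/ so the sample GRID clip resolves, and that a dlib 68-point facial landmark model is available at the hypothetical path shown.

from lipnet.lipreading.videos import Video, VideoAugmenter

PREDICTOR_PATH = 'common/predictors/shape_predictor_68_face_landmarks.dat'  # hypothetical path to the dlib landmark model

# Load a GRID clip, detect the face with dlib, and crop the 100x50 mouth region in every frame.
video = Video(vtype='face', face_predictor_path=PREDICTOR_PATH).from_video('GRID/bbaf2n.mpg')
print(video.data.shape)  # T x W x H x C (rolled to C x T x W x H when the backend is channels_first)

# Augmentations used during training: mirror the clip, randomly drop/duplicate frames,
# then pad or trim to the fixed 75-frame length used by train.py.
flipped = VideoAugmenter.horizontal_flip(video)
jittered = VideoAugmenter.temporal_jitter(video, probability=0.05)
padded = VideoAugmenter.pad(jittered, length=75)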
-------------------------------------------------------------------------------- /src/page.py: -------------------------------------------------------------------------------- 1 | from tkinter import * 2 | from tkinter import filedialog 3 | from PIL import Image, ImageTk 4 | import os 5 | import tkinter as tk, threading 6 | 7 | 8 | global UploadAction 9 | global file,video 10 | 11 | def fnm(): 12 | global filename 13 | filename = filedialog.askopenfilename() 14 | return filename 15 | 16 | def UploadAction(event=None): 17 | fname=fnm() 18 | print(fname) 19 | 20 | def btn_clicked(): 21 | print("Button Clicked") 22 | 23 | 24 | def stream(label): 25 | os.system(filename) 26 | 27 | def func(): 28 | my_label = tk.Label(window) 29 | my_label.pack() 30 | thread = threading.Thread(target=stream, args=(my_label,)) 31 | thread.daemon = 1 32 | thread.start() 33 | 34 | def openProgram(): 35 | os.system("python evaluation\predict.py evaluation\models\overlapped-weights368.h5 " + filename) 36 | 37 | def new_window(): 38 | root.destroy() 39 | global window 40 | window = Tk() 41 | 42 | window.geometry("862x519") 43 | window.title("Intuitive Perception") 44 | img = ImageTk.PhotoImage(Image.open("images/icon.jpeg")) 45 | window.iconphoto(False,img) 46 | window.configure(bg = "#ffffff") 47 | canvas = Canvas( 48 | window, 49 | bg = "#ffffff", 50 | height = 519, 51 | width = 862, 52 | bd = 0, 53 | highlightthickness = 0, 54 | relief = "ridge") 55 | canvas.place(x = 0, y = 0) 56 | 57 | 58 | canvas.create_rectangle( 59 | 0, 472, 0+862, 472+47, 60 | fill = "#0d0b72", 61 | outline = "") 62 | 63 | 64 | canvas.create_rectangle( 65 | 0, 0, 0+862, 0+27, 66 | fill = "#0d0b72", 67 | outline = "") 68 | 69 | canvas.create_text( 70 | 431.0, 102.5, 71 | text = "Intuitive Perception", 72 | fill = "#080c63", 73 | font = ("CenturyGothic-Bold", int(24.0))) 74 | 75 | canvas.create_text( 76 | 430.5, 165.0, 77 | text = "Upload a video to generate subtitles", 78 | fill = "#000000", 79 | font = ("CenturyGothic-Bold", int(13.0))) 80 | 81 | img0 = PhotoImage(file = f"images/img0.png") 82 | b0 = Button( 83 | image = img0, 84 | borderwidth = 0, 85 | highlightthickness = 0, 86 | command = UploadAction, 87 | relief = "flat") 88 | 89 | b0.place( 90 | x = 187, y = 313, 91 | width = 127, 92 | height = 38) 93 | 94 | img1 = PhotoImage(file = f"images/img1.png") 95 | b1 = Button( 96 | image = img1, 97 | borderwidth = 0, 98 | highlightthickness = 0, 99 | command = func, 100 | relief = "flat") 101 | 102 | b1.place( 103 | x = 375, y = 313, 104 | width = 127, 105 | height = 38) 106 | 107 | img2 = PhotoImage(file = f"images/img2.png") 108 | b2 = Button( 109 | image = img2, 110 | borderwidth = 0, 111 | highlightthickness = 0, 112 | command = openProgram, 113 | relief = "flat") 114 | 115 | b2.place( 116 | x = 558, y = 313, 117 | width = 127, 118 | height = 38) 119 | 120 | image_0 = PhotoImage(file = "images/image_0.png") 121 | canvas_image_0 = canvas.create_image( 122 | 625.0, 272.0, 123 | image=image_0) 124 | 125 | image_1 = PhotoImage(file = "images/image_1.png") 126 | canvas_image_1 = canvas.create_image( 127 | 439.0, 271.0, 128 | image=image_1) 129 | 130 | image_2 = PhotoImage(file = "images/image_2.png") 131 | canvas_image_2 = canvas.create_image( 132 | 250.5, 272.0, 133 | image=image_2) 134 | 135 | window.resizable(False, False) 136 | window.mainloop() 137 | 138 | 139 | root = Tk() 140 | 141 | root.geometry("862x519") 142 | root.title("Intuitive Perception") 143 | img = ImageTk.PhotoImage(Image.open("images/icon.jpeg")) 144 | 
root.iconphoto(False,img) 145 | root.configure(bg = "#080c63") 146 | canvas = Canvas( 147 | root, 148 | bg = "#080c63", 149 | height = 519, 150 | width = 862, 151 | bd = 0, 152 | highlightthickness = 0, 153 | relief = "ridge") 154 | canvas.place(x = 0, y = 0) 155 | canvas.create_rectangle( 156 | 431, 0, 431+431, 0+519, 157 | fill = "#ffffff", 158 | outline = "") 159 | 160 | title = Label(text="Welcome to Intuitive Perception", bg="#080c63",fg="white",font=("CenturyGothic-Bold",int(16.0))) 161 | title.place(x=30.0,y=78.0) 162 | 163 | canvas.create_rectangle( 164 | 55, 130, 55 + 240, 130 + 5, 165 | fill = "#ffffff", 166 | outline = "") 167 | 168 | info_text = Label(text="Use our application to generate subtitles \n" 169 | "for videos.\n\n" 170 | 171 | "What makes Intuitive Perception better?\n\n" 172 | 173 | "We aim to generate subtitles for videos\n" 174 | "independent of the audio.\n\n" 175 | 176 | "Our software uses machine learning to\n" 177 | "achieve this with an accuracy rate\n" 178 | "between 70-80%.", 179 | bg="#080c63",fg="white",justify="left",font=("CenturyGothic",int(13.0))) 180 | 181 | info_text.place(x=27.0,y=178.0) 182 | 183 | 184 | canvas.create_text( 185 | 650.5,88.0, 186 | text = "Instructions", 187 | fill = "#080c63", 188 | font = ("CenturyGothic-Bold", int(20.0))) 189 | 190 | steps_text = Label(text="Follow these 3 simple instructions.\n\n\n" 191 | "Step 1: Click on the Add Video button to add\n" 192 | " the video of your choice.\n\n" 193 | "Step 2: You can verify the video selected\n" 194 | " by clicking on the Play Video button.\n\n" 195 | "Step 3: To process the video and get the\n" 196 | " subtitles, click on the Generate Subtitles\n" 197 | " button.\n", 198 | bg="#ffffff",fg="#080c63",justify="left",font=("CenturyGothic",int(13.0))) #black :#000000 199 | 200 | steps_text.place(x=470,y=120.0) 201 | 202 | img0 = PhotoImage(file = f"images/img-0.png") 203 | b0 = Button( 204 | image = img0, 205 | borderwidth = 0, 206 | highlightthickness = 0, 207 | command = lambda: new_window(), 208 | relief = "flat") 209 | b0.place( 210 | x = 549, y = 400, 211 | width = 195, 212 | height = 54) 213 | 214 | 215 | 216 | 217 | 218 | root.resizable(False, False) 219 | root.mainloop() -------------------------------------------------------------------------------- /src/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | setup(name='lipnet', 4 | version='0.1.6', 5 | description='End-to-end sentence-level lipreading', 6 | packages=['lipnet'], 7 | zip_safe=False, 8 | install_requires=[ 9 | 'Keras==2.0.2', 10 | 'editdistance==0.3.1', 11 | 'h5py==2.10.0', 12 | 'matplotlib==2.2.5', 13 | 'numpy==1.19.1', 14 | 'python-dateutil==2.8.0', 15 | 'scipy==1.2.3', 16 | 'Pillow==4.3.0', 17 | 'tensorflow==1.13.1', 18 | 'Theano==0.9.0', 19 | 'nltk==3.2.2', 20 | 'sk-video==1.1.10', 21 | 'dlib' 22 | ]) -------------------------------------------------------------------------------- /src/training/overlapped_speakers/prepare.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | import subprocess 4 | import sys 5 | 6 | ''' 7 | This script prepares the training folders and datasets for each speaker.
8 | - Folder s{i}/datasets/train would contain every speaker's videos except the first VAL_SAMPLES videos of s{i} 9 | - Folder s{i}/datasets/val would contain the first VAL_SAMPLES videos of s{i} 10 | - Folder s{i}/datasets/align would contain all your *.align 11 | 12 | Usage: 13 | $ python prepare.py [Path to video dataset] [Path to align dataset] [Number of samples] 14 | 15 | Notes: 16 | - [Path to video dataset] should be a folder with structure: /s{i}/[video] 17 | - [Path to align dataset] should be a folder with structure: /[align].align 18 | - [Number of samples] should be less than or equal to min(len(ls '/s{i}/*')) 19 | ''' 20 | # Paths of the video and align folders; the number of samples sets how many of each speaker's videos are held out for validation. 21 | 22 | CURRENT_PATH = os.path.dirname(os.path.abspath(__file__)) 23 | DATASET_VIDEO_PATH = sys.argv[1] 24 | DATASET_ALIGN_PATH = sys.argv[2] 25 | 26 | VAL_SAMPLES = int(sys.argv[3]) 27 | 28 | 29 | for speaker_path in glob.glob(os.path.join(DATASET_VIDEO_PATH, '*')):# iterate over every speaker folder in the video dataset 30 | speaker_id = os.path.splitext(speaker_path)[0].split('\\')[-1]# extract the speaker id (s1, s2, s3, ...) from the path 31 | print(speaker_id)# show which speaker is currently being prepared 32 | subprocess.check_output("mkdir {}".format(os.path.join(CURRENT_PATH, speaker_id, 'datasets', 'train')), shell=True) # create s{i}/datasets/train next to prepare.py 33 | for s_path in glob.glob(os.path.join(DATASET_VIDEO_PATH, '*')):# second pass over all speaker folders 34 | s_id = os.path.splitext(s_path)[0].split('\\')[-1]# speaker id of the inner-loop folder 35 | 36 | if s_path == speaker_path:# this is the held-out speaker: split its videos between val and train 37 | subprocess.check_output("mkdir {}".format(os.path.join(CURRENT_PATH, speaker_id, 'datasets', 'train', s_id)), shell=True)# datasets/train/s{i} 38 | subprocess.check_output("mkdir {}".format(os.path.join(CURRENT_PATH, speaker_id, 'datasets', 'val', s_id)), shell=True)# datasets/val/s{i} 39 | n = 0 40 | for video_path in glob.glob(os.path.join(DATASET_VIDEO_PATH, speaker_id, '*')):# walk through this speaker's videos 41 | video_id = os.path.basename(video_path)# name of the video file (*.mpg) 42 | if n < VAL_SAMPLES:# the first VAL_SAMPLES videos go to validation, e.g.
350 43 | subprocess.check_output("mklink {} {}".format(os.path.join(CURRENT_PATH, speaker_id, 'datasets', 'val', s_id, video_id), video_path), shell=True)#first 350 is for validation 44 | else: 45 | subprocess.check_output("mklink {} {}".format(os.path.join(CURRENT_PATH, speaker_id, 'datasets', 'train', s_id, video_id), video_path), shell=True)#remaining for training 46 | n += 1 47 | else: 48 | subprocess.check_output("mklink /D {} {}".format( os.path.join(CURRENT_PATH, speaker_id, 'datasets', 'train', s_id), s_path), shell=True)#all files will be taken for training 49 | subprocess.check_output("mklink /D {} {}".format(os.path.join(CURRENT_PATH, speaker_id, 'datasets', 'align'), DATASET_ALIGN_PATH), shell=True)#creating dataset/align -------------------------------------------------------------------------------- /src/training/overlapped_speakers/train.py: -------------------------------------------------------------------------------- 1 | from keras.optimizers import Adam 2 | from keras.callbacks import TensorBoard, CSVLogger, ModelCheckpoint 3 | from lipnet.lipreading.generators import BasicGenerator 4 | from lipnet.lipreading.callbacks import Statistics, Visualize 5 | from lipnet.lipreading.curriculums import Curriculum 6 | from lipnet.core.decoders import Decoder 7 | from lipnet.lipreading.helpers import labels_to_text 8 | from lipnet.utils.spell import Spell 9 | from lipnet.model2 import LipNet 10 | import numpy as np 11 | import datetime 12 | import os 13 | import sys 14 | 15 | np.random.seed(55) 16 | 17 | CURRENT_PATH = os.path.dirname(os.path.abspath(__file__)) 18 | 19 | PREDICT_GREEDY = False 20 | PREDICT_BEAM_WIDTH = 200 21 | PREDICT_DICTIONARY = os.path.join(CURRENT_PATH,'..','..','common','dictionaries','grid.txt') 22 | 23 | def curriculum_rules(epoch): 24 | return { 'sentence_length': -1, 'flip_probability': 0.5, 'jitter_probability': 0.05 } 25 | 26 | 27 | def train(run_name, speaker, start_epoch, stop_epoch, img_c, img_w, img_h, frames_n, absolute_max_string_len, minibatch_size): 28 | DATASET_DIR = os.path.join(CURRENT_PATH, speaker, 'datasets') 29 | OUTPUT_DIR = os.path.join(CURRENT_PATH, speaker, 'results') 30 | LOG_DIR = os.path.join(CURRENT_PATH, speaker, 'logs') 31 | 32 | curriculum = Curriculum(curriculum_rules) 33 | lip_gen = BasicGenerator(dataset_path=DATASET_DIR, 34 | minibatch_size=minibatch_size, 35 | img_c=img_c, img_w=img_w, img_h=img_h, frames_n=frames_n, 36 | absolute_max_string_len=absolute_max_string_len, 37 | curriculum=curriculum, start_epoch=start_epoch).build() 38 | 39 | lipnet = LipNet(img_c=img_c, img_w=img_w, img_h=img_h, frames_n=frames_n, 40 | absolute_max_string_len=absolute_max_string_len, output_size=lip_gen.get_output_size()) 41 | lipnet.summary() 42 | 43 | adam = Adam(lr=0.0001, beta_1=0.9, beta_2=0.999, epsilon=1e-08) 44 | 45 | # the loss calc occurs elsewhere, so use a dummy lambda func for the loss 46 | lipnet.model.compile(loss={'ctc': lambda y_true, y_pred: y_pred}, optimizer=adam) 47 | 48 | # load weight if necessary 49 | if start_epoch > 0: 50 | weight_file = os.path.join(OUTPUT_DIR, os.path.join(run_name, 'weights%02d.h5' % (start_epoch - 1))) 51 | lipnet.model.load_weights(weight_file) 52 | 53 | spell = Spell(path=PREDICT_DICTIONARY) 54 | decoder = Decoder(greedy=PREDICT_GREEDY, beam_width=PREDICT_BEAM_WIDTH, 55 | postprocessors=[labels_to_text, spell.sentence]) 56 | 57 | # define callbacks 58 | statistics = Statistics(lipnet, lip_gen.next_val(), decoder, 256, output_dir=os.path.join(OUTPUT_DIR, run_name)) 59 | visualize = 
Visualize(os.path.join(OUTPUT_DIR, run_name), lipnet, lip_gen.next_val(), decoder, num_display_sentences=minibatch_size) 60 | tensorboard = TensorBoard(log_dir=os.path.join(LOG_DIR, run_name)) 61 | csv_logger = CSVLogger(os.path.join(LOG_DIR, "{}-{}.csv".format('training',run_name)), separator=',', append=True) 62 | checkpoint = ModelCheckpoint(os.path.join(OUTPUT_DIR, run_name, "weights{epoch:02d}.h5"), monitor='val_loss', save_weights_only=True, mode='auto', period=1) 63 | 64 | lipnet.model.fit_generator(generator=lip_gen.next_train(), 65 | steps_per_epoch=lip_gen.default_training_steps, epochs=stop_epoch, 66 | validation_data=lip_gen.next_val(), validation_steps=lip_gen.default_validation_steps, 67 | callbacks=[checkpoint, statistics, visualize, lip_gen, tensorboard, csv_logger], 68 | initial_epoch=start_epoch, 69 | verbose=1, 70 | max_q_size=5, 71 | workers=2, 72 | pickle_safe=False) 73 | 74 | if __name__ == '__main__': 75 | run_name = datetime.datetime.now().strftime('%Y%m%d%H%M%S') 76 | speaker = sys.argv[1] 77 | train(run_name, speaker, 0, 500, 3, 360, 288, 75, 32, 50) 78 | --------------------------------------------------------------------------------
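For reference, the positional arguments in the __main__ block of train.py above map onto named hyper-parameters. A typical run first builds the per-speaker folders with prepare.py (for example, python prepare.py [Path to video dataset] [Path to align dataset] 350, following its usage string) and then launches training with python train.py s1. The sketch below is an equivalent direct call with the same values; the import assumes the train.py shown above is on the Python path.

import datetime
from train import train  # assumes train.py (above) is importable from the working directory

run_name = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
train(run_name, 's1',                  # speaker folder prepared by prepare.py
      start_epoch=0, stop_epoch=500,
      img_c=3, img_w=360, img_h=288,   # colour channels, frame width and frame height
      frames_n=75,                     # frames per clip
      absolute_max_string_len=32,      # maximum label length for the CTC inputs
      minibatch_size=50)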