├── LICENSE
├── README.md
├── TDLC
│   ├── TDLC.pde
│   └── data
│       └── Garuda-48.vlw
├── audioStitcher.py
├── cropper.py
├── dump.py
├── faceReadTest.py
├── getAudio.py
├── imageTest.py
├── key.txt
├── lipTester.py
├── phoframeTest.py
├── phoframeTrain.py
├── phoframes.txt
├── pyTubeShort.py
├── pytubeTest.py
├── samples
│   ├── frame120030.png
│   ├── frame120031.png
│   ├── … (frame120032.png through frame120128.png, 100 consecutive frames in all)
│   └── frame120129.png
├── trainingDataVisualizer.py
├── turnPhonemesToPhoframes.py
├── videoContinue.py
└── videoGetter.py

/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2018 carykh

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# videoToVoice
These files take in a sequence of lip images and predict the phonemes being said.

pytubeTest.py takes in a YouTube URL, downloads that video onto the computer, turns the video into an image sequence, tries to find faces in the images, and also extracts the audio from the video and saves that, too. Earlier, we tried to get pytubeTest.py to also convert the audio into spectrograms with ARSS in the same script, but that just didn't work: the libraries required for the first steps only work in Ubuntu, and ARSS only works in Windows.

pyTubeShort.py does the same thing as pytubeTest.py, but doesn't download the video from YouTube. Instead, it just takes a file from a local directory.

getAudio.py takes in a video file and saves that video's audio track to a new file.

audioStitcher.py is very simple: it just takes in two audio files, stitches them together, and saves the result.

lipTester.py takes in a sequence of face images and crops each one so that the new folder of images only shows the speaker's lips (plus a margin of about 25 pixels).

turnPhonemesToPhoframes.py takes the JSON output that Gentle creates. (This is a time-aligned transcript of what was spoken in the video: e.g., when I said the Bee Movie script, this JSON file has the timestamps at which I said every phoneme of the movie.) It then turns that JSON file into phoframes.txt, a text file listing which phoneme is said at every video frame (1/30th of a second).

key.txt tells us which number corresponds to which phoneme, so we can read phoframes.txt more easily!

phoframes.txt tells us which phoneme is being said at every frame of the video. This is the ground truth. Every value is a number, which can be converted back into a phoneme using key.txt.

phoframeTrain.py creates the neural network architecture and trains it on processed data. (Note: this file describes the neural network architecture in the most detail.)

phoframeTest.py takes in a pre-trained model and a sequence of silent images, and generates a text file predicting which phonemes should go along with said video.

OUTDATED FILES

imageTest.py was an experimental dumping ground for figuring out how to use PIL, which I don't think I ended up using.

dump.py is where I tested helper functions such as the spectrogram smoother and the video-frame accessor.

faceReadTest.py is where I tested the face_recognition library. It ended up working, but it snaps to the nearest ~30 pixels for some reason, so we decided not to use it for now.

cropper.py crops an image to only show the middle section (middle 40% horizontally and middle 50% vertically), although this is only used for explanation purposes (phoframeTrain.py has a cropping function within it).

videoGetter.py was a short script we used to extract all the images from a downloaded YouTube video when pytubeTest.py crashed for some reason.

videoContinue.py is videoGetter's sequel: it does the same thing, but starts in the middle.

trainingDataVisualizer.py was my first attempt at making the pretty bar graphs that show the NN's prediction of phonemes at each frame. The new and improved version is the .pde file.
--------------------------------------------------------------------------------
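A note on the two data files described above: key.txt has one "<index><TAB><phoneme>" line per phoneme, and phoframes.txt has one phoneme index per video frame. Here's a minimal sketch (mine, not a repo script) of how they fit together:

# Sketch: decode phoframes.txt back into phoneme names using key.txt.
key = {}
with open("key.txt") as kf:
    for line in kf:
        if line.strip():
            index, phoneme = line.strip().split("\t")
            key[int(index)] = phoneme

with open("phoframes.txt") as pf:
    phoframes = [int(line) for line in pf if line.strip()]

# e.g. the phoneme spoken at frame 1000 (each frame is 1/30th of a second):
print(key[phoframes[1000]])
--------------------------------------------------------------------------------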
/TDLC/TDLC.pde:
--------------------------------------------------------------------------------
int img = 90000; // first frame to display
PFont font;
String[] phos;
String[] keys;

void setup() {
  phos = loadStrings("/media/rob/Ma Book1/CS 230/videoToVoice/3/phoframes.txt");
  keys = loadStrings("/media/rob/Ma Book1/CS 230/videoToVoice/3/key.txt");

  font = loadFont("Garuda-48.vlw");
  textFont(font, 24);
  size(400, 250);
  frameRate(2);
}

void draw() {
  if (img < 200 || true) {
    String s = img + "";
    while (s.length() < 4) {
      s = "0" + s;
    }
    background(0);
    PImage mouthImage = loadImage("/media/rob/Ma Book1/CS 230/videoToVoice/3/mouthImages/frame" + s + ".jpg");
    image(mouthImage, 0, 0);
    saveFrame("/media/rob/Ma Book1/CS 230/videoToVoice/3/lineupCheck/check" + s + ".jpg");

    // Show the phoneme labels for the 5 frames before and after this one,
    // with the current frame's phoneme highlighted in red.
    for (int y = -5; y <= 5; y++) {
      String val = keys[Integer.parseInt(phos[img + y])];
      if (y == 0) {
        fill(255, 0, 0);
      } else {
        fill(255);
      }
      text(val.split("\t")[1], 240, 160 + 24 * y);
    }
  }
  img++;
}
--------------------------------------------------------------------------------
/TDLC/data/Garuda-48.vlw:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/carykh/videoToVoice/7c782f22cf3948e28862a334df2f6deec24eedb3/TDLC/data/Garuda-48.vlw
--------------------------------------------------------------------------------
/audioStitcher.py:
--------------------------------------------------------------------------------
import wave

infiles = ["3/audiop1.wav", "3/audiop2.wav"]
outfile = "3/fullAudio.wav"

data = []
for infile in infiles:
    w = wave.open(infile, 'rb')
    data.append([w.getparams(), w.readframes(w.getnframes())])
    w.close()

# The output inherits the first file's parameters (channels, sample rate, etc.),
# so both inputs should share them.
output = wave.open(outfile, 'wb')
output.setparams(data[0][0])
output.writeframes(data[0][1])
output.writeframes(data[1][1])
output.close()
--------------------------------------------------------------------------------
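Since audioStitcher.py reuses the first file's WAV parameters for the whole output, the two inputs need matching formats. A quick compatibility check (a sketch of mine, not part of the repo):

import wave

def can_stitch(path_a, path_b):
    # True if the two WAV files share channel count, sample width, rate and compression.
    with wave.open(path_a, 'rb') as a, wave.open(path_b, 'rb') as b:
        pa, pb = a.getparams(), b.getparams()
    return (pa.nchannels, pa.sampwidth, pa.framerate, pa.comptype) == \
           (pb.nchannels, pb.sampwidth, pb.framerate, pb.comptype)

assert can_stitch("3/audiop1.wav", "3/audiop2.wav"), "WAV parameters differ"
--------------------------------------------------------------------------------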
/cropper.py:
--------------------------------------------------------------------------------
import numpy as np
from scipy import misc

# Crop the middle section (middle 40% horizontally, middle 50% vertically,
# assuming 1280x720 frames) out of the first 20 frames.
for i in range(0, 20):
    strIndex = str(i)
    while len(strIndex) < 6:
        strIndex = "0" + strIndex
    arr = misc.imread('2/origImages/frame' + strIndex + '.jpg')
    misc.imsave('2/croppedImages/frame' + strIndex + '.jpg', arr[180:540, 384:896])
--------------------------------------------------------------------------------
/dump.py:
--------------------------------------------------------------------------------
import numpy as np
from scipy import misc

INSPEC_WIDTH = 240   # 2 seconds
INSPEC_HEIGHT = 368

def readAndClipImage(i):
    if i < 0 or i > 90:
        return np.zeros((INSPEC_HEIGHT, INSPEC_WIDTH, 1))
    arr = misc.imread('2/audioSnippets/' + str(i) + '.jpg')
    if i == 0:
        return arr[:, :]
    else:  # (the original's separate i == 90 branch was identical to this one)
        return arr[:, 120:]  # trim the 120-pixel overlap at the start of every later snippet


def getSpecAtFrame(f, w):
    specIndex = (f // 300)  # each spectrogram snippet covers 300 video frames (10 s)

    arr = np.zeros((INSPEC_HEIGHT, INSPEC_WIDTH))

    specImageFile = readAndClipImage(specIndex)
    prevSpecImageFile = readAndClipImage(specIndex - 1)

    mod = f % 300  # (was `frameIndex % 300`, which only worked via the global defined below)

    if mod < w:  # The previous 2 seconds is going to bleed into the previous section
        seamSpot = (w - mod) * 4  # 4 spectrogram pixels per video frame
        arr[:, seamSpot:] = specImageFile[:, 0:mod * 4, 0]
        arr[:, :seamSpot] = prevSpecImageFile[:, 1200 - seamSpot:1200, 0]
        for col in range(seamSpot, min(seamSpot + w, INSPEC_WIDTH)):  # 60-pixel smoothing between one portion and the next, cuz I'm fancy.
            sFrom = prevSpecImageFile[:, 1200 + col - seamSpot, 0]
            sTo = specImageFile[:, col - seamSpot, 0]
            prog = (col - seamSpot) / 60.0
            arr[:, col] = sFrom + (sTo - sFrom) * prog  # linear crossfade across the seam
    else:
        arr = specImageFile[:, (mod - w) * 4:mod * 4, 0]
    return np.asarray(arr) / 255.0

def getInSpecAtFrame(f):
    return getSpecAtFrame(f, 60)

def getOutSpecAtFrame(f):
    return getSpecAtFrame(f + 2, 2)


frameIndex = 5125

misc.imsave('dump9.png', getOutSpecAtFrame(frameIndex))
--------------------------------------------------------------------------------
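The magic numbers in getSpecAtFrame fit together like this (my reading of the code, stated as a sketch rather than documented fact): each audio-snippet JPEG is 1200 px wide and covers 300 video frames (10 s at 30 fps), so one frame spans 4 px, and a 2-second window is 240 px.

# Sanity check of the geometry assumed by getSpecAtFrame (not repo code):
FPS = 30                    # video frame rate
FRAMES_PER_SNIPPET = 300    # the `f // 300` above
SNIPPET_WIDTH_PX = 1200     # the hard-coded 1200s above
PX_PER_FRAME = SNIPPET_WIDTH_PX // FRAMES_PER_SNIPPET
assert PX_PER_FRAME == 4                  # the `* 4` factor
assert 60 * PX_PER_FRAME == 240           # 60 frames (2 s) -> INSPEC_WIDTH
assert FRAMES_PER_SNIPPET / FPS == 10.0   # each snippet spans 10 seconds
--------------------------------------------------------------------------------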
/faceReadTest.py:
--------------------------------------------------------------------------------
import face_recognition
from PIL import Image

# (The original referenced `faceFolderName` and `frame` without defining them --
# they were carried over from pytubeTest.py. Defined here so the test runs standalone.)
faceFolderName = "2/faceImages"

image = face_recognition.load_image_file("2/origImages/frame0000.jpg")

face_locations = face_recognition.face_locations(image)

if len(face_locations) == 1:
    top, right, bottom, left = face_locations[0]
    faceFilename = faceFolderName + "/" + "frame{:04d}.jpg".format(0)
    height = top - bottom  # negative: top < bottom in image coordinates
    frame = Image.open("2/origImages/frame0000.jpg")
    faceFrame = frame.crop((left, top, right, bottom - height * 0.3))  # extend the box 30% below the detected face
    faceFrame.save(faceFilename)
--------------------------------------------------------------------------------
/getAudio.py:
--------------------------------------------------------------------------------
import subprocess

# Extract the audio track: 160 kbps, stereo, 44.1 kHz, no video (-vn).
command = "ffmpeg -i 3/IMG_4700.MOV -ab 160k -ac 2 -ar 44100 -vn 3/audiop2.wav"

subprocess.call(command, shell=True)
--------------------------------------------------------------------------------
/imageTest.py:
--------------------------------------------------------------------------------
from PIL import Image
from PIL import ImageFont
from PIL import ImageDraw

img = Image.open("sample_in.jpg")
draw = ImageDraw.Draw(img)
# font = ImageFont.truetype(<font file>, <size>)
font = ImageFont.truetype("sans-serif.ttf", 16)
# draw.text((x, y), "Sample Text", (r, g, b))
draw.text((0, 0), "Sample Text", (255, 255, 255), font=font)
img.save('sample-out.jpg')
--------------------------------------------------------------------------------
/key.txt:
--------------------------------------------------------------------------------
0	silence
1	k
2	ao
3	r
4	d
5	ih
6	ng
7	t
8	ah
9	l
10	n
11	ow
12	z
13	v
14	ey
15	iy
16	sh
17	dh
18	eh
19	w
20	b
21	uh
22	f
23	ay
24	s
25	aa
26	uw
27	m
28	g
29	ae
30	aw
31	hh
32	y
33	th
34	p
35	oov
36	er
37	jh
38	ch
39	oy
40	zh
--------------------------------------------------------------------------------
/lipTester.py:
--------------------------------------------------------------------------------
import face_recognition
from scipy import misc

margin = 25
maxWidth = 0
maxHeight = 0

# frames with my hand in front of my mouth: 3197 - 3224

for i in range(0, 131663):
    strIndex = str(i)
    while len(strIndex) < 4:
        strIndex = "0" + strIndex

    image = face_recognition.load_image_file("/media/rob/Ma Book1/CS 230/videoToVoice/3/origImages/frame" + strIndex + ".jpg")
    face_landmarks_list = face_recognition.face_landmarks(image)

    if len(face_landmarks_list) >= 1:
        # Bounding box around all top-lip and bottom-lip landmark points.
        xMin = 999999
        xMax = -999999
        yMin = 999999
        yMax = -999999

        points = face_landmarks_list[0]['bottom_lip'] + face_landmarks_list[0]['top_lip']

        for point in points:
            if point[0] < xMin:
                xMin = point[0]
            if point[0] > xMax:
                xMax = point[0]
            if point[1] < yMin:
                yMin = point[1]
            if point[1] > yMax:
                yMax = point[1]

        if yMax - yMin > maxHeight:
            maxHeight = yMax - yMin

        if xMax - xMin > maxWidth:
            maxWidth = xMax - xMin

        arr = misc.imread("3/origImages/frame" + strIndex + ".jpg")
        misc.imsave("3/mouthImages/frame" + strIndex + ".jpg", arr[yMin - margin:yMax + margin, xMin - margin:xMax + margin])
    print("FINISHED IMAGE #" + str(i) + ". Also, the maximum dimensions are " + str(maxWidth) + " x " + str(maxHeight))
--------------------------------------------------------------------------------
/phoframeTest.py:
--------------------------------------------------------------------------------
import tensorflow as tf
import numpy as np
from scipy import misc
import random
import math
import os

phoframeFile = open("/media/rob/Ma Book1/CS 230/videoToVoice/3/phoframes.txt", "r")

phoframes = phoframeFile.read().split("\n")

FOLDER_SAVE_NAME = "phoframe41"

if not os.path.exists(FOLDER_SAVE_NAME):
    os.makedirs(FOLDER_SAVE_NAME)

if not os.path.exists(FOLDER_SAVE_NAME + "/samples"):
    os.makedirs(FOLDER_SAVE_NAME + "/samples")

if not os.path.exists(FOLDER_SAVE_NAME + "/models"):
    os.makedirs(FOLDER_SAVE_NAME + "/models")

def weight_variable(shape):
    initial = tf.truncated_normal(shape, stddev=0.1)
    return tf.Variable(initial)

def bias_variable(shape):
    initial = tf.constant(0.1, shape=shape)
    return tf.Variable(initial)

def getRandomFrame():  # (not used in this script; kept from the training script)
    #return 5+int(math.floor(random.randrange(0, 60)))
    f = int(math.floor(random.randrange(14, 120000)))
    while not isFValid(f):  # Exclude portions of the video with no visible mouth
        f = int(math.floor(random.randrange(14, 120000)))
    return f

def isFValid(f):
    for nearF in range(f - 14, f + 15):
        strIndex = str(nearF)
        while len(strIndex) < 4:
            strIndex = "0" + strIndex
        if not os.path.exists('3/mouthImages/frame' + strIndex + '.jpg'):
            return False
    return True  # As of now, I can't remember where the invalid frames are.

def getInVidsAtFrame(f):
    # Stack the 29-frame window (f-14 .. f+14) into one 256x256x87 array,
    # center-cropping or center-padding each mouth image to 256x256.
    arr = np.zeros([1, INVID_HEIGHT, INVID_WIDTH, INVID_DEPTH])
    for imageIndex in range(0, 29):
        strIndex = str(f - 14 + imageIndex)
        while len(strIndex) < 4:
            strIndex = "0" + strIndex
        newImage = misc.imread('3/mouthImages/frame' + strIndex + '.jpg')

        if newImage.shape[0] > INVID_HEIGHT:
            extraMargin = (newImage.shape[0] - INVID_HEIGHT) // 2
            newImage = newImage[extraMargin:extraMargin + INVID_HEIGHT, :, :]
        if newImage.shape[1] > INVID_WIDTH:
            extraMargin = (newImage.shape[1] - INVID_WIDTH) // 2
            newImage = newImage[:, extraMargin:extraMargin + INVID_WIDTH, :]

        h = newImage.shape[0]
        w = newImage.shape[1]
        yStart = (INVID_HEIGHT - h) // 2
        xStart = (INVID_WIDTH - w) // 2
        arr[:, yStart:yStart + h, xStart:xStart + w, imageIndex * 3:(imageIndex + 1) * 3] = newImage
    return np.asarray(arr) / 255.0

def getLabelsAtFrame(f):
    return int(phoframes[f])

INVID_WIDTH = 256   # mouth width
INVID_HEIGHT = 256  # mouth height
INVID_DEPTH = 87    # 29 images of R, G, B

PHONEME_CATEGORIES = 41

learning_rate = 0.0002

invids_ = tf.placeholder(tf.float32, (None, INVID_HEIGHT, INVID_WIDTH, INVID_DEPTH), name='invids')
labels_ = tf.placeholder(tf.int32, (None), name='labels')

### Encode the invids
conv1 = tf.layers.conv2d(inputs=invids_, filters=40, kernel_size=(5,5), strides=(2,2), padding='same', activation=tf.nn.relu)
# Now 128x128x40
maxpool1 = tf.layers.max_pooling2d(conv1, pool_size=2, strides=(2,2), padding='same')
# Now 64x64x40
conv2 = tf.layers.conv2d(inputs=maxpool1, filters=70, kernel_size=(5,5), padding='same', activation=tf.nn.relu)
# Now 64x64x70
maxpool2 = tf.layers.max_pooling2d(conv2, pool_size=2, strides=(2,2), padding='same')
# Now 32x32x70
conv3 = tf.layers.conv2d(inputs=maxpool2, filters=100, kernel_size=(5,5), padding='same', activation=tf.nn.relu)
# Now 32x32x100
maxpool3 = tf.layers.max_pooling2d(conv3, pool_size=2, strides=(2,2), padding='same')
# Now 16x16x100
conv4 = tf.layers.conv2d(inputs=maxpool3, filters=130, kernel_size=(5,5), padding='same', activation=tf.nn.relu)
# Now 16x16x130
maxpool4 = tf.layers.max_pooling2d(conv4, pool_size=4, strides=(4,4), padding='same')
# Now 4x4x130 (flatten to 2080)

maxpool4_flat = tf.reshape(maxpool4, [-1, 4*4*130])
# Now 2080

W_fc1 = weight_variable([2080, 1000])
b_fc1 = bias_variable([1000])
fc1 = tf.nn.relu(tf.matmul(maxpool4_flat, W_fc1) + b_fc1)

W_fc2 = weight_variable([1000, 300])
b_fc2 = bias_variable([300])
fc2 = tf.nn.relu(tf.matmul(fc1, W_fc2) + b_fc2)

W_fc3 = weight_variable([300, PHONEME_CATEGORIES])
b_fc3 = bias_variable([PHONEME_CATEGORIES])
logits = tf.matmul(fc2, W_fc3) + b_fc3
# Now 41 (one logit per phoneme category)
onehot_labels = tf.one_hot(indices=labels_, depth=PHONEME_CATEGORIES)  # unused: the loss below takes sparse labels directly
loss = tf.losses.sparse_softmax_cross_entropy(labels=labels_, logits=logits)

output = tf.nn.softmax(logits, name=None)

# Get cost and define the optimizer
cost = tf.reduce_mean(loss)
opt = tf.train.AdamOptimizer(learning_rate).minimize(cost)


print("made it here! :D")
sess = tf.Session()
RANGE_START = 120030
RANGE_END = 131030
epochs = 2000000
batch_size = 50
MODEL_SAVE_EVERY = 50
SAVE_FILE_START_POINT = 5750

saver = tf.train.Saver()

sess.run(tf.global_variables_initializer())

if SAVE_FILE_START_POINT >= 1:
    saver.restore(sess, FOLDER_SAVE_NAME + "/models/model" + str(SAVE_FILE_START_POINT) + ".ckpt")

print("about to start...")

f = open(FOLDER_SAVE_NAME + '/outputted.txt', 'w')
for frame in range(RANGE_START, RANGE_END):
    invids = np.empty([0, INVID_HEIGHT, INVID_WIDTH, INVID_DEPTH])
    labels = np.empty(0)

    invids = np.vstack((invids, getInVidsAtFrame(frame)))
    labels = np.append(labels, getLabelsAtFrame(frame))

    _output, batch_cost, _logits = sess.run([output, cost, logits],
        feed_dict={invids_: invids, labels_: labels})

    # One line per frame: 41 tab-separated softmax probabilities.
    for i in _output[0]:
        f.write(str(i) + "\t")
    f.write("\n")
    print("Done with " + str(frame - RANGE_START) + " / " + str(RANGE_END - RANGE_START))
f.close()
--------------------------------------------------------------------------------
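To turn outputted.txt back into readable predictions, you can argmax each row of softmax probabilities against key.txt. A small sketch of mine (the file formats match the code above, but this script isn't part of the repo):

# Sketch: report the most likely phoneme per frame from phoframeTest's output.
key = {}
with open("3/key.txt") as kf:
    for line in kf:
        if line.strip():
            index, phoneme = line.strip().split("\t")
            key[int(index)] = phoneme

with open("phoframe41/outputted.txt") as of:
    for frameOffset, line in enumerate(of):
        probs = [float(p) for p in line.strip().split("\t") if p]
        if not probs:
            continue
        best = max(range(len(probs)), key=lambda c: probs[c])
        print(120030 + frameOffset, key[best], probs[best])  # 120030 = RANGE_START
--------------------------------------------------------------------------------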
/phoframeTrain.py:
--------------------------------------------------------------------------------
import tensorflow as tf
import numpy as np
from scipy import misc
import random
import math
import os

# Train/test split by frame index: the last ~11,000 frames are held out.
TRAIN_SET_START = 14
TRAIN_SET_END = 120000

TEST_SET_START = 120030
TEST_SET_END = 131030

phoframeFile = open("/media/rob/Ma Book1/CS 230/videoToVoice/3/phoframes.txt", "r")

phoframes = phoframeFile.read().split("\n")

FOLDER_SAVE_NAME = "phoframe41"

if not os.path.exists(FOLDER_SAVE_NAME):
    os.makedirs(FOLDER_SAVE_NAME)

if not os.path.exists(FOLDER_SAVE_NAME + "/samples"):
    os.makedirs(FOLDER_SAVE_NAME + "/samples")

if not os.path.exists(FOLDER_SAVE_NAME + "/models"):
    os.makedirs(FOLDER_SAVE_NAME + "/models")

def weight_variable(shape):
    initial = tf.truncated_normal(shape, stddev=0.1)
    return tf.Variable(initial)

def bias_variable(shape):
    initial = tf.constant(0.1, shape=shape)
    return tf.Variable(initial)

def getRandomFrame(isTraining):
    if isTraining:
        f = int(math.floor(random.randrange(TRAIN_SET_START, TRAIN_SET_END)))
        while not isFValid(f):  # Exclude portions of the video with no visible mouth
            f = int(math.floor(random.randrange(TRAIN_SET_START, TRAIN_SET_END)))
        return f
    else:
        f = int(math.floor(random.randrange(TEST_SET_START, TEST_SET_END)))
        while not isFValid(f):  # Exclude portions of the video with no visible mouth
            f = int(math.floor(random.randrange(TEST_SET_START, TEST_SET_END)))
        return f


def isFValid(f):
    for nearF in range(f - 14, f + 15):
        strIndex = str(nearF)
        while len(strIndex) < 4:
            strIndex = "0" + strIndex
        if not os.path.exists('3/mouthImages/frame' + strIndex + '.jpg'):
            return False
    return True  # As of now, I can't remember where the invalid frames are.

def getInVidsAtFrame(f):
    # Same 29-frame stacking as in phoframeTest.py: frames f-14 .. f+14,
    # each center-cropped/padded to 256x256, concatenated along the channel axis.
    arr = np.zeros([1, INVID_HEIGHT, INVID_WIDTH, INVID_DEPTH])
    for imageIndex in range(0, 29):
        strIndex = str(f - 14 + imageIndex)
        while len(strIndex) < 4:
            strIndex = "0" + strIndex
        newImage = misc.imread('3/mouthImages/frame' + strIndex + '.jpg')

        if newImage.shape[0] > INVID_HEIGHT:
            extraMargin = (newImage.shape[0] - INVID_HEIGHT) // 2
            newImage = newImage[extraMargin:extraMargin + INVID_HEIGHT, :, :]
        if newImage.shape[1] > INVID_WIDTH:
            extraMargin = (newImage.shape[1] - INVID_WIDTH) // 2
            newImage = newImage[:, extraMargin:extraMargin + INVID_WIDTH, :]

        h = newImage.shape[0]
        w = newImage.shape[1]
        yStart = (INVID_HEIGHT - h) // 2
        xStart = (INVID_WIDTH - w) // 2
        arr[:, yStart:yStart + h, xStart:xStart + w, imageIndex * 3:(imageIndex + 1) * 3] = newImage
    return np.asarray(arr) / 255.0

def getLabelsAtFrame(f):
    return int(phoframes[f])

INVID_WIDTH = 256   # mouth width
INVID_HEIGHT = 256  # mouth height
INVID_DEPTH = 87    # 29 images of R, G, B

PHONEME_CATEGORIES = 41

learning_rate = 0.0002

invids_ = tf.placeholder(tf.float32, (None, INVID_HEIGHT, INVID_WIDTH, INVID_DEPTH), name='invids')
labels_ = tf.placeholder(tf.int32, (None), name='labels')

### Encode the invids
conv1 = tf.layers.conv2d(inputs=invids_, filters=40, kernel_size=(5,5), strides=(2,2), padding='same', activation=tf.nn.relu)
# Now 128x128x40
maxpool1 = tf.layers.max_pooling2d(conv1, pool_size=2, strides=(2,2), padding='same')
# Now 64x64x40
conv2 = tf.layers.conv2d(inputs=maxpool1, filters=70, kernel_size=(5,5), padding='same', activation=tf.nn.relu)
# Now 64x64x70
maxpool2 = tf.layers.max_pooling2d(conv2, pool_size=2, strides=(2,2), padding='same')
# Now 32x32x70
conv3 = tf.layers.conv2d(inputs=maxpool2, filters=100, kernel_size=(5,5), padding='same', activation=tf.nn.relu)
# Now 32x32x100
maxpool3 = tf.layers.max_pooling2d(conv3, pool_size=2, strides=(2,2), padding='same')
# Now 16x16x100
conv4 = tf.layers.conv2d(inputs=maxpool3, filters=130, kernel_size=(5,5), padding='same', activation=tf.nn.relu)
# Now 16x16x130
maxpool4 = tf.layers.max_pooling2d(conv4, pool_size=4, strides=(4,4), padding='same')
# Now 4x4x130 (flatten to 2080)

maxpool4_flat = tf.reshape(maxpool4, [-1, 4*4*130])
# Now 2080

W_fc1 = weight_variable([2080, 1000])
b_fc1 = bias_variable([1000])
fc1 = tf.nn.relu(tf.matmul(maxpool4_flat, W_fc1) + b_fc1)

W_fc2 = weight_variable([1000, 300])
b_fc2 = bias_variable([300])
fc2 = tf.nn.relu(tf.matmul(fc1, W_fc2) + b_fc2)

W_fc3 = weight_variable([300, PHONEME_CATEGORIES])
b_fc3 = bias_variable([PHONEME_CATEGORIES])
logits = tf.matmul(fc2, W_fc3) + b_fc3
# Now 41 (one logit per phoneme category)
onehot_labels = tf.one_hot(indices=labels_, depth=PHONEME_CATEGORIES)  # unused: the loss below takes sparse labels directly
loss = tf.losses.sparse_softmax_cross_entropy(labels=labels_, logits=logits)

# Get cost and define the optimizer
cost = tf.reduce_mean(loss)
opt = tf.train.AdamOptimizer(learning_rate).minimize(cost)


print("made it here! :D")
sess = tf.Session()
epochs = 2000000
batch_size = 50

test_batch_size = 10

MODEL_SAVE_EVERY = 50
SAVE_FILE_START_POINT = 0

saver = tf.train.Saver()

sess.run(tf.global_variables_initializer())

if SAVE_FILE_START_POINT >= 1:
    saver.restore(sess, FOLDER_SAVE_NAME + "/models/model" + str(SAVE_FILE_START_POINT) + ".ckpt")

f = open(FOLDER_SAVE_NAME + "/lossOverTime.txt", "a")
f.write("Epoch\tTrain Loss\tTest Loss\n")
f.close()

print("about to start...")
for e in range(SAVE_FILE_START_POINT, epochs):

    # One training step on a random batch of 50 frames.
    invids = np.empty([batch_size, INVID_HEIGHT, INVID_WIDTH, INVID_DEPTH])
    labels = np.empty(batch_size)

    for x in range(0, batch_size):
        frameIndex = getRandomFrame(True)
        invids[x] = getInVidsAtFrame(frameIndex)
        labels[x] = getLabelsAtFrame(frameIndex)

    train_loss, _, _logits = sess.run([cost, opt, logits],
        feed_dict={invids_: invids, labels_: labels})  # """inspecs_: inspecs,""" (leftover from an earlier spectrogram input)

    # One evaluation-only pass on a random batch of 10 held-out frames.
    invids = np.empty([test_batch_size, INVID_HEIGHT, INVID_WIDTH, INVID_DEPTH])
    labels = np.empty(test_batch_size)

    for x in range(0, test_batch_size):
        frameIndex = getRandomFrame(False)
        invids[x] = getInVidsAtFrame(frameIndex)
        labels[x] = getLabelsAtFrame(frameIndex)

    test_loss, _logits = sess.run([cost, logits],
        feed_dict={invids_: invids, labels_: labels})  # """inspecs_: inspecs,"""

    print("Epoch: {}/{}...".format(e, epochs), "Training loss: {:.4f}".format(train_loss), "Test loss: {:.4f}".format(test_loss))

    f = open(FOLDER_SAVE_NAME + "/lossOverTime.txt", "a+")
    f.write(str(e) + "\t" + str(train_loss) + "\t" + str(test_loss) + "\n")
    f.close()

    if (e) % MODEL_SAVE_EVERY == 0:
        save_path = saver.save(sess, FOLDER_SAVE_NAME + "/models/model" + str(e) + ".ckpt")
        print("MODEL SAVED, BRO: " + str(save_path))
--------------------------------------------------------------------------------
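lossOverTime.txt accumulates one tab-separated row per epoch (and a fresh header row every time training restarts, since the file is opened in append mode). A quick way to eyeball convergence — my sketch, assuming matplotlib is available; not part of the repo:

import matplotlib.pyplot as plt

epochs, train_losses, test_losses = [], [], []
with open("phoframe41/lossOverTime.txt") as f:
    for line in f:
        parts = line.strip().split("\t")
        if len(parts) != 3 or parts[0] == "Epoch":  # skip the repeated header rows
            continue
        epochs.append(int(parts[0]))
        train_losses.append(float(parts[1]))
        test_losses.append(float(parts[2]))

plt.plot(epochs, train_losses, label="train")
plt.plot(epochs, test_losses, label="test")
plt.xlabel("epoch")
plt.ylabel("softmax cross-entropy")
plt.legend()
plt.savefig("phoframe41/lossOverTime.png")
--------------------------------------------------------------------------------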
/pyTubeShort.py:
--------------------------------------------------------------------------------
import glob
import os
from contextlib import closing
from videosequence import VideoSequence
from PIL import Image
import face_recognition
import subprocess

folderNumber = "3"

"""if not os.path.exists(folderNumber):
    os.makedirs(folderNumber)

list_of_files = glob.glob(folderNumber+'/*')  # * means all; if you need a specific format then *.csv
latest_file = max(list_of_files, key=os.path.getctime)"""

latest_file = "3/IMG_4700.MOV"

folderName = folderNumber + "/origImages"
faceFolderName = folderNumber + "/faceImages"

if not os.path.exists(folderName):
    os.makedirs(folderName)

if not os.path.exists(faceFolderName):
    os.makedirs(faceFolderName)

# Frame numbering picks up at 56730, presumably where an earlier run stopped
# (1800 frames per minute at 30 fps).
with closing(VideoSequence(latest_file)) as frames:
    for idx, frame in enumerate(frames[:]):
        print(str(56730 + idx) + " frames. That's " + str((56730 + idx) / 1800.0) + " minutes.")
        filename = folderName + "/" + "frame{:04d}.jpg".format(56730 + idx)
        frame.rotate(180).save(filename)

        # Load the jpg file into a numpy array
        """image = face_recognition.load_image_file(filename)

        # Find all the faces in the image using the default HOG-based model.
        # This method is fairly accurate, but not as accurate as the CNN model, and not GPU accelerated.
        # See also: find_faces_in_picture_cnn.py
        face_locations = face_recognition.face_locations(image)

        if(len(face_locations) == 1):
            top, right, bottom, left = face_locations[0]
            faceFilename = faceFolderName+"/"+"frame{:04d}.jpg".format(idx)
            height = top-bottom
            faceFrame = frame.crop((left,top,right,bottom-height*0.3))
            faceFrame.save(faceFilename)"""

"""command = "ffmpeg -i "+latest_file+" -ab 160k -ac 2 -ar 44100 -vn "+folderNumber+"/audio.wav"

subprocess.call(command, shell=True)"""
--------------------------------------------------------------------------------
/pytubeTest.py:
--------------------------------------------------------------------------------
import glob
import os
from contextlib import closing
from videosequence import VideoSequence
from PIL import Image
import face_recognition
import subprocess

from pytube import YouTube

folderNumber = "3"

if not os.path.exists(folderNumber):
    os.makedirs(folderNumber)

YouTube('https://www.youtube.com/watch?v=0bdHdGS3OlI').streams.first().download(folderNumber)

list_of_files = glob.glob(folderNumber + '/*')  # * means all; if you need a specific format then *.csv
latest_file = max(list_of_files, key=os.path.getctime)

folderName = folderNumber + "/origImages"
faceFolderName = folderNumber + "/faceImages"

if not os.path.exists(folderName):
    os.makedirs(folderName)

if not os.path.exists(faceFolderName):
    os.makedirs(faceFolderName)

with closing(VideoSequence(latest_file)) as frames:
    for idx, frame in enumerate(frames[:]):
        filename = folderName + "/" + "frame{:04d}.jpg".format(idx)
        frame.save(filename)

        # Load the jpg file into a numpy array
        image = face_recognition.load_image_file(filename)

        # Find all the faces in the image using the default HOG-based model.
        # This method is fairly accurate, but not as accurate as the CNN model, and not GPU accelerated.
        # See also: find_faces_in_picture_cnn.py
        face_locations = face_recognition.face_locations(image)

        if len(face_locations) == 1:
            top, right, bottom, left = face_locations[0]
            faceFilename = faceFolderName + "/" + "frame{:04d}.jpg".format(idx)
            height = top - bottom  # negative: top < bottom in image coordinates
            faceFrame = frame.crop((left, top, right, bottom - height * 0.3))  # extend the box 30% below the detected face
            faceFrame.save(faceFilename)

command = "ffmpeg -i " + latest_file + " -ab 160k -ac 2 -ar 44100 -vn " + folderNumber + "/audio.wav"

subprocess.call(command, shell=True)
--------------------------------------------------------------------------------
/samples/frame120030.png … /samples/frame120129.png:
--------------------------------------------------------------------------------
(100 binary PNG files, frame120030.png through frame120129.png, each stored at
https://raw.githubusercontent.com/carykh/videoToVoice/7c782f22cf3948e28862a334df2f6deec24eedb3/samples/
under its own filename.)
--------------------------------------------------------------------------------
/trainingDataVisualizer.py:
--------------------------------------------------------------------------------
import numpy as np
from scipy import misc

FOLDER_SAVE_NAME = "3"

# Output canvas size (matches the 400x250 window in TDLC.pde).
w = 400
h = 250

phoframeFile = open("/media/rob/Ma Book1/CS 230/videoToVoice/3/phoframes.txt", "r")

phoframes = phoframeFile.read().split("\n")

keyFile = open("/media/rob/Ma Book1/CS 230/videoToVoice/3/key.txt", "r")

key = keyFile.read().split("\n")

# Paste each mouth image onto a black w-by-h canvas for the lineup check.
for i in range(0, 200):
    strIndex = str(i)
    while len(strIndex) < 6:
        strIndex = "0" + strIndex
    newImage = misc.imread('3/mouthImages/frame' + strIndex + '.jpg')
    s = newImage.shape

    imageToSave = np.zeros([h, w, 3])
    imageToSave[0:s[0], 0:s[1], 0:3] = newImage
    misc.imsave(FOLDER_SAVE_NAME + "/lineupCheck/sample" + str(i) + '.png', imageToSave)
--------------------------------------------------------------------------------
/turnPhonemesToPhoframes.py:
--------------------------------------------------------------------------------
import json

FOLDER_NAME = "3"

phonemeToIndex = {'silence': 0}
file = open("/media/rob/Ma Book1/CS 230/videoToVoice/3/phonemes.json", "r")

parsed_json = json.loads(file.read())
words = parsed_json["words"]

frameCount = 132000  # (int)(17*30)
phoframes = [0] * frameCount  # default every frame to silence (index 0)

for word in words:
    if word["case"] == "success":
        wordText = word["alignedWord"]

        wordPointer = word["start"]
        for phonemes in word["phones"]:
            start = wordPointer
            end = wordPointer + phonemes["duration"]

            # Convert Gentle's timestamps (seconds) to video frames (~29.97 fps).
            startFrame = (int)(start * (29.96835))
            endFrame = (int)(end * (29.96835))

            for frame in range(startFrame, endFrame):
                phoneme = phonemes["phone"].split("_")[0]  # e.g. "ah_B" -> "ah"
                if not phoneme in phonemeToIndex:
                    phonemeToIndex[phoneme] = len(phonemeToIndex)
                phoframes[frame] = phonemeToIndex[phoneme]

            wordPointer += phonemes["duration"]

# Write key.txt: one "<index>\t<phoneme>" line per phoneme, in index order.
f = open(FOLDER_NAME + '/key.txt', 'w')
for i in range(len(phonemeToIndex)):
    for j in phonemeToIndex:
        if phonemeToIndex[j] == i:
            f.write(str(phonemeToIndex[j]) + "\t" + str(j) + "\n")
f.close()

# Write phoframes.txt: one phoneme index per video frame.
f = open(FOLDER_NAME + '/phoframes.txt', 'w')
for phoframe in phoframes:
    f.write(str(phoframe) + "\n")
f.close()
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
import numpy as np
import imageio  # assumption: imageio stands in for scipy.misc.imread/imsave, which were removed from newer SciPy releases

FOLDER_SAVE_NAME = "3"

# Canvas size: every mouth crop is pasted into the top-left corner of a w x h black frame
w = 400
h = 250

# Machine-specific paths; loaded for reference but not used in the loop below
phoframeFile = open("/media/rob/Ma Book1/CS 230/videoToVoice/3/phoframes.txt", "r")
phoframes = phoframeFile.read().split("\n")

keyFile = open("/media/rob/Ma Book1/CS 230/videoToVoice/3/key.txt", "r")
key = keyFile.read().split("\n")

for i in range(0, 200):
    strIndex = str(i).zfill(6)  # frame filenames are zero-padded to six digits
    newImage = imageio.imread('3/mouthImages/frame' + strIndex + '.jpg')
    s = newImage.shape

    # Pad the crop onto a fixed-size black canvas so all samples line up
    imageToSave = np.zeros([h, w, 3], dtype=np.uint8)
    imageToSave[0:s[0], 0:s[1], 0:3] = newImage
    imageio.imwrite(FOLDER_SAVE_NAME + "/lineupCheck/sample" + str(i) + '.png', imageToSave)
--------------------------------------------------------------------------------
/turnPhonemesToPhoframes.py:
--------------------------------------------------------------------------------
import json

FOLDER_NAME = "3"

# Index 0 is reserved for silence; other phonemes are numbered in order of first appearance
phonemeToIndex = {'silence': 0}

# Gentle's time-aligned transcript: {"words": [{"case", "alignedWord", "start", "phones": [{"phone", "duration"}, ...]}, ...]}
file = open("/media/rob/Ma Book1/CS 230/videoToVoice/3/phonemes.json", "r")
parsed_json = json.loads(file.read())
words = parsed_json["words"]

frameCount = 132000  # total number of video frames to label
phoframes = [0] * frameCount  # default every frame to silence

FPS = 29.96835  # measured frame rate of the source video

for word in words:
    if word["case"] == "success":  # skip words Gentle failed to align
        wordPointer = word["start"]
        for phone in word["phones"]:
            start = wordPointer
            end = wordPointer + phone["duration"]

            startFrame = int(start * FPS)
            endFrame = int(end * FPS)

            # Gentle suffixes each phone with its position in the word (e.g. "hh_B"); strip the suffix
            phoneme = phone["phone"].split("_")[0]
            if phoneme not in phonemeToIndex:
                phonemeToIndex[phoneme] = len(phonemeToIndex)
            for frame in range(startFrame, endFrame):
                phoframes[frame] = phonemeToIndex[phoneme]

            wordPointer += phone["duration"]

# Write the index -> phoneme key, one entry per line, in index order
f = open(FOLDER_NAME + '/key.txt', 'w')
for phoneme, index in sorted(phonemeToIndex.items(), key=lambda kv: kv[1]):
    f.write(str(index) + "\t" + str(phoneme) + "\n")
f.close()

f = open(FOLDER_NAME + '/phoframes.txt', 'w')
for phoframe in phoframes:
    f.write(str(phoframe) + "\n")
f.close()
--------------------------------------------------------------------------------
/videoContinue.py:
--------------------------------------------------------------------------------
import os
import subprocess
from contextlib import closing            # only needed by the commented-out frame extraction
from videosequence import VideoSequence  # only needed by the commented-out frame extraction

folderNumber = "2"

folderName = folderNumber + "/origImages"

# Quotes are embedded because the filename contains spaces and the
# ffmpeg command below is passed through the shell
latest_file = folderNumber + "/\"My children dont exist.mp4\""

if not os.path.exists(folderName):
    os.makedirs(folderName)

# Frame extraction is commented out; when enabled, it resumes saving frames
# from index 12950 onward. This run only extracts the audio.
"""
with closing(VideoSequence(latest_file)) as frames:
    for idx, frame in enumerate(frames[:]):
        if idx >= 12950:
            filename = folderName + "/" + "frame{:06d}.jpg".format(idx)
            frame.save(filename)
            print("SAVED IMAGE #" + str(idx))
"""

# Extract stereo 44.1 kHz audio from the video (-vn drops the video stream)
command = "ffmpeg -i " + latest_file + " -ab 160k -ac 2 -ar 44100 -vn " + folderNumber + "/audio.wav"

subprocess.call(command, shell=True)
--------------------------------------------------------------------------------
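A note on the ffmpeg command used in videoContinue.py above (and in videoGetter.py below): because the command string is handed to the shell, any spaces in the input filename must be quoted by hand. A minimal sketch that avoids the quoting problem by passing ffmpeg an argument list instead, with the same flags and the input path hard-coded above:

import subprocess

# List form: no shell is involved, so spaces in the path need no escaping
subprocess.call([
    "ffmpeg", "-i", "2/My children dont exist.mp4",
    "-ab", "160k",   # audio bitrate
    "-ac", "2",      # two audio channels (stereo)
    "-ar", "44100",  # 44.1 kHz sample rate
    "-vn",           # no video: keep the audio only
    "2/audio.wav",
])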
/videoGetter.py:
--------------------------------------------------------------------------------
import glob
import os
from contextlib import closing
from videosequence import VideoSequence
import subprocess

from pytube import YouTube

folderNumber = "2"

if not os.path.exists(folderNumber):
    os.makedirs(folderNumber)

print("BEGAN DOWNLOADING VIDEO")

# Download the first stream pytube lists for this video into folderNumber/
YouTube('https://www.youtube.com/watch?v=_J7dEhYttbQ').streams.first().download(folderNumber)

print("FINISHED DOWNLOADING VIDEO")

# The downloaded video is the most recently created file in the folder
list_of_files = glob.glob(folderNumber + '/*')
latest_file = max(list_of_files, key=os.path.getctime)

folderName = folderNumber + "/origImages"

if not os.path.exists(folderName):
    os.makedirs(folderName)

# Save every frame of the video as a zero-padded JPEG
with closing(VideoSequence(latest_file)) as frames:
    for idx, frame in enumerate(frames[:]):
        filename = folderName + "/" + "frame{:06d}.jpg".format(idx)
        frame.save(filename)
        print("SAVED IMAGE #" + str(idx))

# Extract stereo 44.1 kHz audio from the video (-vn drops the video stream)
command = "ffmpeg -i " + latest_file + " -ab 160k -ac 2 -ar 44100 -vn " + folderNumber + "/audio.wav"

subprocess.call(command, shell=True)
--------------------------------------------------------------------------------
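One caveat on the pytube call in videoGetter.py: streams.first() downloads whatever stream pytube happens to list first, which is not guaranteed to be the highest resolution, or even a combined audio+video stream. A sketch of a more deliberate pick, assuming a recent pytube release:

from pytube import YouTube

yt = YouTube('https://www.youtube.com/watch?v=_J7dEhYttbQ')
# Progressive streams bundle audio and video in one file; take the best resolution
stream = yt.streams.filter(progressive=True, file_extension="mp4").order_by("resolution").desc().first()
stream.download("2")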