├── .gitignore ├── Generate.py ├── LICENSE ├── Parse.py ├── README.md ├── Scrape.py ├── Train.py ├── YTCommenter.zip └── util.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.bat 2 | *.h5 3 | *.npy 4 | *.png 5 | *.pyc 6 | *.txt -------------------------------------------------------------------------------- /Generate.py: -------------------------------------------------------------------------------- 1 | import os, random, json 2 | import numpy as np 3 | from scipy import stats 4 | import util 5 | 6 | SEQ_SIZE = 8 7 | NUM_TO_GEN = 20 8 | MODEL_DIR = 'trained_all/' 9 | PARSED_DIR = 'parsed_all/' 10 | MAKE_STATEFUL = False 11 | IS_REVERSE = False 12 | 13 | #Load titles 14 | title_words, title_word_to_ix = util.load_title_dict(PARSED_DIR) 15 | title_dict_size = len(title_words) 16 | title_sentences = util.load_title_sentences(PARSED_DIR) 17 | 18 | #Load comments 19 | comment_words, comment_word_to_ix = util.load_comment_dict(PARSED_DIR) 20 | comment_dict_size = len(comment_words) 21 | comment_sentences = util.load_comment_sentences(PARSED_DIR) 22 | assert(len(title_sentences) == len(comment_sentences)) 23 | 24 | def word_ixs_to_str(word_ixs, is_title): 25 | result_txt = "" 26 | for w_ix in word_ixs: 27 | w = (title_words if is_title else comment_words)[w_ix] 28 | if len(result_txt) == 0 or w in ['.', ',', "'", '!', '?', ':', ';', '...']: 29 | result_txt += w 30 | elif len(result_txt) > 0 and result_txt[-1] == "'" and w in ['s', 're', 't', 'll', 've', 'd']: 31 | result_txt += w 32 | else: 33 | result_txt += ' ' + w 34 | if len(result_txt) > 0: 35 | result_txt = result_txt[:1].upper() + result_txt[1:] 36 | return result_txt 37 | 38 | def probs_to_word_ix(pk, is_first): 39 | if is_first: 40 | pk[0] = 0.0 41 | pk /= np.sum(pk) 42 | else: 43 | pk *= pk 44 | pk /= np.sum(pk) 45 | #for i in range(3): 46 | # max_val = np.amax(pk) 47 | # if max_val > 0.5: 48 | # break 49 | # pk *= pk 50 | # pk /= np.sum(pk) 51 | 52 | xk = np.arange(pk.shape[0], dtype=np.int32) 53 | custm = stats.rv_discrete(name='custm', values=(xk, pk)) 54 | return custm.rvs() 55 | 56 | def pred_text(model, context, max_len=64): 57 | output = [] 58 | context = np.expand_dims(context, axis=0) 59 | if MAKE_STATEFUL: 60 | past_sample = np.zeros((1,), dtype=np.int32) 61 | else: 62 | past_sample = np.zeros((SEQ_SIZE,), dtype=np.int32) 63 | while len(output) < max_len: 64 | pk = model.predict([context, np.expand_dims(past_sample, axis=0)], batch_size=1)[-1] 65 | if MAKE_STATEFUL: 66 | pk = pk[0] 67 | else: 68 | past_sample = np.roll(past_sample, 1 if IS_REVERSE else -1) 69 | new_sample = probs_to_word_ix(pk, len(output) == 0) 70 | past_sample[0 if IS_REVERSE else -1] = new_sample 71 | if new_sample == 0: 72 | break 73 | output.append(new_sample) 74 | 75 | model.reset_states() 76 | return output 77 | 78 | #Load Keras and Theano 79 | print("Loading Keras...") 80 | import os, math 81 | os.environ['KERAS_BACKEND'] = "tensorflow" 82 | import tensorflow as tf 83 | print("Tensorflow Version: " + tf.__version__) 84 | import keras 85 | print("Keras Version: " + keras.__version__) 86 | from keras.layers import Input, Dense, Activation, Dropout, Flatten, Reshape, RepeatVector, TimeDistributed, concatenate 87 | from keras.layers.convolutional import Conv2D, Conv2DTranspose, UpSampling2D, Convolution1D 88 | from keras.layers.embeddings import Embedding 89 | from keras.layers.local import LocallyConnected2D 90 | from keras.layers.pooling import MaxPooling2D 91 | from keras.layers.noise import 
GaussianNoise 92 | from keras.layers.normalization import BatchNormalization 93 | from keras.layers.recurrent import LSTM, SimpleRNN, GRU 94 | from keras.models import Model, Sequential, load_model, model_from_json 95 | from keras.optimizers import Adam, RMSprop, SGD 96 | from keras.preprocessing.image import ImageDataGenerator 97 | from keras.regularizers import l1 98 | from keras.utils import plot_model, to_categorical 99 | from keras import backend as K 100 | K.set_image_data_format('channels_first') 101 | 102 | #Fix bug with sparse_categorical_accuracy 103 | from tensorflow.python.ops import math_ops 104 | from tensorflow.python.framework import ops 105 | from tensorflow.python.keras import backend as K 106 | from tensorflow.python.ops import array_ops 107 | def new_sparse_categorical_accuracy(y_true, y_pred): 108 | y_pred_rank = ops.convert_to_tensor(y_pred).get_shape().ndims 109 | y_true_rank = ops.convert_to_tensor(y_true).get_shape().ndims 110 | # If the shape of y_true is (num_samples, 1), squeeze to (num_samples,) 111 | if (y_true_rank is not None) and (y_pred_rank is not None) and (len(K.int_shape(y_true)) == len(K.int_shape(y_pred))): 112 | y_true = array_ops.squeeze(y_true, [-1]) 113 | y_pred = math_ops.argmax(y_pred, axis=-1) 114 | # If the predicted output and actual output types don't match, force cast them 115 | # to match. 116 | if K.dtype(y_pred) != K.dtype(y_true): 117 | y_pred = math_ops.cast(y_pred, K.dtype(y_true)) 118 | return math_ops.cast(math_ops.equal(y_true, y_pred), K.floatx()) 119 | 120 | #Load the model 121 | print("Loading Model...") 122 | model = load_model(MODEL_DIR + 'model.h5', custom_objects={'new_sparse_categorical_accuracy':new_sparse_categorical_accuracy}) 123 | 124 | if MAKE_STATEFUL: 125 | weights = model.get_weights() 126 | model_json = json.loads(model.to_json()) 127 | 128 | layers = model_json['config']['layers'] 129 | for layer in layers: 130 | if 'batch_input_shape' in layer['config']: 131 | layer['config']['batch_input_shape'][0] = 1 132 | if layer['config']['batch_input_shape'][1] == SEQ_SIZE: 133 | layer['config']['batch_input_shape'][1] = 1 134 | if layer['class_name'] == 'Embedding': 135 | layer['config']['input_length'] = 1 136 | if layer['class_name'] == 'RepeatVector': 137 | layer['config']['n'] = 1 138 | if layer['class_name'] == 'LSTM': 139 | assert(layer['config']['stateful'] == False) 140 | layer['config']['stateful'] = True 141 | 142 | print(json.dumps(model_json, indent=4, sort_keys=True)) 143 | model = model_from_json(json.dumps(model_json)) 144 | model.set_weights(weights) 145 | 146 | #plot_model(model, to_file='temp.png', show_shapes=True) 147 | 148 | def generate_titles(my_title): 149 | my_title = util.clean_text(my_title) 150 | my_words = my_title.split(' ') 151 | print(' '.join((w.upper() if w in title_word_to_ix else w) for w in my_words) + '\n') 152 | my_title_ixs = [title_word_to_ix[w] for w in my_words if w in title_word_to_ix] 153 | my_title_sample = util.bag_of_words(my_title_ixs, title_dict_size) 154 | for i in range(10): 155 | print(' ' + word_ixs_to_str(pred_text(model, my_title_sample), False)) 156 | print('') 157 | 158 | while True: 159 | my_title = input('Enter Title:\n') 160 | generate_titles(my_title) 161 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 HackerPoet 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a 
copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Parse.py: -------------------------------------------------------------------------------- 1 | import os, re 2 | import codecs, itertools 3 | import numpy as np 4 | import util 5 | 6 | #INPUT_DATA = 'scraped/data.txt' 7 | INPUT_DATA = 'scraped/all_comments.txt' 8 | SAVE_DIR = 'parsed_all/' 9 | 10 | #Create directory to hold parsed data 11 | if not os.path.exists(SAVE_DIR): 12 | os.makedirs(SAVE_DIR) 13 | 14 | def split_to_words(full_str): 15 | #Split clean sentence into words 16 | words = full_str.split(' ') 17 | 18 | #Remove annoying space-separated letters 19 | num_single_letter = 0 20 | for word in words: 21 | if len(word) == 1 and word in 'abcdefghijklmnopqrstuvwxyzx': 22 | num_single_letter += 1 23 | #If sentence contains an unusually long word, ignore it 24 | if len(word) >= 24: 25 | return [] 26 | if num_single_letter > 5: 27 | return [] 28 | 29 | return [word for word in words if len(word) > 0] 30 | 31 | def parse_line(line): 32 | vid, title, comment = line[:-1].split('~') 33 | title = util.clean_text(title) 34 | comment = util.clean_text(comment) 35 | title_ix = split_to_words(title) 36 | if len(title_ix) == 0: 37 | return [], [] 38 | comment_ix = split_to_words(comment) 39 | return title_ix, comment_ix 40 | 41 | def words_to_ixs(words, all_words, word_to_ix): 42 | for word in words: 43 | if word not in word_to_ix: 44 | word_to_ix[word] = len(all_words) 45 | all_words.append(word) 46 | return [word_to_ix[w] for w in words] 47 | 48 | #Read the file line by line 49 | print("Parsing...") 50 | all_title_words = [] 51 | all_comment_words = [] 52 | with codecs.open(INPUT_DATA, 'r', encoding='utf-8') as fin: 53 | for line in fin: 54 | title_words, comment_words = parse_line(line) 55 | if len(title_words) == 0 or len(comment_words) == 0: 56 | continue 57 | all_title_words.append(title_words) 58 | all_comment_words.append(comment_words) 59 | 60 | #Generate a word frequency to help eliminate uncommon samples 61 | print("Counting Occurrence...") 62 | comment_word_count = {} 63 | for comment_words in all_comment_words: 64 | for word in comment_words: 65 | if word in comment_word_count: 66 | comment_word_count[word] += 1 67 | else: 68 | comment_word_count[word] = 1 69 | 70 | #Eliminate any words that appeared only once 71 | print("Eliminating Ultra-Rare Words...") 72 | all_title_ixs = [] 73 | all_comment_ixs = [] 74 | title_word_list = [''] 75 | title_word_map= {'':0} 76 | comment_word_list = [''] 77 | 
comment_word_map = {'':0} 78 | for title_words, comment_words in zip(all_title_words, all_comment_words): 79 | for word in comment_words: 80 | if comment_word_count[word] <= 1: 81 | break 82 | else: 83 | all_title_ixs.append(words_to_ixs(title_words, title_word_list, title_word_map)) 84 | all_comment_ixs.append(words_to_ixs(comment_words, comment_word_list, comment_word_map)) 85 | 86 | #Generate lengths for the flattened data 87 | print("Converting To Indices...") 88 | all_title_lens = [] 89 | all_comment_lens = [] 90 | for title_ix in all_title_ixs: 91 | all_title_lens.append(len(title_ix)) 92 | for comment_ix in all_comment_ixs: 93 | all_comment_lens.append(len(comment_ix)) 94 | all_title_ixs = list(itertools.chain.from_iterable(all_title_ixs)) 95 | all_comment_ixs = list(itertools.chain.from_iterable(all_comment_ixs)) 96 | 97 | #Write results with numpy 98 | print("Total Pairs: " + str(len(all_title_lens))) 99 | print("Total Title Words: " + str(len(all_title_ixs))) 100 | print("Total Comment Words: " + str(len(all_comment_ixs))) 101 | np.save(SAVE_DIR + 'titles.npy', np.array(all_title_ixs, dtype=np.int32)) 102 | np.save(SAVE_DIR + 'comments.npy', np.array(all_comment_ixs, dtype=np.int32)) 103 | np.save(SAVE_DIR + 'title_lens.npy', np.array(all_title_lens, dtype=np.int32)) 104 | np.save(SAVE_DIR + 'comment_lens.npy', np.array(all_comment_lens, dtype=np.int32)) 105 | 106 | #Save dictionary of all used words 107 | print("Title Dict Size: " + str(len(title_word_list))) 108 | with open(SAVE_DIR + 'title_dict.txt', 'w') as fout: 109 | for w in title_word_list: 110 | fout.write(w + '\n') 111 | print("Comment Dict Size: " + str(len(comment_word_list))) 112 | with open(SAVE_DIR + 'comment_dict.txt', 'w') as fout: 113 | for w in comment_word_list: 114 | fout.write(w + '\n') 115 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # YouTubeCommenter 2 | AI to generate YouTube comments based on video title. 3 | 4 | **PLEASE NOTE:** This project is primarily available for reference; it is not in active development. 5 | 6 | # Video 7 | https://youtu.be/tY6SvZEic9k -------------------------------------------------------------------------------- /Scrape.py: -------------------------------------------------------------------------------- 1 | from urllib.request import urlopen 2 | import os, re 3 | import codecs 4 | import json 5 | 6 | #Update this line with your API key 7 | API_KEY = '' 8 | 9 | JUSTIN_Y_CHANNEL = 'UCiTfB-A55Vq2fB610vaWJVA' 10 | SAVE_DIR = 'scraped/' 11 | DATA_FILE = SAVE_DIR + 'all_comments.txt' 12 | NUM_COMMENT_PAGES = 1 13 | READ_CACHE_ONLY = False 14 | 15 | ALL_COMMENTS = True 16 | ALL_COMMENTS_LEN_MIN = 16 17 | ALL_COMMENTS_LEN_MAX = 120 18 | 19 | ALL_PLAYLISTS = [ 20 | 'LLiTfB-A55Vq2fB610vaWJVA', #5000 - Justin Y's liked videos 21 | 'PLEDLY9RcAVEAIhk1PO_3HNKZnqo8Tk6b5', #195 - The Great ones 22 | 'PLAZQuoSz85i6ZXYGUGoTny2Foj-GOAwde', #445 - the justin y. archive 23 | 'PLDqtsMAe_KP1FOYhp2PB6Og4KkMiNaHtf', #365 - Videos that Justin Y. has commented on 24 | 'PLdwJKVt5J4kPa4hpJb9T-ai3gCgn_aBns', #226 - Videos visited by our god king Justin Y 25 | 'PLdCWc2HiLQ-KiJCvVAPbItZng7L3zQeoD', #116 - Justin Y. comments on here 26 | 'PLs9Y9RDj_m8GFrXCJVvmjXXrHb3TvtEWZ', #197 - Justin Y. Comments 27 | 'PLXzFN31Zk_rJv9pbpLoBC4tom2upYDSpC', #110 - Justin y. comments 28 | 'PLjlDY1ZxBlPKuaikG2_PfzYhOj04WXihN', #268 - Videos that have Justin Y.
29 | 'PL2P9kapzgxPKt1R4a1uhaSTrBBWgYliP3', #307 - Vids With Justin Y.'s Comment 30 | 'PLflokqibVlnOJ_oJo99UaOpiTT1iX0jUc', #118 - Where Justin Y. is 31 | 'PLyZd6Up2ag97WCkrF94SoTYpo9el5Jbl4', #256 - Justin Y. Encounter 32 | 'PLEDLY9RcAVEAPgF9Oas2HY1Irv0fANt-v', #414 - Videos involving me 33 | 'PLsLRmcu39w_EWxoKXVu3BP3_YT_xVhevT', #187 - Justin Y commented 34 | 'PLcl0qJ8euYS2KNnNBAsl_AYpMGj559ql_', #386 - The chonicles of human achievement 35 | 'PLEDLY9RcAVECo1fh3XI4Thh5yORYLcYu5', #71 - Stuff to watch at 3 AM 36 | 'PL36thkaVLp_cjbfm7wC5Y72C0CKXclILj', #85 - Justin Y sightings 37 | 'PLoJiUwNc7oHTyNVH5r-D357Quz27k6C2r', #87 - Every Video I've watched and found Justin Y. in it 38 | 'PLWoMlj_1yvw7u4Swrh6xLSK62BfEi-clx', #101 - Justin Y, 03:00 69 (Oldest Published) 39 | 'PL8cL6SlaG8o9sl93VEY-WvCdFlC1sW_Fb', #91 - Justin Y 40 | 'PLzuaYEgVgFn6K3pxIkVGHx6reD61Gi4HH', #412 - Can You Find Justin Y. in the comments 41 | 'PL73NEkiN4yYhmHWwxuLKoH4p-NdJess4p', #303 - Justin Y. will be obliterated 42 | 'PLWtaAQnBZMhc7wMKeV5OYm-HgLw5FFhFo', #129 - Mystery 43 | 'PLjbgElHjuOFt7Jr96LO44-hnA0mvk8OQN', #119 - Justin Y. Comments in videos 44 | 'PLwDG7WIMSrIv5yyHK2iAe4NPaw7shSwA-', #74 - Videos I watch that Justin y commented on 45 | 'PLlkgpk6VYN-VC06r47a-ujPy817WRItP8', #138 - Justin Y. 46 | 'PLlDNv7LhQsqHNwz_qCLoQ00uV3ZaUZWKm', #207 - Justin Y 47 | 'PL_KQOLf_bJf2MmCiYiWeXTPpZuh00yrTT', #94 - Justin Y. 48 | 'PLWoMlj_1yvw6I2T8LBDRE3Dl-009QG0UV', #196 - Justin Y. Presents The Great Ones (Oldest Published) 49 | 'PL-qupiaXIo35dwoLWPX4h21-AhCEghOJk', #127 - Videos with Justin Y Comments 50 | 'PL2S1ygZEsbrq7LnFYZ4Vo_fwBAZN_usLl', #38 - Videos Justin Y comments on 51 | 'PLWCtQoesywqPQhmKo6r8_za2gd6qzg89L', #40 - JUSTIN Y. 52 | 'PLW1z7JmxIwi3NngIaihev50HbibVMUR7T', #108 - Justin T. 53 | 'PLfnjQQeSRtRDJGvPY9_2x3bg1Ibd5UW3t', #43 - Justin y 54 | 'PLqfTYcfGfk2Z95KaNly6gLV2ksDr6BW1i', #48 - Justin Y. Video playlist 55 | 'PLmV_z0VTBoQM7I0c2P0Ne89mFy6GRRrNz', #150 - Justin Y. Is here 56 | 'PLpZ6EQdG37vDF0-hslIe6BkcKxFwwvyYW', #1179 - Weird side of YouTube 57 | 'PLEDLY9RcAVEClHaZTv4D2B_1ZbT6prKzR', #55 - Weeb Stuff 58 | 'PLc5e3gLSmfQoKhG4TaujsyQ9Tnzo2HusM', #53 - Justin. Y was here 59 | 'PL5IiKBxO9ciwo1kzKsTRWAttaUPOJa7ko', #69 - Justin Y 60 | 'PL2P9kapzgxPKt1R4a1uhaSTrBBWgYliP3', #313 - Vids With Justin Y.'s Comment 61 | 'PLaMsHFoQCfi-c04jfFe5tRbWPjphMrNs7', #65 - Videos I've seen Justin Y. in 62 | 'PLTMsl9Zjn8658xhd3QolGv0xeQ5qtfjSq', #25 - Finding Justin y 63 | 'PLMqyXJorJPzAvyHgV4rrixNuIEjI2Jc-g', #31 - Where I find Justin Y. 64 | 'PLth-U9OH9GxXZf6f3rUFIyfne-eBV4cU8', #36 - Justin Y. 
hunt 65 | 'PLg1UXes0H5Kr_0pirdTJSUPYtBSlYsPoq', #21 - Justin y is here 66 | 'PLc6nYgH9n1HnVaiKaoEwJSywuoeewRC1B', #41 - The great ones 67 | 'PLOJTuMA4-7oEAb6qTHG6J98Q66452tQyw', #35 - Nightcore 68 | 'PL5as-6qnU5d87-bosb3Edbp1tRL-UYxTP', #2706 - The best Videos on YouTube and Vines 69 | 'PLv3TTBr1W_9tppikBxAE_G6qjWdBljBHJ', #2799 - Instant Regret Clicking this Playlist 70 | 'PLSCxT16tijxi7VuC3Uh3nPLmJocr7Ju28', #2346 - Cy's dank meme playlist 71 | 'LLp1FjmTu8nw4lkKlU7iI74w', #801 - Gavin's Liked Videos 72 | 'LLa6TeYZ2DlueFRne5DyAnyg', #910 - Grandayy's Liked Videos 73 | 'LL1EW42tsTQTuFIkKcAhufkw', #616 - Talking Nonsense's Liked Videos 74 | 'LLq_X6pFQK2r8ptOq64nMYbA', #5000 - Misaka Mikoto's Liked Videos 75 | 'LLFBuuvyZWLmYX_ve0RsBxbQ', #1261 - kermit's Liked Videos 76 | 'LLQMjMW-9PhWoH6TWwmnVWvA', #2762 - CallMeCarson's Liked Videos 77 | 'LLt7E8Qpue2TU9Yh47vkEbsQ', #420 - Dolan Dark's Liked Vidoes 78 | 'LL0vXwnNFwrXRlje-gSxw-Eg', #5000 - DatfaceddoeThe2nd Aka The Master Of Kirby's Liked Videos 79 | 'LLt-GOpCw4dOBlIyqL9A1ztA', #5000 - Sr Pelo's Liked Videos 80 | 'LLMYTaTc_gVRyGF6LWzdIsqA', #3551 - Cyranek's Liked Videos 81 | 'LL9ecwl3FTG66jIKA9JRDtmg', #3221 - SiIvaGunner's Liked Videos 82 | 'LLYzPXprvl5Y-Sf0g4vX-m6g', #1288 - jacksepticeye's Liked Videos 83 | 'LLk6rHCnCNxqWHKFknnyZGZw', #781 - blazeaster 84 | 'PL68kEVQCeE3oxk3hZms2nJ0s3kYrp0rbj', #4873 - Slightly Less Important Videos I 85 | 'LLo8bcnLyZH8tBIH9V1mLgqQ', #1145 - TheOdd1sOut's Liked Videos 86 | 'LLllm3HivMERwu2x2Sjz5EIg', #795 - Vargskelethor Joel's Liked Videos 87 | 'LLny_vGt2N7_QJ5qBOAHxlcw', #932 - maxmoefoe's Liked Videos 88 | 'LLQ4FyiI_1mWI2AtLS5ChdPQ', #1163 - Boyinaband's Liked Videos 89 | 'PLv3TTBr1W_9vPB6WPEnPwOpLYeZQW5tuD', #4998 - Instant Regret Clicking This Playlist 2.0 90 | 'LLGwu0nbY2wSkW8N-cghnLpA', #818 - Jaiden Animations's Liked Videos 91 | 'LLPcFg7aBbaVzXoIKSNqwaww', #1451 - jacksfilms's Liked Videos 92 | 'LLu6v4AdYxVH5fhfq9mi5llA', #459 - Let Me Explain Studios's Liked Videos 93 | 'LLJ0-OtVpF0wOKEqT2Z1HEtA', #318 - ElectroBOOM's Liked Videos 94 | 'LLo1qj9072AgkWlmkR-PLwCQ', #486 - AngeloJFurfaro's Liked Videos 95 | ] 96 | 97 | #Create directory to hold downloaded data 98 | if not os.path.exists(SAVE_DIR): 99 | os.makedirs(SAVE_DIR) 100 | 101 | #Try to load the existing database so far 102 | all_comments = {} 103 | try: 104 | with codecs.open(DATA_FILE, 'r', encoding='utf-8') as fin: 105 | for line in fin: 106 | vid, title, comment = line[:-1].split('~') 107 | all_comments[vid] = (title, comment) 108 | except: 109 | pass 110 | 111 | def scrape_playlist(playlist): 112 | NEXT_PAGE = '' 113 | while True: 114 | #Setup strings 115 | PLAYLIST_URL = 'https://www.googleapis.com/youtube/v3/playlistItems?key=' + API_KEY + '&part=snippet&playlistId=' + playlist + '&maxResults=50' 116 | if NEXT_PAGE != '': 117 | PLAYLIST_URL += '&pageToken=' + NEXT_PAGE 118 | SAVE_FILE = SAVE_DIR + 'ply_' + playlist + '_' + str(NEXT_PAGE) + '.txt' 119 | data_out = codecs.open(DATA_FILE, 'a', encoding='utf-8') 120 | 121 | #Download the query (or load from file if cached) 122 | if os.path.isfile(SAVE_FILE): 123 | query_str = "" 124 | with codecs.open(SAVE_FILE, 'r', encoding='utf-8') as fin: 125 | query_str = fin.read() 126 | else: 127 | if READ_CACHE_ONLY: 128 | break 129 | query_str = urlopen(PLAYLIST_URL, timeout=10).read().decode('utf-8') 130 | with codecs.open(SAVE_FILE, 'w', encoding='utf-8') as fout: 131 | fout.write(query_str) 132 | 133 | #Ignore if query is empty 134 | if len(query_str) == 0: 135 | print("===== WARNING: Empty 
Response =====") 136 | return 137 | 138 | #Loop over all videos in the playlist 139 | query_json = json.loads(query_str) 140 | items = query_json['items'] 141 | for item in items: 142 | #Get the video information 143 | snippet = item['snippet'] 144 | title = snippet['title'] 145 | vid = snippet['resourceId']['videoId'] 146 | if vid in all_comments: 147 | continue 148 | 149 | #Scrape Justin Y comments from the video 150 | try: 151 | good_comments = scrape_api(vid) 152 | except: 153 | good_comments = [] 154 | 155 | #Clean the text a bit and add it to the data set 156 | title = title.replace('\n',' . ').replace('\r',' . ').replace('~',' ') 157 | print(title.encode('utf-8').decode()) 158 | 159 | #Save all the good comments 160 | for good_comment in good_comments: 161 | good_comment = good_comment.replace('\n',' . ').replace('\r',' . ').replace('~',' ') 162 | data_out.write(vid + '~' + title + '~' + good_comment + '\n') 163 | all_comments[vid] = (title, good_comment) 164 | print(" " + good_comment.encode('utf-8').decode()) 165 | 166 | #Get the next page to process or quit if done 167 | data_out.close() 168 | if 'nextPageToken' in query_json: 169 | NEXT_PAGE = query_json['nextPageToken'] 170 | else: 171 | break 172 | 173 | def scrape_api(vid): 174 | cur_page = '' 175 | good_comments = [] 176 | for i in range(NUM_COMMENT_PAGES): 177 | #Setup strings 178 | COMMENT_URL = 'https://www.googleapis.com/youtube/v3/commentThreads?key=' + API_KEY + '&textFormat=plainText&part=snippet&videoId=' + vid + '&maxResults=100&order=relevance' 179 | if len(cur_page) > 0: 180 | COMMENT_URL += '&pageToken=' + cur_page 181 | SAVE_FILE = SAVE_DIR + 'com_' + vid + cur_page + '.txt' 182 | 183 | #Download the query (or load from file if cached) 184 | if os.path.isfile(SAVE_FILE): 185 | query_str = '' 186 | with codecs.open(SAVE_FILE, 'r', encoding='utf-8') as fin: 187 | query_str = fin.read() 188 | else: 189 | if READ_CACHE_ONLY: 190 | continue 191 | query_str = urlopen(COMMENT_URL).read().decode('utf-8') 192 | with open(SAVE_FILE, 'w', encoding='utf-8') as fout: 193 | fout.write(query_str) 194 | 195 | query_json = json.loads(query_str) 196 | items = query_json['items'] 197 | if ALL_COMMENTS: 198 | #Look for popular comments and add them 199 | for j in range(len(items)): 200 | item = items[j] 201 | snippet = item['snippet']['topLevelComment']['snippet'] 202 | num_likes = int(snippet['likeCount']) 203 | if num_likes < 50 or (j >= 3 and num_likes < 200): 204 | continue 205 | comment = snippet['textDisplay'] 206 | if len(comment) < ALL_COMMENTS_LEN_MIN or len(comment) > ALL_COMMENTS_LEN_MAX: 207 | continue 208 | good_comments.append(comment) 209 | else: 210 | #Look for Justin Y comments 211 | justin_y_comment = '' 212 | for item in items: 213 | snippet = item['snippet']['topLevelComment']['snippet'] 214 | if JUSTIN_Y_CHANNEL in snippet['authorChannelUrl']: 215 | justin_y_comment = snippet['textDisplay'] 216 | break 217 | 218 | #Return result if found 219 | if len(justin_y_comment) > 0: 220 | return [justin_y_comment] 221 | 222 | #Otherwise search the next page 223 | if 'nextPageToken' in query_json: 224 | cur_page = query_json['nextPageToken'] 225 | else: 226 | break 227 | 228 | #Return whatever was found 229 | return good_comments 230 | 231 | for playlist in ALL_PLAYLISTS: 232 | print("==========================================================") 233 | print(" Starting playlist " + playlist) 234 | print("==========================================================") 235 | print("") 236 | scrape_playlist(playlist) 237 |
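#--- Illustrative sketch: the all_comments.txt record format ---
#Minimal, self-contained example (not called by the scraper above). It shows the
#one-record-per-line format that scrape_playlist() appends to scraped/all_comments.txt,
#i.e. "videoId~title~comment", and the matching split that Parse.py performs on each
#line. The sample values in the usage comment are hypothetical placeholders, not real
#scraped data.

def format_record(vid, title, comment):
    #Strip characters that would break the single-line, '~'-separated format,
    #mirroring the replace() calls in scrape_playlist() above
    title = title.replace('\n', ' . ').replace('\r', ' . ').replace('~', ' ')
    comment = comment.replace('\n', ' . ').replace('\r', ' . ').replace('~', ' ')
    return vid + '~' + title + '~' + comment + '\n'

def parse_record(line):
    #Inverse of format_record, matching the split used in Parse.py
    vid, title, comment = line[:-1].split('~')
    return vid, title, comment

#Usage (hypothetical values):
#  format_record('videoid12345', 'Some Title', 'Some comment')
#    -> 'videoid12345~Some Title~Some comment\n'
#  parse_record('videoid12345~Some Title~Some comment\n')
#    -> ('videoid12345', 'Some Title', 'Some comment')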
-------------------------------------------------------------------------------- /Train.py: -------------------------------------------------------------------------------- 1 | import os, sys 2 | import numpy as np 3 | from matplotlib import pyplot as plt 4 | import util 5 | 6 | SEQ_SIZE = 8 7 | CTXT_SIZE = 200 8 | EMBEDDING_SIZE = 200 9 | USE_LSTM = True 10 | USE_OUT_SEQ = False 11 | CONTINUE_TRAIN = False 12 | NUM_EPOCHS = 100 13 | NUM_MINI_EPOCHS = 1 14 | BATCH_SIZE = 200 15 | LR = 0.001 16 | DO_RATE = 0.05 17 | BN = 0.99 18 | SAVE_DIR = 'trained_all/' 19 | PARSED_DIR = 'parsed_all/' 20 | 21 | #Create directory to save model 22 | if not os.path.exists(SAVE_DIR): 23 | os.makedirs(SAVE_DIR) 24 | 25 | #Load comment dictionary 26 | comment_words, comment_word_to_ix = util.load_comment_dict(PARSED_DIR) 27 | comment_dict_size = len(comment_words) 28 | 29 | #Load training samples 30 | title_ix_samples, title_unique_samples, past_samples, pred_samples = util.create_training_samples(PARSED_DIR, SEQ_SIZE, USE_OUT_SEQ) 31 | num_samples = past_samples.shape[0] 32 | 33 | #Load Keras and Theano 34 | print("Loading Keras...") 35 | import os, math 36 | #os.environ['THEANORC'] = "./gpu.theanorc" 37 | os.environ['KERAS_BACKEND'] = "tensorflow" 38 | import tensorflow as tf 39 | print("Tensorflow Version: " + tf.__version__) 40 | import keras 41 | print("Keras Version: " + keras.__version__) 42 | from keras.layers import Input, Dense, Activation, Dropout, Flatten, Reshape, RepeatVector, TimeDistributed, LeakyReLU, CuDNNGRU, concatenate 43 | from keras.layers.convolutional import Conv2D, Conv2DTranspose, UpSampling2D, Convolution1D 44 | from keras.layers.embeddings import Embedding 45 | from keras.layers.local import LocallyConnected2D 46 | from keras.layers.pooling import MaxPooling2D 47 | from keras.layers.noise import GaussianNoise 48 | from keras.layers.normalization import BatchNormalization 49 | from keras.layers.recurrent import LSTM, SimpleRNN, GRU 50 | from keras.models import Model, Sequential, load_model, model_from_json 51 | from keras.optimizers import Adam, RMSprop, SGD 52 | from keras.preprocessing.image import ImageDataGenerator 53 | from keras.regularizers import l1 54 | from keras.utils import plot_model, to_categorical 55 | from keras import backend as K 56 | K.set_image_data_format('channels_first') 57 | 58 | #Fix bug with sparse_categorical_accuracy 59 | from tensorflow.python.ops import math_ops 60 | from tensorflow.python.framework import ops 61 | from tensorflow.python.keras import backend as K 62 | from tensorflow.python.ops import array_ops 63 | def new_sparse_categorical_accuracy(y_true, y_pred): 64 | y_pred_rank = ops.convert_to_tensor(y_pred).get_shape().ndims 65 | y_true_rank = ops.convert_to_tensor(y_true).get_shape().ndims 66 | # If the shape of y_true is (num_samples, 1), squeeze to (num_samples,) 67 | if (y_true_rank is not None) and (y_pred_rank is not None) and (len(K.int_shape(y_true)) == len(K.int_shape(y_pred))): 68 | y_true = array_ops.squeeze(y_true, [-1]) 69 | y_pred = math_ops.argmax(y_pred, axis=-1) 70 | # If the predicted output and actual output types don't match, force cast them 71 | # to match. 
72 | if K.dtype(y_pred) != K.dtype(y_true): 73 | y_pred = math_ops.cast(y_pred, K.dtype(y_true)) 74 | return math_ops.cast(math_ops.equal(y_true, y_pred), K.floatx()) 75 | 76 | #Build the training models 77 | if CONTINUE_TRAIN: 78 | print("Loading Model...") 79 | model = load_model(SAVE_DIR + 'Model.h5') 80 | else: 81 | print("Building Model...") 82 | ctxt_in = Input(shape=title_unique_samples.shape[1:]) 83 | past_in = Input(shape=past_samples.shape[1:]) 84 | 85 | if USE_LSTM: 86 | ctxt_dense = Dense(CTXT_SIZE)(ctxt_in) 87 | ctxt_dense = LeakyReLU(0.2)(ctxt_dense) 88 | ctxt_dense = RepeatVector(SEQ_SIZE)(ctxt_dense) 89 | 90 | past_dense = Embedding(comment_dict_size, EMBEDDING_SIZE, input_length=SEQ_SIZE)(past_in) 91 | x = concatenate([ctxt_dense, past_dense]) 92 | x = Dropout(DO_RATE)(x) 93 | 94 | x = CuDNNGRU(200, return_sequences=USE_OUT_SEQ)(x) 95 | if USE_OUT_SEQ: 96 | x = TimeDistributed(BatchNormalization(momentum=BN))(x) 97 | x = TimeDistributed(Dense(comment_dict_size, activation='softmax'))(x) 98 | else: 99 | x = BatchNormalization(momentum=BN)(x) 100 | x = Dense(comment_dict_size, activation='softmax')(x) 101 | else: 102 | ctxt_dense = Dense(CTXT_SIZE)(ctxt_in) 103 | ctxt_dense = LeakyReLU(0.2)(ctxt_dense) 104 | past_dense = Embedding(comment_dict_size, EMBEDDING_SIZE, input_length=SEQ_SIZE)(past_in) 105 | past_dense = Flatten(data_format = 'channels_last')(past_dense) 106 | x = concatenate([ctxt_dense, past_dense]) 107 | 108 | x = Dense(800)(x) 109 | x = LeakyReLU(0.2)(x) 110 | if DO_RATE > 0.0: 111 | x = Dropout(DO_RATE)(x) 112 | #x = BatchNormalization(momentum=BN)(x) 113 | 114 | x = Dense(400)(x) 115 | x = LeakyReLU(0.2)(x) 116 | if DO_RATE > 0.0: 117 | x = Dropout(DO_RATE)(x) 118 | #x = BatchNormalization(momentum=BN)(x) 119 | 120 | x = Dense(comment_dict_size, activation='softmax')(x) 121 | 122 | if USE_OUT_SEQ: 123 | metric = new_sparse_categorical_accuracy 124 | else: 125 | metric = 'sparse_categorical_accuracy' 126 | 127 | model = Model(inputs=[ctxt_in, past_in], outputs=[x]) 128 | model.compile(optimizer=Adam(lr=LR), loss='sparse_categorical_crossentropy', metrics=[metric]) 129 | print(model.summary()) 130 | 131 | #plot_model(model, to_file=SAVE_DIR + 'model.png', show_shapes=True) 132 | 133 | #Utilites 134 | def plotScores(scores, test_scores, fname, on_top=True): 135 | plt.clf() 136 | ax = plt.gca() 137 | ax.yaxis.tick_right() 138 | ax.yaxis.set_ticks_position('both') 139 | ax.yaxis.grid(True) 140 | plt.plot(scores) 141 | plt.plot(test_scores) 142 | plt.xlabel('Epoch') 143 | plt.tight_layout() 144 | loc = ('upper right' if on_top else 'lower right') 145 | plt.draw() 146 | plt.savefig(fname) 147 | 148 | #Train model 149 | print("Training...") 150 | train_loss = [] 151 | train_acc = [] 152 | test_loss = [] 153 | test_acc = [] 154 | i_train = np.arange(num_samples) 155 | batches_per_epoch = num_samples // BATCH_SIZE 156 | for epoch in range(NUM_EPOCHS): 157 | np.random.shuffle(i_train) 158 | for j in range(NUM_MINI_EPOCHS): 159 | loss = 0.0 160 | acc = 0.0 161 | num = 0.0 162 | start_i = batches_per_epoch * j // NUM_MINI_EPOCHS 163 | end_i = batches_per_epoch * (j + 1) // NUM_MINI_EPOCHS 164 | for i in range(start_i, end_i): 165 | i_batch = i_train[i*BATCH_SIZE:(i + 1)*BATCH_SIZE] 166 | title_batch = title_unique_samples[title_ix_samples[i_batch]] 167 | past_batch = past_samples[i_batch] 168 | pred_batch = pred_samples[i_batch] 169 | 170 | batch_loss, batch_acc = model.train_on_batch([title_batch, past_batch], [pred_batch]) 171 | loss += batch_loss 172 | acc += 
batch_acc 173 | num += 1.0 174 | 175 | if i % 5 == 0: 176 | progress = ((i - start_i) * 100) // (end_i - start_i) 177 | sys.stdout.write( 178 | str(progress) + "%" + 179 | " Loss:" + str(loss / num) + 180 | " Acc:" + str(acc / num) + " ") 181 | sys.stdout.write('\r') 182 | sys.stdout.flush() 183 | sys.stdout.write('\n') 184 | loss /= num 185 | acc /= num 186 | 187 | train_loss.append(loss) 188 | train_acc.append(acc) 189 | 190 | plotScores(train_loss, test_loss, SAVE_DIR + 'Loss.png', True) 191 | plotScores(train_acc, test_acc, SAVE_DIR + 'Acc.png', False) 192 | 193 | if loss == min(train_loss): 194 | model.save(SAVE_DIR + 'Model.h5') 195 | print("Saved") 196 | 197 | print("==== EPOCH FINISHED ====") 198 | 199 | print("Done") 200 | -------------------------------------------------------------------------------- /YTCommenter.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HackerPoet/YouTubeCommenter/6df02550abd15a0559bcc8fdce668b3a4aa32133/YTCommenter.zip -------------------------------------------------------------------------------- /util.py: -------------------------------------------------------------------------------- 1 | import os, re 2 | import numpy as np 3 | 4 | def clean_text(text): 5 | for c in [u"\u0060", u"\u00B4", u"\u2018", u"\u2019"]: 6 | text = text.replace(c, "'") 7 | for c in [u"\u00C0", u"\u00C1", u"\u00C2", u"\u00C3", u"\u00C4", u"\u00C5", 8 | u"\u00E0", u"\u00E1", u"\u00E2", u"\u00E3", u"\u00E4", u"\u00E5"]: 9 | text = text.replace(c, "a") 10 | for c in [u"\u00C8", u"\u00C9", u"\u00CA", u"\u00CB", 11 | u"\u00E8", u"\u00E9", u"\u00EA", u"\u00EB"]: 12 | text = text.replace(c, "e") 13 | for c in [u"\u00CC", u"\u00CD", u"\u00CE", u"\u00CF", 14 | u"\u00EC", u"\u00ED", u"\u00EE", u"\u00EF"]: 15 | text = text.replace(c, "i") 16 | for c in [u"\u00D2", u"\u00D3", u"\u00D4", u"\u00D5", u"\u00D6", 17 | u"\u00F2", u"\u00F3", u"\u00F4", u"\u00F5", u"\u00F6"]: 18 | text = text.replace(c, "o") 19 | for c in [u"\u00DA", u"\u00DB", u"\u00DC", u"\u00DD", 20 | u"\u00FA", u"\u00FB", u"\u00FC", u"\u00FD"]: 21 | text = text.replace(c, "u") 22 | text = text.replace(u"\u00D1", "n").replace(u"\u00F1", "n") 23 | text = text.encode('utf-8').decode() 24 | if 'http' in text: 25 | return '' 26 | text = re.sub(r'[^0-9a-z .,?!\'/:;<>#\-\$%&]', ' ', text.lower()) 27 | text = ' ' + text + ' ' 28 | text = text.replace('&', ' and ') 29 | text = re.sub(r'\.( +\.)+', '..', text) 30 | text = re.sub(r'\.\.+', ' ^ ', text) 31 | text = re.sub(r',+', ',', text) 32 | text = re.sub(r'\-+', '-', text) 33 | text = re.sub(r'\?+', ' ? ', text) 34 | text = re.sub(r'\!+', ' ! 
', text) 35 | text = re.sub(r'\'+', "'", text) 36 | text = re.sub(r';+', ':', text) 37 | text = re.sub(r'/+', ' / ', text) 38 | text = re.sub(r'<+', ' < ', text) 39 | text = re.sub(r'>+', ' > ', text) 40 | text = text.replace('%', '% ') 41 | text = text.replace(' - ', ' : ') 42 | text = text.replace(' -', " - ") 43 | text = text.replace('- ', " - ") 44 | text = text.replace(" '", " ") 45 | text = text.replace("' ", " ") 46 | for c in ".,:": 47 | text = text.replace(c + ' ', ' ' + c + ' ') 48 | #text = re.sub(r' \d\d?:\d\d ', ' 0:00 ', text) 49 | text = re.sub(r' +', ' ', text.strip(' ')) 50 | text = text.replace('^', '...') 51 | return text 52 | 53 | def load_dict(fname, word_list, word_dict): 54 | with open(fname, 'r') as fin: 55 | for line in fin: 56 | line = line[:-1] 57 | assert(line not in word_dict) 58 | word_dict[line] = len(word_list) 59 | word_list.append(line) 60 | assert(word_list[0] == '') 61 | 62 | def load_title_dict(PARSED_DIR): 63 | title_words = [] 64 | title_word_to_ix = {} 65 | load_dict(PARSED_DIR + 'title_dict.txt', title_words, title_word_to_ix) 66 | print("Loaded " + str(len(title_words)) + " title word dictionary.") 67 | return title_words, title_word_to_ix 68 | 69 | def load_comment_dict(PARSED_DIR): 70 | comment_words = [] 71 | comment_word_to_ix = {} 72 | load_dict(PARSED_DIR + 'comment_dict.txt', comment_words, comment_word_to_ix) 73 | print("Loaded " + str(len(comment_words)) + " comment word dictionary.") 74 | return comment_words, comment_word_to_ix 75 | 76 | def load_title_sentences(PARSED_DIR): 77 | #Load the raw data 78 | print("Loading Titles...") 79 | titles = np.load(PARSED_DIR + 'titles.npy') 80 | title_lens = np.load(PARSED_DIR + 'title_lens.npy') 81 | print("Loaded " + str(len(title_lens)) + " titles.") 82 | 83 | #Extract all title sentences 84 | title_ix = 0 85 | title_sentences = [] 86 | for title_len in title_lens: 87 | title_sentences.append(titles[title_ix:title_ix + title_len]) 88 | title_ix += title_len 89 | return title_sentences 90 | 91 | def load_comment_sentences(PARSED_DIR): 92 | #Load the raw data 93 | print("Loading Comments...") 94 | comments = np.load(PARSED_DIR + 'comments.npy') 95 | comment_lens = np.load(PARSED_DIR + 'comment_lens.npy') 96 | print("Loaded " + str(len(comment_lens)) + " comments.") 97 | 98 | #Extract all comment sentences 99 | comment_ix = 0 100 | comment_sentences = [] 101 | for comment_len in comment_lens: 102 | comment_sentences.append(comments[comment_ix:comment_ix + comment_len]) 103 | comment_ix += comment_len 104 | return comment_sentences 105 | 106 | def bag_of_words(title_ixs, title_dict_size): 107 | title_sample = np.zeros((title_dict_size,), dtype=np.uint8) 108 | for ix in title_ixs: 109 | title_sample[ix] = 1 110 | return title_sample 111 | 112 | def create_training_samples(PARSED_DIR, seq_size, out_seq=False): 113 | #Load the raw data 114 | print("Loading Titles...") 115 | titles = np.load(PARSED_DIR + 'titles.npy') 116 | title_lens = np.load(PARSED_DIR + 'title_lens.npy') 117 | print("Loaded " + str(len(title_lens)) + " titles.") 118 | 119 | #Load the raw data 120 | print("Loading Comments...") 121 | comments = np.load(PARSED_DIR + 'comments.npy') 122 | comment_lens = np.load(PARSED_DIR + 'comment_lens.npy') 123 | print("Loaded " + str(len(comment_lens)) + " comments.") 124 | 125 | #Convert to training samples 126 | print("Creating Training Samples...") 127 | title_ix = 0 128 | comment_ix = 0 129 | title_ix_samples = [] 130 | title_unique_samples = [] 131 | past_samples = [] 132 | pred_samples = [] 133 
| title_dict_size = np.amax(titles) + 1 134 | for i in range(title_lens.shape[0]): 135 | title_len = title_lens[i] 136 | title_sample = np.zeros((title_dict_size,), dtype=np.uint8) 137 | for j in range(title_len): 138 | word = titles[title_ix] 139 | title_sample[word] = 1 140 | title_ix += 1 141 | title_unique_samples.append(title_sample) 142 | 143 | comment_len = comment_lens[i] 144 | past_sample = np.zeros((seq_size,), dtype=np.int32) 145 | end_j = comment_len + 1 146 | if out_seq: 147 | end_j = max(end_j, seq_size) 148 | for j in range(end_j): 149 | if not out_seq or j >= seq_size - 1: 150 | title_ix_samples.append(len(title_unique_samples) - 1) 151 | past_samples.append(past_sample) 152 | 153 | if j >= comment_len: 154 | next_word = 0 155 | else: 156 | next_word = comments[comment_ix] 157 | comment_ix += 1 158 | 159 | past_sample = np.roll(past_sample, -1) 160 | past_sample[-1] = next_word 161 | if not out_seq: 162 | pred_samples.append(next_word) 163 | elif j >= seq_size - 1: 164 | pred_samples.append(past_sample) 165 | 166 | num_samples = len(past_samples) 167 | assert(title_ix == len(titles)) 168 | assert(comment_ix == len(comments)) 169 | assert(num_samples == len(pred_samples)) 170 | assert(num_samples == len(title_ix_samples)) 171 | title_ix_samples = np.array(title_ix_samples, dtype=np.int32) 172 | title_unique_samples = np.array(title_unique_samples, dtype=np.uint8) 173 | past_samples = np.array(past_samples, dtype=np.int32) 174 | pred_samples = np.array(pred_samples, dtype=np.int32) 175 | pred_samples = np.expand_dims(pred_samples, axis=-1) 176 | print("Created " + str(num_samples) + " samples.") 177 | 178 | return title_ix_samples, title_unique_samples, past_samples, pred_samples 179 | --------------------------------------------------------------------------------
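For reference, a minimal self-contained sketch of the sliding-window scheme used by create_training_samples() in util.py: each comment is unrolled into (past-word window, next word) pairs, with index 0 acting as the end-of-comment marker. The helper name comment_to_pairs, the toy word indices, and the window size below are made-up example values for illustration only.

import numpy as np

def comment_to_pairs(comment_ixs, seq_size):
    #Rolling window of the last seq_size word indices, initially all zeros
    past = np.zeros((seq_size,), dtype=np.int32)
    pairs = []
    #One extra step so the final pair predicts the end-of-comment marker (0)
    for j in range(len(comment_ixs) + 1):
        next_word = comment_ixs[j] if j < len(comment_ixs) else 0
        pairs.append((past.copy(), next_word))
        past = np.roll(past, -1)
        past[-1] = next_word
    return pairs

#Toy comment made of word indices [5, 9, 2] with a window of 4:
for window, target in comment_to_pairs([5, 9, 2], seq_size=4):
    print(window, '->', target)
#Prints:
# [0 0 0 0] -> 5
# [0 0 0 5] -> 9
# [0 0 5 9] -> 2
# [0 5 9 2] -> 0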