├── .gitignore ├── Generate.py ├── LICENSE ├── Parse.py ├── README.md ├── Scrape.py ├── Train.py ├── YTCommenter.zip └── util.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.bat 2 | *.h5 3 | *.npy 4 | *.png 5 | *.pyc 6 | *.txt -------------------------------------------------------------------------------- /Generate.py: -------------------------------------------------------------------------------- 1 | import os, random, json 2 | import numpy as np 3 | from scipy import stats 4 | import util 5 | 6 | SEQ_SIZE = 8 7 | NUM_TO_GEN = 20 8 | MODEL_DIR = 'trained_all/' 9 | PARSED_DIR = 'parsed_all/' 10 | MAKE_STATEFUL = False 11 | IS_REVERSE = False 12 | 13 | #Load titles 14 | title_words, title_word_to_ix = util.load_title_dict(PARSED_DIR) 15 | title_dict_size = len(title_words) 16 | title_sentences = util.load_title_sentences(PARSED_DIR) 17 | 18 | #Load comments 19 | comment_words, comment_word_to_ix = util.load_comment_dict(PARSED_DIR) 20 | comment_dict_size = len(comment_words) 21 | comment_sentences = util.load_comment_sentences(PARSED_DIR) 22 | assert(len(title_sentences) == len(comment_sentences)) 23 | 24 | def word_ixs_to_str(word_ixs, is_title): 25 | result_txt = "" 26 | for w_ix in word_ixs: 27 | w = (title_words if is_title else comment_words)[w_ix] 28 | if len(result_txt) == 0 or w in ['.', ',', "'", '!', '?', ':', ';', '...']: 29 | result_txt += w 30 | elif len(result_txt) > 0 and result_txt[-1] == "'" and w in ['s', 're', 't', 'll', 've', 'd']: 31 | result_txt += w 32 | else: 33 | result_txt += ' ' + w 34 | if len(result_txt) > 0: 35 | result_txt = result_txt[:1].upper() + result_txt[1:] 36 | return result_txt 37 | 38 | def probs_to_word_ix(pk, is_first): 39 | if is_first: 40 | pk[0] = 0.0 41 | pk /= np.sum(pk) 42 | else: 43 | pk *= pk 44 | pk /= np.sum(pk) 45 | #for i in range(3): 46 | # max_val = np.amax(pk) 47 | # if max_val > 0.5: 48 | # break 49 | # pk *= pk 50 | # pk /= np.sum(pk) 51 | 52 | xk = np.arange(pk.shape[0], dtype=np.int32) 53 | custm = stats.rv_discrete(name='custm', values=(xk, pk)) 54 | return custm.rvs() 55 | 56 | def pred_text(model, context, max_len=64): 57 | output = [] 58 | context = np.expand_dims(context, axis=0) 59 | if MAKE_STATEFUL: 60 | past_sample = np.zeros((1,), dtype=np.int32) 61 | else: 62 | past_sample = np.zeros((SEQ_SIZE,), dtype=np.int32) 63 | while len(output) < max_len: 64 | pk = model.predict([context, np.expand_dims(past_sample, axis=0)], batch_size=1)[-1] 65 | if MAKE_STATEFUL: 66 | pk = pk[0] 67 | else: 68 | past_sample = np.roll(past_sample, 1 if IS_REVERSE else -1) 69 | new_sample = probs_to_word_ix(pk, len(output) == 0) 70 | past_sample[0 if IS_REVERSE else -1] = new_sample 71 | if new_sample == 0: 72 | break 73 | output.append(new_sample) 74 | 75 | model.reset_states() 76 | return output 77 | 78 | #Load Keras and Theano 79 | print("Loading Keras...") 80 | import os, math 81 | os.environ['KERAS_BACKEND'] = "tensorflow" 82 | import tensorflow as tf 83 | print("Tensorflow Version: " + tf.__version__) 84 | import keras 85 | print("Keras Version: " + keras.__version__) 86 | from keras.layers import Input, Dense, Activation, Dropout, Flatten, Reshape, RepeatVector, TimeDistributed, concatenate 87 | from keras.layers.convolutional import Conv2D, Conv2DTranspose, UpSampling2D, Convolution1D 88 | from keras.layers.embeddings import Embedding 89 | from keras.layers.local import LocallyConnected2D 90 | from keras.layers.pooling import MaxPooling2D 91 | from keras.layers.noise import 
GaussianNoise 92 | from keras.layers.normalization import BatchNormalization 93 | from keras.layers.recurrent import LSTM, SimpleRNN, GRU 94 | from keras.models import Model, Sequential, load_model, model_from_json 95 | from keras.optimizers import Adam, RMSprop, SGD 96 | from keras.preprocessing.image import ImageDataGenerator 97 | from keras.regularizers import l1 98 | from keras.utils import plot_model, to_categorical 99 | from keras import backend as K 100 | K.set_image_data_format('channels_first') 101 | 102 | #Fix bug with sparse_categorical_accuracy 103 | from tensorflow.python.ops import math_ops 104 | from tensorflow.python.framework import ops 105 | from tensorflow.python.keras import backend as K 106 | from tensorflow.python.ops import array_ops 107 | def new_sparse_categorical_accuracy(y_true, y_pred): 108 | y_pred_rank = ops.convert_to_tensor(y_pred).get_shape().ndims 109 | y_true_rank = ops.convert_to_tensor(y_true).get_shape().ndims 110 | # If the shape of y_true is (num_samples, 1), squeeze to (num_samples,) 111 | if (y_true_rank is not None) and (y_pred_rank is not None) and (len(K.int_shape(y_true)) == len(K.int_shape(y_pred))): 112 | y_true = array_ops.squeeze(y_true, [-1]) 113 | y_pred = math_ops.argmax(y_pred, axis=-1) 114 | # If the predicted output and actual output types don't match, force cast them 115 | # to match. 116 | if K.dtype(y_pred) != K.dtype(y_true): 117 | y_pred = math_ops.cast(y_pred, K.dtype(y_true)) 118 | return math_ops.cast(math_ops.equal(y_true, y_pred), K.floatx()) 119 | 120 | #Load the model 121 | print("Loading Model...") 122 | model = load_model(MODEL_DIR + 'model.h5', custom_objects={'new_sparse_categorical_accuracy':new_sparse_categorical_accuracy}) 123 | 124 | if MAKE_STATEFUL: 125 | weights = model.get_weights() 126 | model_json = json.loads(model.to_json()) 127 | 128 | layers = model_json['config']['layers'] 129 | for layer in layers: 130 | if 'batch_input_shape' in layer['config']: 131 | layer['config']['batch_input_shape'][0] = 1 132 | if layer['config']['batch_input_shape'][1] == SEQ_SIZE: 133 | layer['config']['batch_input_shape'][1] = 1 134 | if layer['class_name'] == 'Embedding': 135 | layer['config']['input_length'] = 1 136 | if layer['class_name'] == 'RepeatVector': 137 | layer['config']['n'] = 1 138 | if layer['class_name'] == 'LSTM': 139 | assert(layer['config']['stateful'] == False) 140 | layer['config']['stateful'] = True 141 | 142 | print(json.dumps(model_json, indent=4, sort_keys=True)) 143 | model = model_from_json(json.dumps(model_json)) 144 | model.set_weights(weights) 145 | 146 | #plot_model(model, to_file='temp.png', show_shapes=True) 147 | 148 | def generate_titles(my_title): 149 | my_title = util.clean_text(my_title) 150 | my_words = my_title.split(' ') 151 | print(' '.join((w.upper() if w in title_word_to_ix else w) for w in my_words) + '\n') 152 | my_title_ixs = [title_word_to_ix[w] for w in my_words if w in title_word_to_ix] 153 | my_title_sample = util.bag_of_words(my_title_ixs, title_dict_size) 154 | for i in range(10): 155 | print(' ' + word_ixs_to_str(pred_text(model, my_title_sample), False)) 156 | print('') 157 | 158 | while True: 159 | my_title = input('Enter Title:\n') 160 | generate_titles(my_title) 161 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 HackerPoet 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a 
copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Parse.py: -------------------------------------------------------------------------------- 1 | import os, re 2 | import codecs, itertools 3 | import numpy as np 4 | import util 5 | 6 | #INPUT_DATA = 'scraped/data.txt' 7 | INPUT_DATA = 'scraped/all_comments.txt' 8 | SAVE_DIR = 'parsed_all/' 9 | 10 | #Create directory to hold parsed data 11 | if not os.path.exists(SAVE_DIR): 12 | os.makedirs(SAVE_DIR) 13 | 14 | def split_to_words(full_str): 15 | #Split clean sentence into words 16 | words = full_str.split(' ') 17 | 18 | #Remove annoying space-separated letters 19 | num_single_letter = 0 20 | for word in words: 21 | if len(word) == 1 and word in 'abcdefghijklmnopqrstuvwxyzx': 22 | num_single_letter += 1 23 | #If sentence contains an unusually long word, ignore it 24 | if len(word) >= 24: 25 | return [] 26 | if num_single_letter > 5: 27 | return [] 28 | 29 | return [word for word in words if len(word) > 0] 30 | 31 | def parse_line(line): 32 | vid, title, comment = line[:-1].split('~') 33 | title = util.clean_text(title) 34 | comment = util.clean_text(comment) 35 | title_ix = split_to_words(title) 36 | if len(title_ix) == 0: 37 | return [], [] 38 | comment_ix = split_to_words(comment) 39 | return title_ix, comment_ix 40 | 41 | def words_to_ixs(words, all_words, word_to_ix): 42 | for word in words: 43 | if word not in word_to_ix: 44 | word_to_ix[word] = len(all_words) 45 | all_words.append(word) 46 | return [word_to_ix[w] for w in words] 47 | 48 | #Read the file line by line 49 | print("Parsing...") 50 | all_title_words = [] 51 | all_comment_words = [] 52 | with codecs.open(INPUT_DATA, 'r', encoding='utf-8') as fin: 53 | for line in fin: 54 | title_words, comment_words = parse_line(line) 55 | if len(title_words) == 0 or len(comment_words) == 0: 56 | continue 57 | all_title_words.append(title_words) 58 | all_comment_words.append(comment_words) 59 | 60 | #Generate a word frequency to help eliminate uncommon samples 61 | print("Counting Occurrence...") 62 | comment_word_count = {} 63 | for comment_words in all_comment_words: 64 | for word in comment_words: 65 | if word in comment_word_count: 66 | comment_word_count[word] += 1 67 | else: 68 | comment_word_count[word] = 1 69 | 70 | #Eliminate any words that appeared only once 71 | print("Eliminating Ultra-Rare Words...") 72 | all_title_ixs = [] 73 | all_comment_ixs = [] 74 | title_word_list = [''] 75 | title_word_map= {'':0} 76 | comment_word_list = [''] 77 | 
comment_word_map = {'':0} 78 | for title_words, comment_words in zip(all_title_words, all_comment_words): 79 | for word in comment_words: 80 | if comment_word_count[word] <= 1: 81 | break 82 | else: 83 | all_title_ixs.append(words_to_ixs(title_words, title_word_list, title_word_map)) 84 | all_comment_ixs.append(words_to_ixs(comment_words, comment_word_list, comment_word_map)) 85 | 86 | #Generate lengths for the flattened data 87 | print("Converting To Indices...") 88 | all_title_lens = [] 89 | all_comment_lens = [] 90 | for title_ix in all_title_ixs: 91 | all_title_lens.append(len(title_ix)) 92 | for comment_ix in all_comment_ixs: 93 | all_comment_lens.append(len(comment_ix)) 94 | all_title_ixs = list(itertools.chain.from_iterable(all_title_ixs)) 95 | all_comment_ixs = list(itertools.chain.from_iterable(all_comment_ixs)) 96 | 97 | #Write results with numpy 98 | print("Total Pairs: " + str(len(all_title_lens))) 99 | print("Total Title Words: " + str(len(all_title_ixs))) 100 | print("Total Comment Words: " + str(len(all_comment_ixs))) 101 | np.save(SAVE_DIR + 'titles.npy', np.array(all_title_ixs, dtype=np.int32)) 102 | np.save(SAVE_DIR + 'comments.npy', np.array(all_comment_ixs, dtype=np.int32)) 103 | np.save(SAVE_DIR + 'title_lens.npy', np.array(all_title_lens, dtype=np.int32)) 104 | np.save(SAVE_DIR + 'comment_lens.npy', np.array(all_comment_lens, dtype=np.int32)) 105 | 106 | #Save dictionary of all used words 107 | print("Title Dict Size: " + str(len(title_word_list))) 108 | with open(SAVE_DIR + 'title_dict.txt', 'w') as fout: 109 | for w in title_word_list: 110 | fout.write(w + '\n') 111 | print("Comment Dict Size: " + str(len(comment_word_list))) 112 | with open(SAVE_DIR + 'comment_dict.txt', 'w') as fout: 113 | for w in comment_word_list: 114 | fout.write(w + '\n') 115 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # YouTubeCommenter 2 | AI to generate YouTube comments based on video title. 3 | 4 | **PLEASE NOTE:** This project is primarily available for reference; it is not in active development. 5 | 6 | # Video 7 | https://youtu.be/tY6SvZEic9k -------------------------------------------------------------------------------- /Scrape.py: -------------------------------------------------------------------------------- 1 | from urllib.request import urlopen 2 | import os, re 3 | import codecs 4 | import json 5 | 6 | #Update this line with your API key 7 | API_KEY = '' 8 | 9 | JUSTIN_Y_CHANNEL = 'UCiTfB-A55Vq2fB610vaWJVA' 10 | SAVE_DIR = 'scraped/' 11 | DATA_FILE = SAVE_DIR + 'all_comments.txt' 12 | NUM_COMMENT_PAGES = 1 13 | READ_CACHE_ONLY = False 14 | 15 | ALL_COMMENTS = True 16 | ALL_COMMENTS_LEN_MIN = 16 17 | ALL_COMMENTS_LEN_MAX = 120 18 | 19 | ALL_PLAYLISTS = [ 20 | 'LLiTfB-A55Vq2fB610vaWJVA', #5000 - Justin Y's liked videos 21 | 'PLEDLY9RcAVEAIhk1PO_3HNKZnqo8Tk6b5', #195 - The Great ones 22 | 'PLAZQuoSz85i6ZXYGUGoTny2Foj-GOAwde', #445 - the justin y. archive 23 | 'PLDqtsMAe_KP1FOYhp2PB6Og4KkMiNaHtf', #365 - Videos that Justin Y. has commented on 24 | 'PLdwJKVt5J4kPa4hpJb9T-ai3gCgn_aBns', #226 - Videos visited by our god king Justin Y 25 | 'PLdCWc2HiLQ-KiJCvVAPbItZng7L3zQeoD', #116 - Justin Y. comments on here 26 | 'PLs9Y9RDj_m8GFrXCJVvmjXXrHb3TvtEWZ', #197 - Justin Y. Comments 27 | 'PLXzFN31Zk_rJv9pbpLoBC4tom2upYDSpC', #110 - Justin y. comments 28 | 'PLjlDY1ZxBlPKuaikG2_PfzYhOj04WXihN', #268 - Videos that have Justin Y.
29 | 'PL2P9kapzgxPKt1R4a1uhaSTrBBWgYliP3', #307 - Vids With Justin Y.'s Comment 30 | 'PLflokqibVlnOJ_oJo99UaOpiTT1iX0jUc', #118 - Where Justin Y. is 31 | 'PLyZd6Up2ag97WCkrF94SoTYpo9el5Jbl4', #256 - Justin Y. Encounter 32 | 'PLEDLY9RcAVEAPgF9Oas2HY1Irv0fANt-v', #414 - Videos involving me 33 | 'PLsLRmcu39w_EWxoKXVu3BP3_YT_xVhevT', #187 - Justin Y commented 34 | 'PLcl0qJ8euYS2KNnNBAsl_AYpMGj559ql_', #386 - The chonicles of human achievement 35 | 'PLEDLY9RcAVECo1fh3XI4Thh5yORYLcYu5', #71 - Stuff to watch at 3 AM 36 | 'PL36thkaVLp_cjbfm7wC5Y72C0CKXclILj', #85 - Justin Y sightings 37 | 'PLoJiUwNc7oHTyNVH5r-D357Quz27k6C2r', #87 - Every Video I've watched and found Justin Y. in it 38 | 'PLWoMlj_1yvw7u4Swrh6xLSK62BfEi-clx', #101 - Justin Y, 03:00 69 (Oldest Published) 39 | 'PL8cL6SlaG8o9sl93VEY-WvCdFlC1sW_Fb', #91 - Justin Y 40 | 'PLzuaYEgVgFn6K3pxIkVGHx6reD61Gi4HH', #412 - Can You Find Justin Y. in the comments 41 | 'PL73NEkiN4yYhmHWwxuLKoH4p-NdJess4p', #303 - Justin Y. will be obliterated 42 | 'PLWtaAQnBZMhc7wMKeV5OYm-HgLw5FFhFo', #129 - Mystery 43 | 'PLjbgElHjuOFt7Jr96LO44-hnA0mvk8OQN', #119 - Justin Y. Comments in videos 44 | 'PLwDG7WIMSrIv5yyHK2iAe4NPaw7shSwA-', #74 - Videos I watch that Justin y commented on 45 | 'PLlkgpk6VYN-VC06r47a-ujPy817WRItP8', #138 - Justin Y. 46 | 'PLlDNv7LhQsqHNwz_qCLoQ00uV3ZaUZWKm', #207 - Justin Y 47 | 'PL_KQOLf_bJf2MmCiYiWeXTPpZuh00yrTT', #94 - Justin Y. 48 | 'PLWoMlj_1yvw6I2T8LBDRE3Dl-009QG0UV', #196 - Justin Y. Presents The Great Ones (Oldest Published) 49 | 'PL-qupiaXIo35dwoLWPX4h21-AhCEghOJk', #127 - Videos with Justin Y Comments 50 | 'PL2S1ygZEsbrq7LnFYZ4Vo_fwBAZN_usLl', #38 - Videos Justin Y comments on 51 | 'PLWCtQoesywqPQhmKo6r8_za2gd6qzg89L', #40 - JUSTIN Y. 52 | 'PLW1z7JmxIwi3NngIaihev50HbibVMUR7T', #108 - Justin T. 53 | 'PLfnjQQeSRtRDJGvPY9_2x3bg1Ibd5UW3t', #43 - Justin y 54 | 'PLqfTYcfGfk2Z95KaNly6gLV2ksDr6BW1i', #48 - Justin Y. Video playlist 55 | 'PLmV_z0VTBoQM7I0c2P0Ne89mFy6GRRrNz', #150 - Justin Y. Is here 56 | 'PLpZ6EQdG37vDF0-hslIe6BkcKxFwwvyYW', #1179 - Weird side of YouTube 57 | 'PLEDLY9RcAVEClHaZTv4D2B_1ZbT6prKzR', #55 - Weeb Stuff 58 | 'PLc5e3gLSmfQoKhG4TaujsyQ9Tnzo2HusM', #53 - Justin. Y was here 59 | 'PL5IiKBxO9ciwo1kzKsTRWAttaUPOJa7ko', #69 - Justin Y 60 | 'PL2P9kapzgxPKt1R4a1uhaSTrBBWgYliP3', #313 - Vids With Justin Y.'s Comment 61 | 'PLaMsHFoQCfi-c04jfFe5tRbWPjphMrNs7', #65 - Videos I've seen Justin Y. in 62 | 'PLTMsl9Zjn8658xhd3QolGv0xeQ5qtfjSq', #25 - Finding Justin y 63 | 'PLMqyXJorJPzAvyHgV4rrixNuIEjI2Jc-g', #31 - Where I find Justin Y. 64 | 'PLth-U9OH9GxXZf6f3rUFIyfne-eBV4cU8', #36 - Justin Y. 
hunt 65 | 'PLg1UXes0H5Kr_0pirdTJSUPYtBSlYsPoq', #21 - Justin y is here 66 | 'PLc6nYgH9n1HnVaiKaoEwJSywuoeewRC1B', #41 - The great ones 67 | 'PLOJTuMA4-7oEAb6qTHG6J98Q66452tQyw', #35 - Nightcore 68 | 'PL5as-6qnU5d87-bosb3Edbp1tRL-UYxTP', #2706 - The best Videos on YouTube and Vines 69 | 'PLv3TTBr1W_9tppikBxAE_G6qjWdBljBHJ', #2799 - Instant Regret Clicking this Playlist 70 | 'PLSCxT16tijxi7VuC3Uh3nPLmJocr7Ju28', #2346 - Cy's dank meme playlist 71 | 'LLp1FjmTu8nw4lkKlU7iI74w', #801 - Gavin's Liked Videos 72 | 'LLa6TeYZ2DlueFRne5DyAnyg', #910 - Grandayy's Liked Videos 73 | 'LL1EW42tsTQTuFIkKcAhufkw', #616 - Talking Nonsense's Liked Videos 74 | 'LLq_X6pFQK2r8ptOq64nMYbA', #5000 - Misaka Mikoto's Liked Videos 75 | 'LLFBuuvyZWLmYX_ve0RsBxbQ', #1261 - kermit's Liked Videos 76 | 'LLQMjMW-9PhWoH6TWwmnVWvA', #2762 - CallMeCarson's Liked Videos 77 | 'LLt7E8Qpue2TU9Yh47vkEbsQ', #420 - Dolan Dark's Liked Vidoes 78 | 'LL0vXwnNFwrXRlje-gSxw-Eg', #5000 - DatfaceddoeThe2nd Aka The Master Of Kirby's Liked Videos 79 | 'LLt-GOpCw4dOBlIyqL9A1ztA', #5000 - Sr Pelo's Liked Videos 80 | 'LLMYTaTc_gVRyGF6LWzdIsqA', #3551 - Cyranek's Liked Videos 81 | 'LL9ecwl3FTG66jIKA9JRDtmg', #3221 - SiIvaGunner's Liked Videos 82 | 'LLYzPXprvl5Y-Sf0g4vX-m6g', #1288 - jacksepticeye's Liked Videos 83 | 'LLk6rHCnCNxqWHKFknnyZGZw', #781 - blazeaster 84 | 'PL68kEVQCeE3oxk3hZms2nJ0s3kYrp0rbj', #4873 - Slightly Less Important Videos I 85 | 'LLo8bcnLyZH8tBIH9V1mLgqQ', #1145 - TheOdd1sOut's Liked Videos 86 | 'LLllm3HivMERwu2x2Sjz5EIg', #795 - Vargskelethor Joel's Liked Videos 87 | 'LLny_vGt2N7_QJ5qBOAHxlcw', #932 - maxmoefoe's Liked Videos 88 | 'LLQ4FyiI_1mWI2AtLS5ChdPQ', #1163 - Boyinaband's Liked Videos 89 | 'PLv3TTBr1W_9vPB6WPEnPwOpLYeZQW5tuD', #4998 - Instant Regret Clicking This Playlist 2.0 90 | 'LLGwu0nbY2wSkW8N-cghnLpA', #818 - Jaiden Animations's Liked Videos 91 | 'LLPcFg7aBbaVzXoIKSNqwaww', #1451 - jacksfilms's Liked Videos 92 | 'LLu6v4AdYxVH5fhfq9mi5llA', #459 - Let Me Explain Studios's Liked Videos 93 | 'LLJ0-OtVpF0wOKEqT2Z1HEtA', #318 - ElectroBOOM's Liked Videos 94 | 'LLo1qj9072AgkWlmkR-PLwCQ', #486 - AngeloJFurfaro's Liked Videos 95 | ] 96 | 97 | #Create directory to hold downloaded data 98 | if not os.path.exists(SAVE_DIR): 99 | os.makedirs(SAVE_DIR) 100 | 101 | #Try to load the existing database so far 102 | all_comments = {} 103 | try: 104 | with codecs.open(DATA_FILE, 'r', encoding='utf-8') as fin: 105 | for line in fin: 106 | vid, title, comment = line[:-1].split('~') 107 | all_comments[vid] = (title, comment) 108 | except: 109 | pass 110 | 111 | def scrape_playlist(playlist): 112 | NEXT_PAGE = '' 113 | while True: 114 | #Setup strings 115 | PLAYLIST_URL = 'https://www.googleapis.com/youtube/v3/playlistItems?key=' + API_KEY + '&part=snippet&playlistId=' + playlist + '&maxResults=50' 116 | if NEXT_PAGE != '': 117 | PLAYLIST_URL += '&pageToken=' + NEXT_PAGE 118 | SAVE_FILE = SAVE_DIR + 'ply_' + playlist + '_' + str(NEXT_PAGE) + '.txt' 119 | data_out = codecs.open(DATA_FILE, 'a', encoding='utf-8') 120 | 121 | #Download the query (or load from file if cached) 122 | if os.path.isfile(SAVE_FILE): 123 | query_str = "" 124 | with codecs.open(SAVE_FILE, 'r', encoding='utf-8') as fin: 125 | query_str = fin.read() 126 | else: 127 | if READ_CACHE_ONLY: 128 | break 129 | query_str = urlopen(PLAYLIST_URL, timeout=10).read().decode('utf-8') 130 | with codecs.open(SAVE_FILE, 'w', encoding='utf-8') as fout: 131 | fout.write(query_str) 132 | 133 | #Ignore if query is empty 134 | if len(query_str) == 0: 135 | print("===== WARNING: Empty 
Response =====") 136 | return 137 | 138 | #Loop over all videos in the playlist 139 | query_json = json.loads(query_str) 140 | items = query_json['items'] 141 | for item in items: 142 | #Get the video information 143 | snippet = item['snippet'] 144 | title = snippet['title'] 145 | vid = snippet['resourceId']['videoId'] 146 | if vid in all_comments: 147 | continue 148 | 149 | #Scrape Justin Y comments from the video 150 | try: 151 | good_comments = scrape_api(vid) 152 | except: 153 | good_comments = [] 154 | 155 | #Clean the text a bit and add it to the data set 156 | title = title.replace('\n',' . ').replace('\r',' . ').replace('~',' ') 157 | print(title.encode('utf-8').decode()) 158 | 159 | #Save all the good comments 160 | for good_comment in good_comments: 161 | good_comment = good_comment.replace('\n',' . ').replace('\r',' . ').replace('~',' ') 162 | data_out.write(vid + '~' + title + '~' + good_comment + '\n') 163 | all_comments[vid] = (title, good_comment) 164 | print(" " + good_comment.encode('utf-8').decode()) 165 | 166 | #Get the next page to process or quit if done 167 | data_out.close() 168 | if 'nextPageToken' in query_json: 169 | NEXT_PAGE = query_json['nextPageToken'] 170 | else: 171 | break 172 | 173 | def scrape_api(vid): 174 | cur_page = '' 175 | good_comments = [] 176 | for i in range(NUM_COMMENT_PAGES): 177 | #Setup strings 178 | COMMENT_URL = 'https://www.googleapis.com/youtube/v3/commentThreads?key=' + API_KEY + '&textFormat=plainText&part=snippet&videoId=' + vid + '&maxResults=100&order=relevance' 179 | if len(cur_page) > 0: 180 | COMMENT_URL += '&pageToken=' + cur_page 181 | SAVE_FILE = SAVE_DIR + 'com_' + vid + cur_page + '.txt' 182 | 183 | #Download the query (or load from file if cached) 184 | if os.path.isfile(SAVE_FILE): 185 | query_str = '' 186 | with codecs.open(SAVE_FILE, 'r', encoding='utf-8') as fin: 187 | query_str = fin.read() 188 | else: 189 | if READ_CACHE_ONLY: 190 | continue 191 | query_str = urlopen(COMMENT_URL).read().decode('utf-8') 192 | with open(SAVE_FILE, 'w', encoding='utf-8') as fout: 193 | fout.write(query_str) 194 | 195 | query_json = json.loads(query_str) 196 | items = query_json['items'] 197 | if ALL_COMMENTS: 198 | #Look for popular comments and add them 199 | for j in range(len(items)): 200 | item = items[j] 201 | snippet = item['snippet']['topLevelComment']['snippet'] 202 | num_likes = int(snippet['likeCount']) 203 | if num_likes < 50 or (j >= 3 and num_likes < 200): 204 | continue 205 | comment = snippet['textDisplay'] 206 | if len(comment) < ALL_COMMENTS_LEN_MIN or len(comment) > ALL_COMMENTS_LEN_MAX: 207 | continue 208 | good_comments.append(comment) 209 | else: 210 | #Look for Justin Y comments 211 | justin_y_comment = '' 212 | for item in items: 213 | snippet = item['snippet']['topLevelComment']['snippet'] 214 | if JUSTIN_Y_CHANNEL in snippet['authorChannelUrl']: 215 | justin_y_comment = snippet['textDisplay'] 216 | break 217 | 218 | #Return result if found 219 | if len(justin_y_comment) > 0: 220 | return [justin_y_comment] 221 | 222 | #Otherwise search the next page 223 | if 'nextPageToken' in query_json: 224 | cur_page = query_json['nextPageToken'] 225 | else: 226 | break 227 | 228 | #Return whatever was found 229 | return good_comments 230 | 231 | for playlist in ALL_PLAYLISTS: 232 | print("==========================================================") 233 | print(" Starting playlist " + playlist) 234 | print("==========================================================") 235 | print("") 236 | scrape_playlist(playlist) 237 |
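#--- Illustrative sketch: the all_comments.txt record format ---
#Minimal, self-contained example (not called by the scraper above). It shows the
#one-record-per-line format that scrape_playlist() appends to scraped/all_comments.txt,
#i.e. "videoId~title~comment", and the matching split that Parse.py performs on each
#line. The sample values in the usage comment are hypothetical placeholders, not real
#scraped data.

def format_record(vid, title, comment):
    #Strip characters that would break the single-line, '~'-separated format,
    #mirroring the replace() calls in scrape_playlist() above
    title = title.replace('\n', ' . ').replace('\r', ' . ').replace('~', ' ')
    comment = comment.replace('\n', ' . ').replace('\r', ' . ').replace('~', ' ')
    return vid + '~' + title + '~' + comment + '\n'

def parse_record(line):
    #Inverse of format_record, matching the split used in Parse.py
    vid, title, comment = line[:-1].split('~')
    return vid, title, comment

#Usage (hypothetical values):
#  format_record('videoid12345', 'Some Title', 'Some comment')
#    -> 'videoid12345~Some Title~Some comment\n'
#  parse_record('videoid12345~Some Title~Some comment\n')
#    -> ('videoid12345', 'Some Title', 'Some comment')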
-------------------------------------------------------------------------------- /Train.py: -------------------------------------------------------------------------------- 1 | import os, sys 2 | import numpy as np 3 | from matplotlib import pyplot as plt 4 | import util 5 | 6 | SEQ_SIZE = 8 7 | CTXT_SIZE = 200 8 | EMBEDDING_SIZE = 200 9 | USE_LSTM = True 10 | USE_OUT_SEQ = False 11 | CONTINUE_TRAIN = False 12 | NUM_EPOCHS = 100 13 | NUM_MINI_EPOCHS = 1 14 | BATCH_SIZE = 200 15 | LR = 0.001 16 | DO_RATE = 0.05 17 | BN = 0.99 18 | SAVE_DIR = 'trained_all/' 19 | PARSED_DIR = 'parsed_all/' 20 | 21 | #Create directory to save model 22 | if not os.path.exists(SAVE_DIR): 23 | os.makedirs(SAVE_DIR) 24 | 25 | #Load comment dictionary 26 | comment_words, comment_word_to_ix = util.load_comment_dict(PARSED_DIR) 27 | comment_dict_size = len(comment_words) 28 | 29 | #Load training samples 30 | title_ix_samples, title_unique_samples, past_samples, pred_samples = util.create_training_samples(PARSED_DIR, SEQ_SIZE, USE_OUT_SEQ) 31 | num_samples = past_samples.shape[0] 32 | 33 | #Load Keras and Theano 34 | print("Loading Keras...") 35 | import os, math 36 | #os.environ['THEANORC'] = "./gpu.theanorc" 37 | os.environ['KERAS_BACKEND'] = "tensorflow" 38 | import tensorflow as tf 39 | print("Tensorflow Version: " + tf.__version__) 40 | import keras 41 | print("Keras Version: " + keras.__version__) 42 | from keras.layers import Input, Dense, Activation, Dropout, Flatten, Reshape, RepeatVector, TimeDistributed, LeakyReLU, CuDNNGRU, concatenate 43 | from keras.layers.convolutional import Conv2D, Conv2DTranspose, UpSampling2D, Convolution1D 44 | from keras.layers.embeddings import Embedding 45 | from keras.layers.local import LocallyConnected2D 46 | from keras.layers.pooling import MaxPooling2D 47 | from keras.layers.noise import GaussianNoise 48 | from keras.layers.normalization import BatchNormalization 49 | from keras.layers.recurrent import LSTM, SimpleRNN, GRU 50 | from keras.models import Model, Sequential, load_model, model_from_json 51 | from keras.optimizers import Adam, RMSprop, SGD 52 | from keras.preprocessing.image import ImageDataGenerator 53 | from keras.regularizers import l1 54 | from keras.utils import plot_model, to_categorical 55 | from keras import backend as K 56 | K.set_image_data_format('channels_first') 57 | 58 | #Fix bug with sparse_categorical_accuracy 59 | from tensorflow.python.ops import math_ops 60 | from tensorflow.python.framework import ops 61 | from tensorflow.python.keras import backend as K 62 | from tensorflow.python.ops import array_ops 63 | def new_sparse_categorical_accuracy(y_true, y_pred): 64 | y_pred_rank = ops.convert_to_tensor(y_pred).get_shape().ndims 65 | y_true_rank = ops.convert_to_tensor(y_true).get_shape().ndims 66 | # If the shape of y_true is (num_samples, 1), squeeze to (num_samples,) 67 | if (y_true_rank is not None) and (y_pred_rank is not None) and (len(K.int_shape(y_true)) == len(K.int_shape(y_pred))): 68 | y_true = array_ops.squeeze(y_true, [-1]) 69 | y_pred = math_ops.argmax(y_pred, axis=-1) 70 | # If the predicted output and actual output types don't match, force cast them 71 | # to match. 
72 | if K.dtype(y_pred) != K.dtype(y_true): 73 | y_pred = math_ops.cast(y_pred, K.dtype(y_true)) 74 | return math_ops.cast(math_ops.equal(y_true, y_pred), K.floatx()) 75 | 76 | #Build the training models 77 | if CONTINUE_TRAIN: 78 | print("Loading Model...") 79 | model = load_model(SAVE_DIR + 'Model.h5') 80 | else: 81 | print("Building Model...") 82 | ctxt_in = Input(shape=title_unique_samples.shape[1:]) 83 | past_in = Input(shape=past_samples.shape[1:]) 84 | 85 | if USE_LSTM: 86 | ctxt_dense = Dense(CTXT_SIZE)(ctxt_in) 87 | ctxt_dense = LeakyReLU(0.2)(ctxt_dense) 88 | ctxt_dense = RepeatVector(SEQ_SIZE)(ctxt_dense) 89 | 90 | past_dense = Embedding(comment_dict_size, EMBEDDING_SIZE, input_length=SEQ_SIZE)(past_in) 91 | x = concatenate([ctxt_dense, past_dense]) 92 | x = Dropout(DO_RATE)(x) 93 | 94 | x = CuDNNGRU(200, return_sequences=USE_OUT_SEQ)(x) 95 | if USE_OUT_SEQ: 96 | x = TimeDistributed(BatchNormalization(momentum=BN))(x) 97 | x = TimeDistributed(Dense(comment_dict_size, activation='softmax'))(x) 98 | else: 99 | x = BatchNormalization(momentum=BN)(x) 100 | x = Dense(comment_dict_size, activation='softmax')(x) 101 | else: 102 | ctxt_dense = Dense(CTXT_SIZE)(ctxt_in) 103 | ctxt_dense = LeakyReLU(0.2)(ctxt_dense) 104 | past_dense = Embedding(comment_dict_size, EMBEDDING_SIZE, input_length=SEQ_SIZE)(past_in) 105 | past_dense = Flatten(data_format = 'channels_last')(past_dense) 106 | x = concatenate([ctxt_dense, past_dense]) 107 | 108 | x = Dense(800)(x) 109 | x = LeakyReLU(0.2)(x) 110 | if DO_RATE > 0.0: 111 | x = Dropout(DO_RATE)(x) 112 | #x = BatchNormalization(momentum=BN)(x) 113 | 114 | x = Dense(400)(x) 115 | x = LeakyReLU(0.2)(x) 116 | if DO_RATE > 0.0: 117 | x = Dropout(DO_RATE)(x) 118 | #x = BatchNormalization(momentum=BN)(x) 119 | 120 | x = Dense(comment_dict_size, activation='softmax')(x) 121 | 122 | if USE_OUT_SEQ: 123 | metric = new_sparse_categorical_accuracy 124 | else: 125 | metric = 'sparse_categorical_accuracy' 126 | 127 | model = Model(inputs=[ctxt_in, past_in], outputs=[x]) 128 | model.compile(optimizer=Adam(lr=LR), loss='sparse_categorical_crossentropy', metrics=[metric]) 129 | print(model.summary()) 130 | 131 | #plot_model(model, to_file=SAVE_DIR + 'model.png', show_shapes=True) 132 | 133 | #Utilites 134 | def plotScores(scores, test_scores, fname, on_top=True): 135 | plt.clf() 136 | ax = plt.gca() 137 | ax.yaxis.tick_right() 138 | ax.yaxis.set_ticks_position('both') 139 | ax.yaxis.grid(True) 140 | plt.plot(scores) 141 | plt.plot(test_scores) 142 | plt.xlabel('Epoch') 143 | plt.tight_layout() 144 | loc = ('upper right' if on_top else 'lower right') 145 | plt.draw() 146 | plt.savefig(fname) 147 | 148 | #Train model 149 | print("Training...") 150 | train_loss = [] 151 | train_acc = [] 152 | test_loss = [] 153 | test_acc = [] 154 | i_train = np.arange(num_samples) 155 | batches_per_epoch = num_samples // BATCH_SIZE 156 | for epoch in range(NUM_EPOCHS): 157 | np.random.shuffle(i_train) 158 | for j in range(NUM_MINI_EPOCHS): 159 | loss = 0.0 160 | acc = 0.0 161 | num = 0.0 162 | start_i = batches_per_epoch * j // NUM_MINI_EPOCHS 163 | end_i = batches_per_epoch * (j + 1) // NUM_MINI_EPOCHS 164 | for i in range(start_i, end_i): 165 | i_batch = i_train[i*BATCH_SIZE:(i + 1)*BATCH_SIZE] 166 | title_batch = title_unique_samples[title_ix_samples[i_batch]] 167 | past_batch = past_samples[i_batch] 168 | pred_batch = pred_samples[i_batch] 169 | 170 | batch_loss, batch_acc = model.train_on_batch([title_batch, past_batch], [pred_batch]) 171 | loss += batch_loss 172 | acc += 
batch_acc 173 | num += 1.0 174 | 175 | if i % 5 == 0: 176 | progress = ((i - start_i) * 100) // (end_i - start_i) 177 | sys.stdout.write( 178 | str(progress) + "%" + 179 | " Loss:" + str(loss / num) + 180 | " Acc:" + str(acc / num) + " ") 181 | sys.stdout.write('\r') 182 | sys.stdout.flush() 183 | sys.stdout.write('\n') 184 | loss /= num 185 | acc /= num 186 | 187 | train_loss.append(loss) 188 | train_acc.append(acc) 189 | 190 | plotScores(train_loss, test_loss, SAVE_DIR + 'Loss.png', True) 191 | plotScores(train_acc, test_acc, SAVE_DIR + 'Acc.png', False) 192 | 193 | if loss == min(train_loss): 194 | model.save(SAVE_DIR + 'Model.h5') 195 | print("Saved") 196 | 197 | print("==== EPOCH FINISHED ====") 198 | 199 | print("Done") 200 | -------------------------------------------------------------------------------- /YTCommenter.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HackerPoet/YouTubeCommenter/6df02550abd15a0559bcc8fdce668b3a4aa32133/YTCommenter.zip -------------------------------------------------------------------------------- /util.py: -------------------------------------------------------------------------------- 1 | import os, re 2 | import numpy as np 3 | 4 | def clean_text(text): 5 | for c in [u"\u0060", u"\u00B4", u"\u2018", u"\u2019"]: 6 | text = text.replace(c, "'") 7 | for c in [u"\u00C0", u"\u00C1", u"\u00C2", u"\u00C3", u"\u00C4", u"\u00C5", 8 | u"\u00E0", u"\u00E1", u"\u00E2", u"\u00E3", u"\u00E4", u"\u00E5"]: 9 | text = text.replace(c, "a") 10 | for c in [u"\u00C8", u"\u00C9", u"\u00CA", u"\u00CB", 11 | u"\u00E8", u"\u00E9", u"\u00EA", u"\u00EB"]: 12 | text = text.replace(c, "e") 13 | for c in [u"\u00CC", u"\u00CD", u"\u00CE", u"\u00CF", 14 | u"\u00EC", u"\u00ED", u"\u00EE", u"\u00EF"]: 15 | text = text.replace(c, "i") 16 | for c in [u"\u00D2", u"\u00D3", u"\u00D4", u"\u00D5", u"\u00D6", 17 | u"\u00F2", u"\u00F3", u"\u00F4", u"\u00F5", u"\u00F6"]: 18 | text = text.replace(c, "o") 19 | for c in [u"\u00DA", u"\u00DB", u"\u00DC", u"\u00DD", 20 | u"\u00FA", u"\u00FB", u"\u00FC", u"\u00FD"]: 21 | text = text.replace(c, "u") 22 | text = text.replace(u"\u00D1", "n").replace(u"\u00F1", "n") 23 | text = text.encode('utf-8').decode() 24 | if 'http' in text: 25 | return '' 26 | text = re.sub(r'[^0-9a-z .,?!\'/:;<>#\-\$%&]', ' ', text.lower()) 27 | text = ' ' + text + ' ' 28 | text = text.replace('&', ' and ') 29 | text = re.sub(r'\.( +\.)+', '..', text) 30 | text = re.sub(r'\.\.+', ' ^ ', text) 31 | text = re.sub(r',+', ',', text) 32 | text = re.sub(r'\-+', '-', text) 33 | text = re.sub(r'\?+', ' ? ', text) 34 | text = re.sub(r'\!+', ' ! 
', text) 35 | text = re.sub(r'\'+', "'", text) 36 | text = re.sub(r';+', ':', text) 37 | text = re.sub(r'/+', ' / ', text) 38 | text = re.sub(r'<+', ' < ', text) 39 | text = re.sub(r'>+', ' > ', text) 40 | text = text.replace('%', '% ') 41 | text = text.replace(' - ', ' : ') 42 | text = text.replace(' -', " - ") 43 | text = text.replace('- ', " - ") 44 | text = text.replace(" '", " ") 45 | text = text.replace("' ", " ") 46 | for c in ".,:": 47 | text = text.replace(c + ' ', ' ' + c + ' ') 48 | #text = re.sub(r' \d\d?:\d\d ', ' 0:00 ', text) 49 | text = re.sub(r' +', ' ', text.strip(' ')) 50 | text = text.replace('^', '...') 51 | return text 52 | 53 | def load_dict(fname, word_list, word_dict): 54 | with open(fname, 'r') as fin: 55 | for line in fin: 56 | line = line[:-1] 57 | assert(line not in word_dict) 58 | word_dict[line] = len(word_list) 59 | word_list.append(line) 60 | assert(word_list[0] == '') 61 | 62 | def load_title_dict(PARSED_DIR): 63 | title_words = [] 64 | title_word_to_ix = {} 65 | load_dict(PARSED_DIR + 'title_dict.txt', title_words, title_word_to_ix) 66 | print("Loaded " + str(len(title_words)) + " title word dictionary.") 67 | return title_words, title_word_to_ix 68 | 69 | def load_comment_dict(PARSED_DIR): 70 | comment_words = [] 71 | comment_word_to_ix = {} 72 | load_dict(PARSED_DIR + 'comment_dict.txt', comment_words, comment_word_to_ix) 73 | print("Loaded " + str(len(comment_words)) + " comment word dictionary.") 74 | return comment_words, comment_word_to_ix 75 | 76 | def load_title_sentences(PARSED_DIR): 77 | #Load the raw data 78 | print("Loading Titles...") 79 | titles = np.load(PARSED_DIR + 'titles.npy') 80 | title_lens = np.load(PARSED_DIR + 'title_lens.npy') 81 | print("Loaded " + str(len(title_lens)) + " titles.") 82 | 83 | #Extract all title sentences 84 | title_ix = 0 85 | title_sentences = [] 86 | for title_len in title_lens: 87 | title_sentences.append(titles[title_ix:title_ix + title_len]) 88 | title_ix += title_len 89 | return title_sentences 90 | 91 | def load_comment_sentences(PARSED_DIR): 92 | #Load the raw data 93 | print("Loading Comments...") 94 | comments = np.load(PARSED_DIR + 'comments.npy') 95 | comment_lens = np.load(PARSED_DIR + 'comment_lens.npy') 96 | print("Loaded " + str(len(comment_lens)) + " comments.") 97 | 98 | #Extract all comment sentences 99 | comment_ix = 0 100 | comment_sentences = [] 101 | for comment_len in comment_lens: 102 | comment_sentences.append(comments[comment_ix:comment_ix + comment_len]) 103 | comment_ix += comment_len 104 | return comment_sentences 105 | 106 | def bag_of_words(title_ixs, title_dict_size): 107 | title_sample = np.zeros((title_dict_size,), dtype=np.uint8) 108 | for ix in title_ixs: 109 | title_sample[ix] = 1 110 | return title_sample 111 | 112 | def create_training_samples(PARSED_DIR, seq_size, out_seq=False): 113 | #Load the raw data 114 | print("Loading Titles...") 115 | titles = np.load(PARSED_DIR + 'titles.npy') 116 | title_lens = np.load(PARSED_DIR + 'title_lens.npy') 117 | print("Loaded " + str(len(title_lens)) + " titles.") 118 | 119 | #Load the raw data 120 | print("Loading Comments...") 121 | comments = np.load(PARSED_DIR + 'comments.npy') 122 | comment_lens = np.load(PARSED_DIR + 'comment_lens.npy') 123 | print("Loaded " + str(len(comment_lens)) + " comments.") 124 | 125 | #Convert to training samples 126 | print("Creating Training Samples...") 127 | title_ix = 0 128 | comment_ix = 0 129 | title_ix_samples = [] 130 | title_unique_samples = [] 131 | past_samples = [] 132 | pred_samples = [] 133 
| title_dict_size = np.amax(titles) + 1 134 | for i in range(title_lens.shape[0]): 135 | title_len = title_lens[i] 136 | title_sample = np.zeros((title_dict_size,), dtype=np.uint8) 137 | for j in range(title_len): 138 | word = titles[title_ix] 139 | title_sample[word] = 1 140 | title_ix += 1 141 | title_unique_samples.append(title_sample) 142 | 143 | comment_len = comment_lens[i] 144 | past_sample = np.zeros((seq_size,), dtype=np.int32) 145 | end_j = comment_len + 1 146 | if out_seq: 147 | end_j = max(end_j, seq_size) 148 | for j in range(end_j): 149 | if not out_seq or j >= seq_size - 1: 150 | title_ix_samples.append(len(title_unique_samples) - 1) 151 | past_samples.append(past_sample) 152 | 153 | if j >= comment_len: 154 | next_word = 0 155 | else: 156 | next_word = comments[comment_ix] 157 | comment_ix += 1 158 | 159 | past_sample = np.roll(past_sample, -1) 160 | past_sample[-1] = next_word 161 | if not out_seq: 162 | pred_samples.append(next_word) 163 | elif j >= seq_size - 1: 164 | pred_samples.append(past_sample) 165 | 166 | num_samples = len(past_samples) 167 | assert(title_ix == len(titles)) 168 | assert(comment_ix == len(comments)) 169 | assert(num_samples == len(pred_samples)) 170 | assert(num_samples == len(title_ix_samples)) 171 | title_ix_samples = np.array(title_ix_samples, dtype=np.int32) 172 | title_unique_samples = np.array(title_unique_samples, dtype=np.uint8) 173 | past_samples = np.array(past_samples, dtype=np.int32) 174 | pred_samples = np.array(pred_samples, dtype=np.int32) 175 | pred_samples = np.expand_dims(pred_samples, axis=-1) 176 | print("Created " + str(num_samples) + " samples.") 177 | 178 | return title_ix_samples, title_unique_samples, past_samples, pred_samples 179 | --------------------------------------------------------------------------------
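For reference, a minimal self-contained sketch of the sliding-window scheme used by create_training_samples() in util.py: each comment is unrolled into (past-word window, next word) pairs, with index 0 acting as the end-of-comment marker. The helper name comment_to_pairs, the toy word indices, and the window size below are made-up example values for illustration only.

import numpy as np

def comment_to_pairs(comment_ixs, seq_size):
    #Rolling window of the last seq_size word indices, initially all zeros
    past = np.zeros((seq_size,), dtype=np.int32)
    pairs = []
    #One extra step so the final pair predicts the end-of-comment marker (0)
    for j in range(len(comment_ixs) + 1):
        next_word = comment_ixs[j] if j < len(comment_ixs) else 0
        pairs.append((past.copy(), next_word))
        past = np.roll(past, -1)
        past[-1] = next_word
    return pairs

#Toy comment made of word indices [5, 9, 2] with a window of 4:
for window, target in comment_to_pairs([5, 9, 2], seq_size=4):
    print(window, '->', target)
#Prints:
# [0 0 0 0] -> 5
# [0 0 0 5] -> 9
# [0 0 5 9] -> 2
# [0 5 9 2] -> 0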