├── README.md ├── downloader_class.py ├── examples.csv ├── get_subs.py ├── requirements.txt └── utils.py /README.md: -------------------------------------------------------------------------------- 1 | # Description 2 | 3 | YT_subtitles is a tool for building a dataset from youtube subtitles. It extracts the (non machine-generated) subtitles 4 | from all the videos returned by a list of search terms. 5 | 6 | The resulting files contain a string of text per language, per minute of subtitles, with the name of the language appended 7 | as a header. The dataset is designed to improve the multilingual performance of language models trained on it. 8 | If only a single language is available, the output is just a text version of the subtitles, with no metadata. 9 | 10 | Download a pre-processed dataset scraped using the search terms in `examples.csv` [here](https://eaidata.bmk.sh/data/yt_subs.jsonl.zst) 11 | 12 | The dataset is a [jsonl](https://github.com/wbolster/jsonlines) file compressed with [zstd](https://github.com/facebook/zstd). You can easily read the files using [lm-dataformat](https://github.com/leogao2/lm_dataformat). 13 | 14 | # Setup 15 | 16 | `pip install -r requirements.txt` 17 | 18 | If you're running this on an Ubuntu server, I've found I need to install the following packages to get requests-html working 19 | 20 | ``` 21 | sudo apt-get install gconf-service libasound2 libatk1.0-0 libatk-bridge2.0-0 libc6 libcairo2 libcups2 libdbus-1-3 libexpat1 libfontconfig1 libgcc1 libgconf-2-4 libgdk-pixbuf2.0-0 libglib2.0-0 libgtk-3-0 libnspr4 libpango-1.0-0 libpangocairo-1.0-0 libstdc++6 libx11-6 libx11-xcb1 libxcb1 libxcomposite1 libxcursor1 libxdamage1 libxext6 libxfixes3 libxi6 libxrandr2 libxrender1 libxss1 libxtst6 ca-certificates fonts-liberation libappindicator1 libnss3 lsb-release xdg-utils wget 22 | ``` 23 | 24 | # Usage 25 | 26 | ``` 27 | CLI for YT_subtitles - extracts Youtube subtitles from a list of search terms 28 | 29 | positional arguments: 30 | search_terms A comma separated list of search terms, alternatively, 31 | pass path to a csv with the -c argument 32 | 33 | optional arguments: 34 | -h, --help show this help message and exit 35 | --out_path OUT_PATH Output location for final .txt files (default "output") 36 | -s, --save_links whether to save links to a .csv file (default True) 37 | -c, --csv if true, positional arg should be a path to a .csv file 38 | containing search terms (default False) 39 | --scroll SCROLL how far to scroll down in the youtube search (default 1000) 40 | ``` 41 | 42 | e.g 43 | 44 | `python get_subs.py -c examples.csv` 45 | 46 | or 47 | 48 | `python get_subs.py "movie review,GPT-3,true crime documentary"` 49 | 50 | ## TODO: 51 | - [ ] output as [lm dataformat](https://github.com/leogao2/lm_dataformat) 52 | -------------------------------------------------------------------------------- /downloader_class.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import math 3 | import re 4 | import random 5 | import os 6 | import traceback 7 | from bs4 import BeautifulSoup as bs 8 | from requests_html import HTMLSession, MaxRetries 9 | from tqdm import tqdm 10 | from youtube_transcript_api import YouTubeTranscriptApi 11 | from utils import lang_code_to_name 12 | 13 | 14 | class Subtitles_downloader(): 15 | 16 | def __init__(self, out_path="out", langs="all", save_links=True, scrolldown=10000): 17 | print('loading HTML session...') 18 | self.session = HTMLSession() 19 | self.sub_filter_code = "&sp=EgIoAQ%253D%253D" 20 | self.out_path = out_path 21 | self.langs = langs 22 | self.save_links = save_links 23 | self.scrolldown = scrolldown 24 | self.video_ids = [] 25 | 26 | def search(self, queries): 27 | """ 28 | returns a list of video ids for videos that match the search term, and have subtitles. 29 | :param queries: list of search queries 30 | :param scrolldown: number of times to page down 31 | """ 32 | if isinstance(queries, list): 33 | for q in queries: 34 | try: 35 | if self.save_links: 36 | # check if this search has already been made, if so, skip 37 | csv_out_path = 'links/{}_search_results.csv'.format(q) 38 | if os.path.isfile(csv_out_path): 39 | #TODO: read video ids from csv & check txt files instead of just assuming each id has been downloaded 40 | continue 41 | 42 | search_url = "https://www.youtube.com/results?search_query={}{}".format(q, self.sub_filter_code) 43 | print('Fetching search results for "{}"...'.format(q)) 44 | response = self.session.get(search_url) 45 | print('Executing JS...') 46 | try: 47 | response.html.render(scrolldown=self.scrolldown, timeout=30.0) 48 | except MaxRetries as e: 49 | print(e) 50 | continue 51 | 52 | # create bs object to parse HTML 53 | soup = bs(response.html.html, "html.parser") 54 | 55 | # get all video ids from soup 56 | query_ids = [] 57 | count = 0 58 | for count, link in enumerate( 59 | list(set(soup.findAll('a', attrs={'href': re.compile("^/watch\?v=.{8,12}")})))): 60 | query_ids.append(link.get('href').split('v=')[1]) 61 | self.video_ids.extend(query_ids) 62 | 63 | # save links to csv file 64 | if self.save_links: 65 | with open(csv_out_path, "w") as f: 66 | writer = csv.writer(f) 67 | writer.writerows([query_ids]) 68 | 69 | print('{} unique links found for "{}"!'.format(count, q)) 70 | except: 71 | print('Search query {} failed!'.format(q)) 72 | traceback.print_exc() 73 | else: 74 | raise TypeError("search queries must be list") 75 | 76 | def download_subs(self): 77 | for video_id in tqdm(list(set(self.video_ids)), desc="Downloading subtitles..."): 78 | try: 79 | # gets list of available transcript from YTtranscriptAPI 80 | transcript_list = YouTubeTranscriptApi.list_transcripts(video_id) 81 | 82 | # init dictionary to store results and out name for txt file 83 | out = {} 84 | out_name = "{}/{}".format(self.out_path, video_id) 85 | 86 | # count number of languages available 87 | n_langs = 0 88 | for t in transcript_list: 89 | if t.is_generated: 90 | continue 91 | if self.langs != "all": 92 | if t.language_code not in self.langs: 93 | continue 94 | n_langs += 1 95 | 96 | total_minutes = -1 97 | for t in transcript_list: 98 | # filter out generated transcripts and non-specified languages 99 | if t.is_generated: 100 | continue 101 | if self.langs != "all": 102 | if t.language_code not in self.langs: 103 | continue 104 | # fetch the actual transcript 105 | transcript = t.fetch() 106 | 107 | # write every minute of every language in transcript to a results dict 108 | out[t.language_code] = {} 109 | for i in transcript: 110 | end = i["start"] + i["duration"] 111 | minute = int(math.floor(end / 60.0)) 112 | if n_langs > 1: 113 | header_txt = '\n{}: \n'.format(lang_code_to_name(t.language_code)) 114 | else: 115 | header_txt = '' 116 | if minute > total_minutes: 117 | total_minutes = minute 118 | if minute in out[t.language_code]: 119 | out[t.language_code][minute] += '{}\n'.format(i["text"]) 120 | else: 121 | out[t.language_code][minute] = '{}{}\n'.format(header_txt, i["text"]) 122 | 123 | # write every minute of every language in transcript to txt file, shuffling language order 124 | with open("{}.txt".format(out_name), "w") as text_file: 125 | minutes = list(out.values()) 126 | all_minutes = set(x for l in minutes for x in l) 127 | for m in all_minutes: 128 | keys = list(out.keys()) 129 | random.shuffle(keys) 130 | for l in keys: 131 | try: 132 | text_file.write(out[l][m]) 133 | except: 134 | pass 135 | except Exception: 136 | print('Download for {} failed!'.format(video_id)) 137 | traceback.print_exc() 138 | -------------------------------------------------------------------------------- /examples.csv: -------------------------------------------------------------------------------- 1 | MIT lecture,biology lecture,science lecture,TED talk,hbomberguy,computer science,quantum computing,cosmology lecture, 2 | yalecourses,mit opencourseware,bts,asmr,billie eilish,pewdiepie,baby shark,nature documentary,blackpink,old town road, 3 | music,vevo,peliculas completas en español,minecraft,fortnite,ariana grande,alan walker,calma,musica,wwe,cosmology,black holes, 4 | neutron star,david attenborough,cake recipe,pasta recipe,twitch stream,George Hotz,cs50 lecture,computerphile,talks at google,noam chomsky,periodic table, 5 | summer lecture,big bang,spacex,jared diamond,anthropology,bill nye,howstuffworks,discovery,electrical engineering,fortnite memes,fortnite boogiedown, 6 | fortnite funny moments,fortnite gameplay,chuck palahniuk,max boyle,philosophy,howard stern,ash ketchum,biology,Big Bang Theory,sufjan stevens, 7 | nuclear power,data science,soccer,physics,history of physics,calculus,quantum chromodynamics,steroids,uncertainty principle,electromagnetism, 8 | electrons,energy,elon musk,joseph smith,natural selection,spielberg,economics,rare earth minerals,earth science,evolution,geology,faraday, 9 | chemistry,walmart,Story of Science,coffee,energy drinks,german,university of goettingen,france culture,quantum gravity,electroweak theory, 10 | black holes,Stephen Hawking,Cosmos,Freud,culinary arts,knife skills,animation,cs lewis,ethics,tech,marine biology,electrical engineering, 11 | geology undergrad,Raspberry Pi,microeconomics,sea monkeys,zinc,derivative,qatar,cbc,russ anderson,wall street journal,brewing,beer homebrew, 12 | eigenvalues,juventus,formula-1,berlin wall,Jolt Cola,James Madison,Thomas Jefferson,Lewis and Clark,Ferran Adrià,Game Theory, 13 | applied psychology,economic system,political science,first generation,food stamps,number line,quadratic formula,origami,pie chart, 14 | industry buzzword,marx,segregation,zeitgeist,vermeer,florentijn school,black protestant,Freedom Trail,Ada lovelace,Nietzche,Hegel, 15 | linguistics,tedx youth,Jacob Geller,Stanford lecture,school of life,Gilles Deleuze,Effective Altruism,Peter singer,frantz fanon, 16 | Hannah Arendt,Angela Davis,democracy now,Camus,David Graeber,Hito Steyerl,Foucault,simone de beauvoir,Walter Benjamin,Schopenhauer, 17 | bon appetit,maangchii,fermentation,khan academy,makeup tutorial,James Charles,vsauce,neural network,vox,dude perfect,vortex,plato, 18 | archimedes,Sagan,language of the body,criminal justice,gibran kahlil,psychology,geocities,sand,malala,fetus,pro life,scientific american, 19 | Fahrenheit 451,apocalypse now,Technoprogressive,Elvis,Evolution,Einstein,Catastrophes,3d printing,deepak chopra,Ancient Greek,immunology, 20 | Gaia,League of Extraordinary Gentlemen,Crypto,Cryptography,Mission to Mars,Virtual Reality,agriculture,oil,pesticides,sony,saturn,Banks, 21 | elon musk,spacex,apple,solar,iran,CERN,Health,Sam Harris,Metafilter,life extension,Balkanization,Convergence,Scientific Method,Magic, 22 | Sci-Fi,4chan,Nebula,Science,paradigm,Eugenics,Technology,Drugs,Steel,Shield,Electricity,Data Mining,Economics,Enterprise,Securities, 23 | Protective Social Services,Identity,Social Roots of Violence,Loyalty,movie review,GPT-3,true crime documentary,Rick Rubin,red bull music academy, 24 | tonight show,bbc,munchies,talk show,Bernie Sanders,germaine greer,university lecture,book review,kurzgesagt,news,agadmator,documentaries,tech, 25 | pickle recipe,food network,student cooking,anthony bourdain, fusion power, urban exploration, iphone review, car technology, verge, jeff beck, 26 | homemade sushi, ray william johnson, jennamarbles, jacksepticeye,react,card tricks,free solo,hip hop,tesla,comics,gta5,conspiracy,infowars, 27 | cambridge university,nasa,flat earth,richard dawkins,nazca lines,markiplier,statics,dynamics,interview,v for vendetta,incel,augmented reality, 28 | robotics,tumblr,shark week,technology connections,David Bowie,logo design,mentalism,Ai artificial intelligence,why is the sky blue,ancient aliens, 29 | lunar missions, nuclear disasters,space,national geographic videos,Sesame Street,Frank Zappa, ice hockey, Los Angeles Rams, modern art, Heavy Metal, 30 | poetry, science advocacy, science documentary, tv program, math tutorial,Acid Rock,Da Vinci Code,Aeronautics,London, pro wrestling,Dungeons and Dragons, 31 | Christopher Nolan,Fermat,latte art,brexit,entrepeneurship,english vocabulary,anarchism,funny vloggers,beauty products,dramaalert,plastic surgery,parallax effect, 32 | parkour,calculus,logarithms,history,architecture,steam trains,prank,social experiment,kardashians,minimalism,vegan recipes,cinematography, 33 | natural disaster,dota,cod,student cooking,anthony bourdain,Arnold Schwarzenegger,cosplay,boxing,dinosaur extinction,bill gates,Richard Feynman, 34 | David Lynch,Art History,Stephen Hawking,veganism,will it blend,Roman Mythology,medicine,antivaxxers,vlog,parody,sculpture,Ben Shapiro, 35 | Weird Al Yankovic,french cuisine,video game,Time Magazine,NPR,Quartz,the royal family,paleontology,Robin Williams,cinema,obama, 36 | archaeology,how it's made,gene editing,World War II,crispr,Zionism,Reddit,rap battle,patreon,lgbt,Uncharted,csgo,martin luther king, 37 | fibonacci,dj,visual effects,metal music,audiobook,war,greek mythology,ethics,stand up comedy,dresden files,tom hanks,capitalism, 38 | transgender issues,hemingway,frank sinatra,mythbusters,game of thrones,harry potter,speedrun,ps4,color theory,explain like i'm five, 39 | electoral college,poker,self help,global warming,lovecraft,film theory,NYU lecture,rationalism,logic,music theory,free speech, 40 | david foster wallace,louis CK,cancel culture,cynicalbrit,dark matter,pbs documentary,bjarne stroustrup,darwin,jordan peterson, 41 | arduino,reddit AMA,legal advice,self-driving car,ufo,zombie,neil gaiman,snoop dogg,tarantino,shakespeare,fifa,assassin's creed, 42 | clint eastwood,bass,LSD,steve jobs,atheism,crusades,women's rights,sir roger penrose,marcel duchamp,slavoj zizek,jake paul,deepmind,big think,dev summit, 43 | wordpress tutorial,web design,tutorial,travel video,anthropology,photosynthesis,john green,crash course history,crash course chemistry,crash course philosophy,crash course, 44 | internet historian,crime,2008 recession,revolution,fruitarian,deconstructed,stalin,fluid dynamics,phd thesis,how to write,north korea,aerodynamics, 45 | how rockets work,italian food,Terrence McKenna,Uighurs,convention,tibees,let's play,Jamaica,Vaporwave,amazon,deep web,street food,missionaries,carl jung,disease, -------------------------------------------------------------------------------- /get_subs.py: -------------------------------------------------------------------------------- 1 | import traceback, argparse 2 | from multiprocessing import Pool, cpu_count 3 | from itertools import repeat 4 | from downloader_class import Subtitles_downloader 5 | import csv, os 6 | from utils import chunks 7 | 8 | def download_subs_single(queries, out_path="out", save_links=True, scrolldown=1000): 9 | try: 10 | sub_downloader = Subtitles_downloader(out_path=out_path, save_links=save_links, scrolldown=scrolldown) 11 | sub_downloader.search(queries) 12 | sub_downloader.download_subs() 13 | except: 14 | print('Thread failed!') 15 | traceback.print_exc() 16 | 17 | 18 | def download_subs_mp(queries, out_path="out", save_links=True, scrolldown=1000): 19 | queries = chunks(queries, (len(queries) // cpu_count() - 1)) 20 | with Pool(cpu_count() - 1) as p: 21 | p.starmap(download_subs_single, zip(queries, repeat(out_path), repeat(save_links), repeat(scrolldown))) 22 | print('Done!') 23 | 24 | 25 | if __name__ == "__main__": 26 | parser = argparse.ArgumentParser(description='CLI for YT_subtitles - extracts Youtube subtitles from a list of search terms') 27 | parser.add_argument('search_terms', 28 | help='A comma separated list of search terms, alternatively, pass path to a csv with the -c ' 29 | 'argument', 30 | type=str) 31 | parser.add_argument('--out_path', help='Output location for final .txt files', type=str, required=False, 32 | default='output') 33 | parser.add_argument('-s', '--save_links', help='whether to save links to a .csv file', action="store_false") 34 | parser.add_argument('-c', '--csv', 35 | help='if true, positional arg should be a path to a .csv file containing search terms', 36 | action="store_true") 37 | parser.add_argument('--scroll', help='how far to scroll down in the youtube search', type=int, required=False, 38 | default=1000) 39 | args = parser.parse_args() 40 | os.makedirs(args.out_path, exist_ok=True) 41 | if args.save_links: 42 | os.makedirs("links", exist_ok=True) 43 | if not args.csv: 44 | search_terms = args.search_terms.split(',') 45 | else: 46 | search_terms = [] 47 | with open(args.search_terms, newline='') as inputfile: 48 | for row in csv.reader(inputfile): 49 | search_terms.append(row) 50 | # flattens list of list into unique list of items that aren't empty 51 | search_terms = list(set([item for sublist in search_terms for item in sublist if item])) 52 | print('Searching Youtube for: \n {}'.format(search_terms)) 53 | download_subs_mp(search_terms, args.out_path, args.save_links, args.scroll) 54 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | bs4 2 | requests_html 3 | youtube_transcript_api 4 | pycountry -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import pycountry 2 | 3 | all_langs = pycountry.languages 4 | 5 | 6 | def chunks(l, n): 7 | n = max(1, n) 8 | return (l[i:i + n] for i in range(0, len(l), n)) 9 | 10 | 11 | def lang_code_to_name(lang_code): 12 | lang = all_langs.get(alpha_2="{}".format(lang_code)) 13 | if lang is None: 14 | lang = all_langs.get(alpha_3="{}".format(lang_code)) 15 | if lang is None: 16 | # sometimes lang codes have a dash that then specifies regional dialect - just take the first part 17 | lang = all_langs.get(alpha_2="{}".format(lang_code.split("-")[0])) 18 | if lang is None: 19 | lang = all_langs.get(alpha_2="{}".format(lang_code.split("-")[0])) 20 | if lang is None: 21 | if "zh" in lang_code: 22 | # this should cover lots of regional zh langs (sorry to lump them all into one!) 23 | lang = all_langs.get(alpha_2="{}".format("zh")) 24 | if lang is None: 25 | print('No language name found for {}, returning language code'.format(lang_code)) 26 | return lang_code 27 | else: 28 | return lang_code 29 | return lang.name 30 | --------------------------------------------------------------------------------