├── README.md
├── downloader_class.py
├── examples.csv
├── get_subs.py
├── requirements.txt
└── utils.py


/README.md:
--------------------------------------------------------------------------------
 1 | # Description
 2 | 
 3 | YT_subtitles is a tool for building a dataset from youtube subtitles. It extracts the (non machine-generated) subtitles
 4 | from all the videos returned by a list of search terms.
 5 | 
 6 | The resulting files contain a string of text per language, per minute of subtitles, with the name of the language appended
 7 | as a header. The dataset is designed to improve the multilingual performance of language models trained on it.
 8 | If only a single language is available, the output is just a text version of the subtitles, with no metadata.
 9 | 
10 | Download a pre-processed dataset scraped using the search terms in `examples.csv` [here](https://eaidata.bmk.sh/data/yt_subs.jsonl.zst)
11 | 
12 | The dataset is a [jsonl](https://github.com/wbolster/jsonlines) file compressed with [zstd](https://github.com/facebook/zstd). You can easily read the files using [lm-dataformat](https://github.com/leogao2/lm_dataformat).
13 | 
14 | # Setup
15 | 
16 | `pip install -r requirements.txt`
17 | 
18 | If you're running this on an Ubuntu server, I've found I need to install the following packages to get requests-html working
19 | 
20 | ```
21 | sudo apt-get install gconf-service libasound2 libatk1.0-0 libatk-bridge2.0-0 libc6 libcairo2 libcups2 libdbus-1-3 libexpat1 libfontconfig1 libgcc1 libgconf-2-4 libgdk-pixbuf2.0-0 libglib2.0-0 libgtk-3-0 libnspr4 libpango-1.0-0 libpangocairo-1.0-0 libstdc++6 libx11-6 libx11-xcb1 libxcb1 libxcomposite1 libxcursor1 libxdamage1 libxext6 libxfixes3 libxi6 libxrandr2 libxrender1 libxss1 libxtst6 ca-certificates fonts-liberation libappindicator1 libnss3 lsb-release xdg-utils wget
22 | ```
23 | 
24 | # Usage
25 | 
26 | ```
27 | CLI for YT_subtitles - extracts Youtube subtitles from a list of search terms
28 | 
29 | positional arguments:
30 |   search_terms         A comma separated list of search terms, alternatively,
31 |                        pass path to a csv with the -c argument
32 | 
33 | optional arguments:
34 |   -h, --help           show this help message and exit
35 |   --out_path OUT_PATH  Output location for final .txt files (default "output")
36 |   -s, --save_links     whether to save links to a .csv file (default True)
37 |   -c, --csv            if true, positional arg should be a path to a .csv file
38 |                        containing search terms (default False)
39 |   --scroll SCROLL      how far to scroll down in the youtube search (default 1000)
40 | ```
41 | 
42 | e.g
43 | 
44 | `python get_subs.py -c examples.csv`
45 | 
46 | or
47 | 
48 | `python get_subs.py "movie review,GPT-3,true crime documentary"`
49 | 
50 | ## TODO: 
51 | - [ ] output as [lm dataformat](https://github.com/leogao2/lm_dataformat)
52 | 


--------------------------------------------------------------------------------
/downloader_class.py:
--------------------------------------------------------------------------------
  1 | import csv
  2 | import math
  3 | import re
  4 | import random
  5 | import os
  6 | import traceback
  7 | from bs4 import BeautifulSoup as bs
  8 | from requests_html import HTMLSession, MaxRetries
  9 | from tqdm import tqdm
 10 | from youtube_transcript_api import YouTubeTranscriptApi
 11 | from utils import lang_code_to_name
 12 | 
 13 | 
 14 | class Subtitles_downloader():
 15 | 
 16 |     def __init__(self, out_path="out", langs="all", save_links=True, scrolldown=10000):
 17 |         print('loading HTML session...')
 18 |         self.session = HTMLSession()
 19 |         self.sub_filter_code = "&sp=EgIoAQ%253D%253D"
 20 |         self.out_path = out_path
 21 |         self.langs = langs
 22 |         self.save_links = save_links
 23 |         self.scrolldown = scrolldown
 24 |         self.video_ids = []
 25 | 
 26 |     def search(self, queries):
 27 |         """
 28 |         returns a list of video ids for videos that match the search term, and have subtitles.
 29 |         :param queries: list of search queries
 30 |         :param scrolldown: number of times to page down
 31 |         """
 32 |         if isinstance(queries, list):
 33 |             for q in queries:
 34 |                 try:
 35 |                     if self.save_links:
 36 |                         # check if this search has already been made, if so, skip
 37 |                         csv_out_path = 'links/{}_search_results.csv'.format(q)
 38 |                         if os.path.isfile(csv_out_path):
 39 |                             #TODO: read video ids from csv & check txt files instead of just assuming each id has been downloaded
 40 |                             continue
 41 | 
 42 |                     search_url = "https://www.youtube.com/results?search_query={}{}".format(q, self.sub_filter_code)
 43 |                     print('Fetching search results for "{}"...'.format(q))
 44 |                     response = self.session.get(search_url)
 45 |                     print('Executing JS...')
 46 |                     try:
 47 |                         response.html.render(scrolldown=self.scrolldown, timeout=30.0)
 48 |                     except MaxRetries as e:
 49 |                         print(e)
 50 |                         continue
 51 | 
 52 |                     # create bs object to parse HTML
 53 |                     soup = bs(response.html.html, "html.parser")
 54 | 
 55 |                     # get all video ids from soup
 56 |                     query_ids = []
 57 |                     count = 0
 58 |                     for count, link in enumerate(
 59 |                             list(set(soup.findAll('a', attrs={'href': re.compile("^/watch\?v=.{8,12}")})))):
 60 |                         query_ids.append(link.get('href').split('v=')[1])
 61 |                     self.video_ids.extend(query_ids)
 62 | 
 63 |                     # save links to csv file
 64 |                     if self.save_links:
 65 |                         with open(csv_out_path, "w") as f:
 66 |                             writer = csv.writer(f)
 67 |                             writer.writerows([query_ids])
 68 | 
 69 |                     print('{} unique links found for "{}"!'.format(count, q))
 70 |                 except:
 71 |                     print('Search query {} failed!'.format(q))
 72 |                     traceback.print_exc()
 73 |         else:
 74 |             raise TypeError("search queries must be list")
 75 | 
 76 |     def download_subs(self):
 77 |         for video_id in tqdm(list(set(self.video_ids)), desc="Downloading subtitles..."):
 78 |             try:
 79 |                 # gets list of available transcript from YTtranscriptAPI
 80 |                 transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
 81 | 
 82 |                 # init dictionary to store results and out name for txt file
 83 |                 out = {}
 84 |                 out_name = "{}/{}".format(self.out_path, video_id)
 85 | 
 86 |                 # count number of languages available
 87 |                 n_langs = 0
 88 |                 for t in transcript_list:
 89 |                     if t.is_generated:
 90 |                         continue
 91 |                     if self.langs != "all":
 92 |                         if t.language_code not in self.langs:
 93 |                             continue
 94 |                     n_langs += 1
 95 | 
 96 |                 total_minutes = -1
 97 |                 for t in transcript_list:
 98 |                     # filter out generated transcripts and non-specified languages
 99 |                     if t.is_generated:
100 |                         continue
101 |                     if self.langs != "all":
102 |                         if t.language_code not in self.langs:
103 |                             continue
104 |                     # fetch the actual transcript
105 |                     transcript = t.fetch()
106 | 
107 |                     # write every minute of every language in transcript to a results dict
108 |                     out[t.language_code] = {}
109 |                     for i in transcript:
110 |                         end = i["start"] + i["duration"]
111 |                         minute = int(math.floor(end / 60.0))
112 |                         if n_langs > 1:
113 |                             header_txt = '\n{}: \n'.format(lang_code_to_name(t.language_code))
114 |                         else:
115 |                             header_txt = ''
116 |                         if minute > total_minutes:
117 |                             total_minutes = minute
118 |                         if minute in out[t.language_code]:
119 |                             out[t.language_code][minute] += '{}\n'.format(i["text"])
120 |                         else:
121 |                             out[t.language_code][minute] = '{}{}\n'.format(header_txt, i["text"])
122 | 
123 |                 # write every minute of every language in transcript to txt file, shuffling language order
124 |                 with open("{}.txt".format(out_name), "w") as text_file:
125 |                     minutes = list(out.values())
126 |                     all_minutes = set(x for l in minutes for x in l)
127 |                     for m in all_minutes:
128 |                         keys = list(out.keys())
129 |                         random.shuffle(keys)
130 |                         for l in keys:
131 |                             try:
132 |                                 text_file.write(out[l][m])
133 |                             except:
134 |                                 pass
135 |             except Exception:
136 |                 print('Download for {} failed!'.format(video_id))
137 |                 traceback.print_exc()
138 | 


--------------------------------------------------------------------------------
/examples.csv:
--------------------------------------------------------------------------------
 1 | MIT lecture,biology lecture,science lecture,TED talk,hbomberguy,computer science,quantum computing,cosmology lecture,
 2 | yalecourses,mit opencourseware,bts,asmr,billie eilish,pewdiepie,baby shark,nature documentary,blackpink,old town road,
 3 | music,vevo,peliculas completas en español,minecraft,fortnite,ariana grande,alan walker,calma,musica,wwe,cosmology,black holes,
 4 | neutron star,david attenborough,cake recipe,pasta recipe,twitch stream,George Hotz,cs50 lecture,computerphile,talks at google,noam chomsky,periodic table,
 5 | summer lecture,big bang,spacex,jared diamond,anthropology,bill nye,howstuffworks,discovery,electrical engineering,fortnite memes,fortnite boogiedown,
 6 | fortnite funny moments,fortnite gameplay,chuck palahniuk,max boyle,philosophy,howard stern,ash ketchum,biology,Big Bang Theory,sufjan stevens,
 7 | nuclear power,data science,soccer,physics,history of physics,calculus,quantum chromodynamics,steroids,uncertainty principle,electromagnetism,
 8 | electrons,energy,elon musk,joseph smith,natural selection,spielberg,economics,rare earth minerals,earth science,evolution,geology,faraday,
 9 | chemistry,walmart,Story of Science,coffee,energy drinks,german,university of goettingen,france culture,quantum gravity,electroweak theory,
10 | black holes,Stephen Hawking,Cosmos,Freud,culinary arts,knife skills,animation,cs lewis,ethics,tech,marine biology,electrical engineering,
11 | geology undergrad,Raspberry Pi,microeconomics,sea monkeys,zinc,derivative,qatar,cbc,russ anderson,wall street journal,brewing,beer homebrew,
12 | eigenvalues,juventus,formula-1,berlin wall,Jolt Cola,James Madison,Thomas Jefferson,Lewis and Clark,Ferran Adrià,Game Theory,
13 | applied psychology,economic system,political science,first generation,food stamps,number line,quadratic formula,origami,pie chart,
14 | industry buzzword,marx,segregation,zeitgeist,vermeer,florentijn school,black protestant,Freedom Trail,Ada lovelace,Nietzche,Hegel,
15 | linguistics,tedx youth,Jacob Geller,Stanford lecture,school of life,Gilles Deleuze,Effective Altruism,Peter singer,frantz fanon,
16 | Hannah Arendt,Angela Davis,democracy now,Camus,David Graeber,Hito Steyerl,Foucault,simone de beauvoir,Walter Benjamin,Schopenhauer,
17 | bon appetit,maangchii,fermentation,khan academy,makeup tutorial,James Charles,vsauce,neural network,vox,dude perfect,vortex,plato,
18 | archimedes,Sagan,language of the body,criminal justice,gibran kahlil,psychology,geocities,sand,malala,fetus,pro life,scientific american,
19 | Fahrenheit 451,apocalypse now,Technoprogressive,Elvis,Evolution,Einstein,Catastrophes,3d printing,deepak chopra,Ancient Greek,immunology,
20 | Gaia,League of Extraordinary Gentlemen,Crypto,Cryptography,Mission to Mars,Virtual Reality,agriculture,oil,pesticides,sony,saturn,Banks,
21 | elon musk,spacex,apple,solar,iran,CERN,Health,Sam Harris,Metafilter,life extension,Balkanization,Convergence,Scientific Method,Magic,
22 | Sci-Fi,4chan,Nebula,Science,paradigm,Eugenics,Technology,Drugs,Steel,Shield,Electricity,Data Mining,Economics,Enterprise,Securities,
23 | Protective Social Services,Identity,Social Roots of Violence,Loyalty,movie review,GPT-3,true crime documentary,Rick Rubin,red bull music academy,
24 | tonight show,bbc,munchies,talk show,Bernie Sanders,germaine greer,university lecture,book review,kurzgesagt,news,agadmator,documentaries,tech,
25 | pickle recipe,food network,student cooking,anthony bourdain, fusion power, urban exploration, iphone review, car technology, verge, jeff beck,
26 | homemade sushi, ray william johnson, jennamarbles, jacksepticeye,react,card tricks,free solo,hip hop,tesla,comics,gta5,conspiracy,infowars,
27 | cambridge university,nasa,flat earth,richard dawkins,nazca lines,markiplier,statics,dynamics,interview,v for vendetta,incel,augmented reality,
28 | robotics,tumblr,shark week,technology connections,David Bowie,logo design,mentalism,Ai artificial intelligence,why is the sky blue,ancient aliens,
29 | lunar missions, nuclear disasters,space,national geographic videos,Sesame Street,Frank Zappa, ice hockey, Los Angeles Rams, modern art, Heavy Metal,
30 | poetry, science advocacy, science documentary, tv program, math tutorial,Acid Rock,Da Vinci Code,Aeronautics,London, pro wrestling,Dungeons and Dragons,
31 | Christopher Nolan,Fermat,latte art,brexit,entrepeneurship,english vocabulary,anarchism,funny vloggers,beauty products,dramaalert,plastic surgery,parallax effect,
32 | parkour,calculus,logarithms,history,architecture,steam trains,prank,social experiment,kardashians,minimalism,vegan recipes,cinematography,
33 | natural disaster,dota,cod,student cooking,anthony bourdain,Arnold Schwarzenegger,cosplay,boxing,dinosaur extinction,bill gates,Richard Feynman,
34 | David Lynch,Art History,Stephen Hawking,veganism,will it blend,Roman Mythology,medicine,antivaxxers,vlog,parody,sculpture,Ben Shapiro,
35 | Weird Al Yankovic,french cuisine,video game,Time Magazine,NPR,Quartz,the royal family,paleontology,Robin Williams,cinema,obama,
36 | archaeology,how it's made,gene editing,World War II,crispr,Zionism,Reddit,rap battle,patreon,lgbt,Uncharted,csgo,martin luther king,
37 | fibonacci,dj,visual effects,metal music,audiobook,war,greek mythology,ethics,stand up comedy,dresden files,tom hanks,capitalism,
38 | transgender issues,hemingway,frank sinatra,mythbusters,game of thrones,harry potter,speedrun,ps4,color theory,explain like i'm five,
39 | electoral college,poker,self help,global warming,lovecraft,film theory,NYU lecture,rationalism,logic,music theory,free speech,
40 | david foster wallace,louis CK,cancel culture,cynicalbrit,dark matter,pbs documentary,bjarne stroustrup,darwin,jordan peterson,
41 | arduino,reddit AMA,legal advice,self-driving car,ufo,zombie,neil gaiman,snoop dogg,tarantino,shakespeare,fifa,assassin's creed,
42 | clint eastwood,bass,LSD,steve jobs,atheism,crusades,women's rights,sir roger penrose,marcel duchamp,slavoj zizek,jake paul,deepmind,big think,dev summit,
43 | wordpress tutorial,web design,tutorial,travel video,anthropology,photosynthesis,john green,crash course history,crash course chemistry,crash course philosophy,crash course,
44 | internet historian,crime,2008 recession,revolution,fruitarian,deconstructed,stalin,fluid dynamics,phd thesis,how to write,north korea,aerodynamics,
45 | how rockets work,italian food,Terrence McKenna,Uighurs,convention,tibees,let's play,Jamaica,Vaporwave,amazon,deep web,street food,missionaries,carl jung,disease,


--------------------------------------------------------------------------------
/get_subs.py:
--------------------------------------------------------------------------------
 1 | import traceback, argparse
 2 | from multiprocessing import Pool, cpu_count
 3 | from itertools import repeat
 4 | from downloader_class import Subtitles_downloader
 5 | import csv, os
 6 | from utils import chunks
 7 | 
 8 | def download_subs_single(queries, out_path="out", save_links=True, scrolldown=1000):
 9 |     try:
10 |         sub_downloader = Subtitles_downloader(out_path=out_path, save_links=save_links, scrolldown=scrolldown)
11 |         sub_downloader.search(queries)
12 |         sub_downloader.download_subs()
13 |     except:
14 |         print('Thread failed!')
15 |         traceback.print_exc()
16 | 
17 | 
18 | def download_subs_mp(queries, out_path="out", save_links=True, scrolldown=1000):
19 |     queries = chunks(queries, (len(queries) // cpu_count() - 1))
20 |     with Pool(cpu_count() - 1) as p:
21 |         p.starmap(download_subs_single, zip(queries, repeat(out_path), repeat(save_links), repeat(scrolldown)))
22 |     print('Done!')
23 | 
24 | 
25 | if __name__ == "__main__":
26 |     parser = argparse.ArgumentParser(description='CLI for YT_subtitles - extracts Youtube subtitles from a list of search terms')
27 |     parser.add_argument('search_terms',
28 |                         help='A comma separated list of search terms, alternatively, pass path to a csv with the -c '
29 |                              'argument',
30 |                         type=str)
31 |     parser.add_argument('--out_path', help='Output location for final .txt files', type=str, required=False,
32 |                         default='output')
33 |     parser.add_argument('-s', '--save_links', help='whether to save links to a .csv file', action="store_false")
34 |     parser.add_argument('-c', '--csv',
35 |                         help='if true, positional arg should be a path to a .csv file containing search terms',
36 |                         action="store_true")
37 |     parser.add_argument('--scroll', help='how far to scroll down in the youtube search', type=int, required=False,
38 |                         default=1000)
39 |     args = parser.parse_args()
40 |     os.makedirs(args.out_path, exist_ok=True)
41 |     if args.save_links:
42 |         os.makedirs("links", exist_ok=True)
43 |     if not args.csv:
44 |         search_terms = args.search_terms.split(',')
45 |     else:
46 |         search_terms = []
47 |         with open(args.search_terms, newline='') as inputfile:
48 |             for row in csv.reader(inputfile):
49 |                 search_terms.append(row)
50 |         # flattens list of list into unique list of items that aren't empty
51 |         search_terms = list(set([item for sublist in search_terms for item in sublist if item]))
52 |     print('Searching Youtube for: \n {}'.format(search_terms))
53 |     download_subs_mp(search_terms, args.out_path, args.save_links, args.scroll)
54 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | bs4
2 | requests_html
3 | youtube_transcript_api
4 | pycountry


--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
 1 | import pycountry
 2 | 
 3 | all_langs = pycountry.languages
 4 | 
 5 | 
 6 | def chunks(l, n):
 7 |     n = max(1, n)
 8 |     return (l[i:i + n] for i in range(0, len(l), n))
 9 | 
10 | 
11 | def lang_code_to_name(lang_code):
12 |     lang = all_langs.get(alpha_2="{}".format(lang_code))
13 |     if lang is None:
14 |         lang = all_langs.get(alpha_3="{}".format(lang_code))
15 |         if lang is None:
16 |             # sometimes lang codes have a dash that then specifies regional dialect - just take the first part
17 |             lang = all_langs.get(alpha_2="{}".format(lang_code.split("-")[0]))
18 |             if lang is None:
19 |                 lang = all_langs.get(alpha_2="{}".format(lang_code.split("-")[0]))
20 |                 if lang is None:
21 |                     if "zh" in lang_code:
22 |                         # this should cover lots of regional zh langs (sorry to lump them all into one!)
23 |                         lang = all_langs.get(alpha_2="{}".format("zh"))
24 |                         if lang is None:
25 |                             print('No language name found for {}, returning language code'.format(lang_code))
26 |                             return lang_code
27 |                     else:
28 |                         return lang_code
29 |     return lang.name
30 | 


--------------------------------------------------------------------------------