├── fetchmap.sh
├── cleanit.sh
├── pullit.sh
├── pullit_series.sh
├── gencaption.py
├── runscrape.sh
├── pullit.py
├── gencsv.py
├── README.md
└── pullit_atoz.py

/fetchmap.sh:
--------------------------------------------------------------------------------
#!/bin/sh
# Refresh the Disney+ sitemaps used by the sitemap-based pullers.
# -f instead of -r: these are plain files, and -f avoids an error when they
# do not exist yet.
rm -f d-sitemap-*.xml pullurls.txt
wget https://cde-lumiere-disneyplus.bamgrid.com/d-sitemap-1.xml
wget https://cde-lumiere-disneyplus.bamgrid.com/d-sitemap-2.xml
--------------------------------------------------------------------------------
/cleanit.sh:
--------------------------------------------------------------------------------
#!/bin/sh
# Delete scraped files whose MD5 matches a known unwanted response
# (presumably an identical error/placeholder payload served for many titles).
# md5sum writes "<hash>  <file>" with two spaces, so the filename is field 3
# when splitting on single spaces.
#bigfiles="$(grep "26f92b06ea07e32d487c58faa81de51a" /tmp/dp_md5 | cut -d " " -f 3)"
bigfiles="$(grep "992d453ab2832f6b25f02c5856f27541" /tmp/dp_md5_series | cut -d " " -f 3)"
for i in $bigfiles
do
    rm "$i"
done
--------------------------------------------------------------------------------
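The two hex strings in cleanit.sh look like MD5 digests of response bodies that come back byte-identical for many titles (presumably error or placeholder payloads). A minimal sketch of how such digests could be rediscovered after a scrape, assuming duplicated bodies indicate junk; the directory name is only an example:

```
# Rank response-body digests by how many files share them; digests with an
# unusually high count are candidates for the cleanit.sh blocklist.
md5sum disneyplus_movies/* | cut -d " " -f 1 | sort | uniq -c | sort -rn | head
```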
/pullit.sh:
--------------------------------------------------------------------------------
#!/bin/sh
# Turn sitemap content URLs into DmcVideoBundle metadata URLs keyed by the
# trailing slug of each page URL.
allvideos="$(grep -h -o -E "https://www\.disneyplus\.com/(movies|series)/[^<]*" d-sitemap-1.xml d-sitemap-2.xml)"
for i in $allvideos
do
    part="$(basename "$i")"
    echo "https://disney.content.edge.bamgrid.com/svc/content/DmcVideoBundle/version/5.1/region/US/audience/false/maturity/1450/language/en/encodedFamilyId/$part"
done
--------------------------------------------------------------------------------
/pullit_series.sh:
--------------------------------------------------------------------------------
#!/bin/sh
# Same idea as pullit.sh, but only for series, which use the DmcSeriesBundle
# endpoint keyed by encodedSeriesId.
allvideos="$(grep -h -o -E "https://www\.disneyplus\.com/series/[^<]*" d-sitemap-1.xml d-sitemap-2.xml)"
for i in $allvideos
do
    part="$(basename "$i")"
    echo "https://disney.content.edge.bamgrid.com/svc/content/DmcSeriesBundle/version/5.1/region/US/audience/false/maturity/1450/language/en/encodedSeriesId/$part"
done
--------------------------------------------------------------------------------
/gencaption.py:
--------------------------------------------------------------------------------
# Count how often each closed-caption ("[CC]") subtitle track appears across
# a scrape (hardcoded here to the disneyplus_20220130 folder).
import gencsv

rows = gencsv.doall("disneyplus_20220130")
counts = {}
for row in rows:
    # gencsv rows are [title, id, audio, subtitles]; subtitles are |-joined.
    subtitles = row[3].split("|")
    for s in subtitles:
        if not s.endswith("[CC]"):
            continue
        counts[s] = counts.get(s, 0) + 1
for name in sorted(counts):
    print(name + ":", counts[name])
--------------------------------------------------------------------------------
/runscrape.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# usage: ./runscrape.sh scrapedir
set -e
langs="en-US fr-CA fr-FR de-DE nl-NL es-CL"
basedir="$PWD"

# Same cleanup as cleanit.sh: delete downloads whose MD5 matches a known
# unwanted response.
function cleanit() {
    bigfiles="$(grep -e "992d453ab2832f6b25f02c5856f27541" -e "26f92b06ea07e32d487c58faa81de51a" "$1" | cut -d " " -f 3)"
    for i in $bigfiles
    do
        rm "$i"
    done
}

mkdir "$1"
cd "$1"

# uncomment for sitemap based pulling
# wget https://cde-lumiere-disneyplus.bamgrid.com/d-sitemap-1.xml
# wget https://cde-lumiere-disneyplus.bamgrid.com/d-sitemap-2.xml

# Scrape one locale into its own subdirectory. Each call runs in a background
# subshell below, so the cd calls here do not interfere across locales.
function pulllang() {
    langname="$1"
    mkdir "$langname"
    cd "$langname"
    # python3 "$basedir/pullit.py" "$langname" allurls.txt allurls_series.txt ../*.xml
    python3 "$basedir/pullit_atoz.py" "$langname" allurls.txt allurls_series.txt
    mkdir disneyplus_movies
    mkdir disneyplus_series
    cd disneyplus_movies
    wget -i ../allurls.txt
    md5sum * >../md5_movies.txt
    cleanit ../md5_movies.txt
    cd ../disneyplus_series
    wget -i ../allurls_series.txt
    md5sum * >../md5_series.txt
    cleanit ../md5_series.txt
    cd ../../
}

for langname in $langs
do
    pulllang "$langname" &
done
wait
--------------------------------------------------------------------------------
/pullit.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
# usage: ./pullit.py en-US allurls.txt allurls_series.txt d-sitemap-1.xml d-sitemap-2.xml
import re
import sys

if len(sys.argv) < 5:
    print("usage: ./pullit.py en-US allurls.txt allurls_series.txt sitemap.xml [sitemap2.xml ...]")
    sys.exit(1)

lang = sys.argv[1]

# The optional (\w*-\w*/) segment matches locale-prefixed sitemap URLs.
moviesregex = re.compile(
    r"https://www\.disneyplus\.com/(\w*-\w*/)?movies/[^<]*")
seriesregex = re.compile(
    r"https://www\.disneyplus\.com/(\w*-\w*/)?series/[^<]*")

language_name, region_name = lang.split("-")
moviejsonbase = "https://disney.content.edge.bamgrid.com/svc/content/DmcVideoBundle/version/5.1/region/" + region_name + "/audience/false/maturity/1450/language/" + language_name + "/encodedFamilyId/"
seriesjsonbase = "https://disney.content.edge.bamgrid.com/svc/content/DmcSeriesBundle/version/5.1/region/" + region_name + "/audience/false/maturity/1450/language/" + language_name + "/encodedSeriesId/"

urls_movies = []
ids_movies = set()
urls_series = []
ids_series = set()
for filename in sys.argv[4:]:
    with open(filename, "r") as infile:
        indata = infile.read()
    for m in moviesregex.finditer(indata):
        url = m.group(0)
        movie_id = url[url.rfind("/") + 1:]
        if movie_id in ids_movies:
            continue
        ids_movies.add(movie_id)
        urls_movies.append(moviejsonbase + movie_id)
    for m in seriesregex.finditer(indata):
        url = m.group(0)
        series_id = url[url.rfind("/") + 1:]
        if series_id in ids_series:
            continue
        ids_series.add(series_id)
        urls_series.append(seriesjsonbase + series_id)
with open(sys.argv[2], "w") as outfile:
    outfile.write("\n".join(urls_movies))
with open(sys.argv[3], "w") as outfile:
    outfile.write("\n".join(urls_series))
--------------------------------------------------------------------------------
/gencsv.py:
--------------------------------------------------------------------------------
# Flatten scraped DmcVideoBundle/DmcSeriesBundle JSON into rows of
# [title, id, audio tracks, subtitle tracks].
import csv
import json
import os
import sys


def doone(filename, nameid):
    with open(filename, "r") as infile:
        indata = json.load(infile)
    if "DmcSeriesBundle" in indata["data"]:
        # For series, take the track metadata from the first episode.
        videos = indata["data"]["DmcSeriesBundle"]["episodes"]["videos"]
        if len(videos) == 0:
            return None
        video = videos[0]
        textbundle = indata["data"]["DmcSeriesBundle"]["series"]["text"]
        texttype = "series"
    else:
        video = indata["data"]["DmcVideoBundle"]["video"]
        textbundle = video["text"]
        texttype = "program"
    media_metadata = video["mediaMetadata"]

    def nameTrack(i):
        if i["renditionName"]:
            return i["renditionName"]
        return i["language"] + "-" + i["trackType"]

    tracks_text = "|".join(
        sorted([nameTrack(i) for i in media_metadata["audioTracks"]]))
    subtitles_text = "|".join(
        sorted([
            nameTrack(i) for i in media_metadata["captions"]
            if i["trackType"] != "FORCED"
        ]))
    video_title = textbundle["title"]["full"][texttype]["default"]["content"]
    return [video_title, nameid, tracks_text, subtitles_text]


def doall(foldername):
    rows = []
    for subfolder in ["disneyplus_movies", "disneyplus_series"]:
        for filename in os.listdir(os.path.join(foldername, subfolder)):
            row = doone(os.path.join(foldername, subfolder, filename),
                        filename)
            if row is None:
                continue
            rows.append(row)
    return rows


def main():
    rows = doall(sys.argv[1])
    rows.sort(key=lambda a: a[0])
    # newline="" is what the csv module expects when writing files.
    with open(sys.argv[2], "w", newline="") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["Title", "ID", "Audio", "Subtitles"])
        writer.writerows(rows)


if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
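For a single locale, gencsv.py can also be invoked directly. A sketch, assuming the per-locale directory layout that runscrape.sh creates (the output filename is arbitrary):

```
python3 gencsv.py scrape_outdir/en-US en-US.csv
```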
"disneyplus_series"]: 42 | for filename in os.listdir(foldername + "/" + subfolder): 43 | row = doone(foldername + "/" + subfolder + "/" + filename, 44 | filename) 45 | if row is None: 46 | continue 47 | rows.append(row) 48 | return rows 49 | 50 | 51 | def main(): 52 | rows = doall(sys.argv[1]) 53 | rows.sort(key=lambda a: a[0]) 54 | with open(sys.argv[2], "w") as csvfile: 55 | writer = csv.writer(csvfile) 56 | writer.writerow(["Title", "ID", "Audio", "Subtitles"]) 57 | writer.writerows(rows) 58 | 59 | 60 | if __name__ == "__main__": 61 | main() 62 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Scripts for scraping metadata from Disney+. 2 | 3 | Usage: 4 | 5 | ``` 6 | ./runscrape.sh scrape_outdir 7 | ./gencsv_all_lang.sh scrape_outdir 8 | ``` 9 | 10 | Scraped output (2023-05-27): 11 | 12 | - Locales: de-DE, en-US, es-CL, fr-CA, fr-FR, nl-NL 13 | - [JSON/CSVs for all locales](https://drive.google.com/file/d/1hPMKYcz8SCdasmBE4UfB80QzNL6ohYtY/view?usp=sharing) 14 | - [en-US Spreadsheet](https://docs.google.com/spreadsheets/d/1L4pBwOAa9QXQx6zhnrtkqITf1M65xRbbw3n2x6cWeMI/edit?usp=sharing) 15 | 16 | Scraped output (2023-05-24): 17 | 18 | - Locales: de-DE, en-US, es-CL, fr-CA, fr-FR, nl-NL 19 | - [JSON/CSVs for all locales](https://drive.google.com/file/d/13ETsBlA_y_sHy7gUdRnpEqqftYmooXqy/view?usp=sharing) 20 | - [en-US Spreadsheet](https://docs.google.com/spreadsheets/d/1Rd5B-TyfAIpx9ChhuIje6xlwilm-CB1uqMcrEOWZ6Mg/edit?usp=sharing) 21 | 22 | Scraped output (2022-10-14): 23 | 24 | - Locales: de-DE, en-US, es-CL, fr-CA, fr-FR, nl-NL 25 | - [JSON/CSVs for all locales](https://drive.google.com/file/d/1LTNYt8DiSov3D8AQh56syvEzmq3l6FBH/view?usp=sharing) 26 | - [en-US Spreadsheet](https://docs.google.com/spreadsheets/d/1JMh5OPK8MLVyL9bUk39lN2A0A-jjI_QbSvHO4aBXB80/edit?usp=sharing) 27 | 28 | Scraped output (2022-09-10): 29 | 30 | - Locales: de-DE, en-US, es-CL, fr-CA, fr-FR, nl-NL 31 | - [JSON/CSVs for all locales](https://drive.google.com/file/d/1aZrSjxisn_YOL30GSMVxDhbapyZaWSpb/view?usp=sharing) 32 | - [en-US Spreadsheet](https://docs.google.com/spreadsheets/d/1X7hNw8fCgSD6HScdkaPoCDOpFAkNd6_RQYjBTEi00Qs/edit?usp=sharing) 33 | 34 | Scraped output (2022-08-07): 35 | 36 | - Locales: de-DE, en-US, es-CL, fr-CA, fr-FR, nl-NL 37 | - [JSON/CSVs for all locales](https://drive.google.com/file/d/1nNBv1IFJrQAAyKgaUD-nU7wQGkBVcNGU/view?usp=sharing) 38 | - [en-US Spreadsheet](https://docs.google.com/spreadsheets/d/1Xp7LkLFihrQck0yulAgjS0aoM7Pt3PkyIID5Kbzlmvo/edit?usp=sharing) 39 | 40 | Scraped output (2022-07-09): 41 | - Locales: de-DE, en-US, es-CL, fr-CA, fr-FR, nl-NL 42 | - [JSON/CSVs for all locales](https://drive.google.com/file/d/1XFXy94Ekr76NtiUj4iQU_VF3qFr2uUph/view?usp=sharing) 43 | - [en-US Spreadsheet](https://docs.google.com/spreadsheets/d/1j45A_hLvRBAQ06poayvEran5YzXVtJMdN_ytq8EC6R4/edit?usp=sharing) 44 | 45 | 46 | Scraped output (2022-06-27): 47 | 48 | - Locales: de-DE, en-US, es-CL, fr-CA, fr-FR, nl-NL 49 | - Some series are missing 50 | - [JSON/CSVs for all locales](https://drive.google.com/file/d/1fNH61zl_t89zRJfw48zkzT0PNDarNBuH/view?usp=sharing) 51 | - [en-US Spreadsheet](https://docs.google.com/spreadsheets/d/1A01y1cFijAONLyOSQKp_MOawqMvNyulPH079LOVXegM/edit) 52 | 53 | 54 | Scraped output (en-US, 2022-01-30): 55 | 56 | - [JSON](https://drive.google.com/file/d/1fNH61zl_t89zRJfw48zkzT0PNDarNBuH/view?usp=sharing) 57 | - 
Scraped output (2023-05-27):

- Locales: de-DE, en-US, es-CL, fr-CA, fr-FR, nl-NL
- [JSON/CSVs for all locales](https://drive.google.com/file/d/1hPMKYcz8SCdasmBE4UfB80QzNL6ohYtY/view?usp=sharing)
- [en-US Spreadsheet](https://docs.google.com/spreadsheets/d/1L4pBwOAa9QXQx6zhnrtkqITf1M65xRbbw3n2x6cWeMI/edit?usp=sharing)

Scraped output (2023-05-24):

- Locales: de-DE, en-US, es-CL, fr-CA, fr-FR, nl-NL
- [JSON/CSVs for all locales](https://drive.google.com/file/d/13ETsBlA_y_sHy7gUdRnpEqqftYmooXqy/view?usp=sharing)
- [en-US Spreadsheet](https://docs.google.com/spreadsheets/d/1Rd5B-TyfAIpx9ChhuIje6xlwilm-CB1uqMcrEOWZ6Mg/edit?usp=sharing)

Scraped output (2022-10-14):

- Locales: de-DE, en-US, es-CL, fr-CA, fr-FR, nl-NL
- [JSON/CSVs for all locales](https://drive.google.com/file/d/1LTNYt8DiSov3D8AQh56syvEzmq3l6FBH/view?usp=sharing)
- [en-US Spreadsheet](https://docs.google.com/spreadsheets/d/1JMh5OPK8MLVyL9bUk39lN2A0A-jjI_QbSvHO4aBXB80/edit?usp=sharing)

Scraped output (2022-09-10):

- Locales: de-DE, en-US, es-CL, fr-CA, fr-FR, nl-NL
- [JSON/CSVs for all locales](https://drive.google.com/file/d/1aZrSjxisn_YOL30GSMVxDhbapyZaWSpb/view?usp=sharing)
- [en-US Spreadsheet](https://docs.google.com/spreadsheets/d/1X7hNw8fCgSD6HScdkaPoCDOpFAkNd6_RQYjBTEi00Qs/edit?usp=sharing)

Scraped output (2022-08-07):

- Locales: de-DE, en-US, es-CL, fr-CA, fr-FR, nl-NL
- [JSON/CSVs for all locales](https://drive.google.com/file/d/1nNBv1IFJrQAAyKgaUD-nU7wQGkBVcNGU/view?usp=sharing)
- [en-US Spreadsheet](https://docs.google.com/spreadsheets/d/1Xp7LkLFihrQck0yulAgjS0aoM7Pt3PkyIID5Kbzlmvo/edit?usp=sharing)

Scraped output (2022-07-09):

- Locales: de-DE, en-US, es-CL, fr-CA, fr-FR, nl-NL
- [JSON/CSVs for all locales](https://drive.google.com/file/d/1XFXy94Ekr76NtiUj4iQU_VF3qFr2uUph/view?usp=sharing)
- [en-US Spreadsheet](https://docs.google.com/spreadsheets/d/1j45A_hLvRBAQ06poayvEran5YzXVtJMdN_ytq8EC6R4/edit?usp=sharing)

Scraped output (2022-06-27):

- Locales: de-DE, en-US, es-CL, fr-CA, fr-FR, nl-NL
- Some series are missing
- [JSON/CSVs for all locales](https://drive.google.com/file/d/1fNH61zl_t89zRJfw48zkzT0PNDarNBuH/view?usp=sharing)
- [en-US Spreadsheet](https://docs.google.com/spreadsheets/d/1A01y1cFijAONLyOSQKp_MOawqMvNyulPH079LOVXegM/edit)

Scraped output (en-US, 2022-01-30):

- [JSON](https://drive.google.com/file/d/1fNH61zl_t89zRJfw48zkzT0PNDarNBuH/view?usp=sharing)
- [Spreadsheet](https://docs.google.com/spreadsheets/d/1A01y1cFijAONLyOSQKp_MOawqMvNyulPH079LOVXegM/edit)
--------------------------------------------------------------------------------
/pullit_atoz.py:
--------------------------------------------------------------------------------
import sys

import requests

if len(sys.argv) != 4:
    print("usage: ./pullit_atoz.py en-US allurls.txt allurls_series.txt")
    sys.exit(1)

# For series, pulling from the A-Z collection works better than pulling from
# the sitemap: we get ~460 series instead of ~300.
# For movies, A-Z + Shorts collections mostly match the sitemap. We miss
# unreleased movies that are in the sitemap but only have a trailer, and you
# can't watch those anyway.

lang = sys.argv[1]

language_name, region_name = lang.split("-")
moviejsonbase = "https://disney.content.edge.bamgrid.com/svc/content/DmcVideoBundle/version/5.1/region/" + region_name + "/audience/false/maturity/1830/language/" + language_name + "/encodedFamilyId/"
seriesjsonbase = "https://disney.content.edge.bamgrid.com/svc/content/DmcSeriesBundle/version/5.1/region/" + region_name + "/audience/false/maturity/1830/language/" + language_name + "/encodedSeriesId/"

# A-Z collection
base_movies_collection = "https://disney.content.edge.bamgrid.com/svc/content/CuratedSet/version/5.1/region/{}/audience/k-false,l-true/maturity/1830/language/{}/setId/9f7c38e5-41c3-47b4-b99e-b5b3d2eb95d4/pageSize/30/page/"
# Shorts collection
base_shorts_collection = "https://disney.content.edge.bamgrid.com/svc/content/CuratedSet/version/5.1/region/{}/audience/k-false,l-true/maturity/1830/language/{}/setId/34c856af-325c-4603-8d6b-dd9dc4695a69/pageSize/30/page/"
base_series_collection = "https://disney.content.edge.bamgrid.com/svc/content/CuratedSet/version/5.1/region/{}/audience/k-false,l-true/maturity/1830/language/{}/setId/53adf843-491b-40ae-9b46-bccbceed863b/pageSize/30/page/"


def graballpages(base_url):
    # Walk the paged CuratedSet endpoint until an empty page comes back.
    page = 1
    retval = []
    while True:
        page_url = base_url + str(page)
        print(page_url)
        resp = requests.get(page_url, timeout=60)
        j = resp.json()
        if len(j["data"]["CuratedSet"]["items"]) == 0:
            break
        retval.append(j)
        page += 1
    return retval


def grabone(base_url, output_base_url):
    allpages = graballpages(base_url)
    allout = []
    for page in allpages:
        for item in page["data"]["CuratedSet"]["items"]:
            # Series bundles are keyed by encodedSeriesId; movie bundles by
            # the family's encodedFamilyId.
            if "encodedSeriesId" in output_base_url:
                itemid = item["encodedSeriesId"]
            else:
                itemid = item["family"]["encodedFamilyId"]
            allout.append(output_base_url + itemid)
    return allout


urls_movies = grabone(
    base_movies_collection.format(region_name, language_name), moviejsonbase)
urls_movies += grabone(
    base_shorts_collection.format(region_name, language_name), moviejsonbase)
urls_series = grabone(
    base_series_collection.format(region_name, language_name), seriesjsonbase)
with open(sys.argv[2], "w") as outfile:
    outfile.write("\n".join(urls_movies))
with open(sys.argv[3], "w") as outfile:
    outfile.write("\n".join(urls_series))
--------------------------------------------------------------------------------
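A quick way to spot-check one page of the A-Z movie collection that pullit_atoz.py walks (a sketch, not part of the repo; assumes curl and jq are installed, and reads the same encodedFamilyId field the script uses):

```
curl -s "https://disney.content.edge.bamgrid.com/svc/content/CuratedSet/version/5.1/region/US/audience/k-false,l-true/maturity/1830/language/en/setId/9f7c38e5-41c3-47b4-b99e-b5b3d2eb95d4/pageSize/30/page/1" \
    | jq -r '.data.CuratedSet.items[].family.encodedFamilyId'
```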