├── fetchmap.sh
├── cleanit.sh
├── pullit.sh
├── pullit_series.sh
├── gencaption.py
├── runscrape.sh
├── pullit.py
├── gencsv.py
├── README.md
└── pullit_atoz.py

/fetchmap.sh:
--------------------------------------------------------------------------------
#!/bin/sh
# Refresh the Disney+ sitemaps used by the sitemap-based pullers.
# -f instead of -r: these are plain files, and -f avoids an error when they
# do not exist yet.
rm -f d-sitemap-*.xml pullurls.txt
wget https://cde-lumiere-disneyplus.bamgrid.com/d-sitemap-1.xml
wget https://cde-lumiere-disneyplus.bamgrid.com/d-sitemap-2.xml
--------------------------------------------------------------------------------
/cleanit.sh:
--------------------------------------------------------------------------------
#!/bin/sh
# Delete scraped files whose MD5 matches a known unwanted response
# (presumably an identical error/placeholder payload served for many titles).
# md5sum writes "<hash>  <file>" with two spaces, so the filename is field 3
# when splitting on single spaces.
#bigfiles="$(grep "26f92b06ea07e32d487c58faa81de51a" /tmp/dp_md5 | cut -d " " -f 3)"
bigfiles="$(grep "992d453ab2832f6b25f02c5856f27541" /tmp/dp_md5_series | cut -d " " -f 3)"
for i in $bigfiles
do
    rm "$i"
done
--------------------------------------------------------------------------------
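The two hex strings in cleanit.sh look like MD5 digests of response bodies that come back byte-identical for many titles (presumably error or placeholder payloads). A minimal sketch of how such digests could be rediscovered after a scrape, assuming duplicated bodies indicate junk; the directory name is only an example:

```
# Rank response-body digests by how many files share them; digests with an
# unusually high count are candidates for the cleanit.sh blocklist.
md5sum disneyplus_movies/* | cut -d " " -f 1 | sort | uniq -c | sort -rn | head
```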
/pullit.sh:
--------------------------------------------------------------------------------
#!/bin/sh
# Turn sitemap content URLs into DmcVideoBundle metadata URLs keyed by the
# trailing slug of each page URL.
allvideos="$(grep -h -o -E "https://www\.disneyplus\.com/(movies|series)/[^<]*" d-sitemap-1.xml d-sitemap-2.xml)"
for i in $allvideos
do
    part="$(basename "$i")"
    echo "https://disney.content.edge.bamgrid.com/svc/content/DmcVideoBundle/version/5.1/region/US/audience/false/maturity/1450/language/en/encodedFamilyId/$part"
done
--------------------------------------------------------------------------------
/pullit_series.sh:
--------------------------------------------------------------------------------
#!/bin/sh
# Same idea as pullit.sh, but only for series, which use the DmcSeriesBundle
# endpoint keyed by encodedSeriesId.
allvideos="$(grep -h -o -E "https://www\.disneyplus\.com/series/[^<]*" d-sitemap-1.xml d-sitemap-2.xml)"
for i in $allvideos
do
    part="$(basename "$i")"
    echo "https://disney.content.edge.bamgrid.com/svc/content/DmcSeriesBundle/version/5.1/region/US/audience/false/maturity/1450/language/en/encodedSeriesId/$part"
done
--------------------------------------------------------------------------------
/gencaption.py:
--------------------------------------------------------------------------------
# Count how often each closed-caption ("[CC]") subtitle track appears across
# a scrape (hardcoded here to the disneyplus_20220130 folder).
import gencsv

rows = gencsv.doall("disneyplus_20220130")
counts = {}
for row in rows:
    # gencsv rows are [title, id, audio, subtitles]; subtitles are |-joined.
    subtitles = row[3].split("|")
    for s in subtitles:
        if not s.endswith("[CC]"):
            continue
        counts[s] = counts.get(s, 0) + 1
for name in sorted(counts):
    print(name + ":", counts[name])
--------------------------------------------------------------------------------
/runscrape.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# usage: ./runscrape.sh scrapedir
set -e
langs="en-US fr-CA fr-FR de-DE nl-NL es-CL"
basedir="$PWD"

# Same cleanup as cleanit.sh: delete downloads whose MD5 matches a known
# unwanted response.
function cleanit() {
    bigfiles="$(grep -e "992d453ab2832f6b25f02c5856f27541" -e "26f92b06ea07e32d487c58faa81de51a" "$1" | cut -d " " -f 3)"
    for i in $bigfiles
    do
        rm "$i"
    done
}

mkdir "$1"
cd "$1"

# uncomment for sitemap based pulling
# wget https://cde-lumiere-disneyplus.bamgrid.com/d-sitemap-1.xml
# wget https://cde-lumiere-disneyplus.bamgrid.com/d-sitemap-2.xml

# Scrape one locale into its own subdirectory. Each call runs in a background
# subshell below, so the cd calls here do not interfere across locales.
function pulllang() {
    langname="$1"
    mkdir "$langname"
    cd "$langname"
    # python3 "$basedir/pullit.py" "$langname" allurls.txt allurls_series.txt ../*.xml
    python3 "$basedir/pullit_atoz.py" "$langname" allurls.txt allurls_series.txt
    mkdir disneyplus_movies
    mkdir disneyplus_series
    cd disneyplus_movies
    wget -i ../allurls.txt
    md5sum * >../md5_movies.txt
    cleanit ../md5_movies.txt
    cd ../disneyplus_series
    wget -i ../allurls_series.txt
    md5sum * >../md5_series.txt
    cleanit ../md5_series.txt
    cd ../../
}

for langname in $langs
do
    pulllang "$langname" &
done
wait
--------------------------------------------------------------------------------
/pullit.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
# usage: ./pullit.py en-US allurls.txt allurls_series.txt d-sitemap-1.xml d-sitemap-2.xml
import re
import sys

if len(sys.argv) < 5:
    print("usage: ./pullit.py en-US allurls.txt allurls_series.txt sitemap.xml [sitemap2.xml ...]")
    sys.exit(1)

lang = sys.argv[1]

# The optional (\w*-\w*/) segment matches locale-prefixed sitemap URLs.
moviesregex = re.compile(
    r"https://www\.disneyplus\.com/(\w*-\w*/)?movies/[^<]*")
seriesregex = re.compile(
    r"https://www\.disneyplus\.com/(\w*-\w*/)?series/[^<]*")

language_name, region_name = lang.split("-")
moviejsonbase = "https://disney.content.edge.bamgrid.com/svc/content/DmcVideoBundle/version/5.1/region/" + region_name + "/audience/false/maturity/1450/language/" + language_name + "/encodedFamilyId/"
seriesjsonbase = "https://disney.content.edge.bamgrid.com/svc/content/DmcSeriesBundle/version/5.1/region/" + region_name + "/audience/false/maturity/1450/language/" + language_name + "/encodedSeriesId/"

urls_movies = []
ids_movies = set()
urls_series = []
ids_series = set()
for filename in sys.argv[4:]:
    with open(filename, "r") as infile:
        indata = infile.read()
    for m in moviesregex.finditer(indata):
        url = m.group(0)
        movie_id = url[url.rfind("/") + 1:]
        if movie_id in ids_movies:
            continue
        ids_movies.add(movie_id)
        urls_movies.append(moviejsonbase + movie_id)
    for m in seriesregex.finditer(indata):
        url = m.group(0)
        series_id = url[url.rfind("/") + 1:]
        if series_id in ids_series:
            continue
        ids_series.add(series_id)
        urls_series.append(seriesjsonbase + series_id)
with open(sys.argv[2], "w") as outfile:
    outfile.write("\n".join(urls_movies))
with open(sys.argv[3], "w") as outfile:
    outfile.write("\n".join(urls_series))
--------------------------------------------------------------------------------
/gencsv.py:
--------------------------------------------------------------------------------
# Flatten scraped DmcVideoBundle/DmcSeriesBundle JSON into rows of
# [title, id, audio tracks, subtitle tracks].
import csv
import json
import os
import sys


def doone(filename, nameid):
    with open(filename, "r") as infile:
        indata = json.load(infile)
    if "DmcSeriesBundle" in indata["data"]:
        # For series, take the track metadata from the first episode.
        videos = indata["data"]["DmcSeriesBundle"]["episodes"]["videos"]
        if len(videos) == 0:
            return None
        video = videos[0]
        textbundle = indata["data"]["DmcSeriesBundle"]["series"]["text"]
        texttype = "series"
    else:
        video = indata["data"]["DmcVideoBundle"]["video"]
        textbundle = video["text"]
        texttype = "program"
    media_metadata = video["mediaMetadata"]

    def nameTrack(i):
        if i["renditionName"]:
            return i["renditionName"]
        return i["language"] + "-" + i["trackType"]

    tracks_text = "|".join(
        sorted([nameTrack(i) for i in media_metadata["audioTracks"]]))
    subtitles_text = "|".join(
        sorted([
            nameTrack(i) for i in media_metadata["captions"]
            if i["trackType"] != "FORCED"
        ]))
    video_title = textbundle["title"]["full"][texttype]["default"]["content"]
    return [video_title, nameid, tracks_text, subtitles_text]


def doall(foldername):
    rows = []
    for subfolder in ["disneyplus_movies", "disneyplus_series"]:
        for filename in os.listdir(os.path.join(foldername, subfolder)):
            row = doone(os.path.join(foldername, subfolder, filename),
                        filename)
            if row is None:
                continue
            rows.append(row)
    return rows


def main():
    rows = doall(sys.argv[1])
    rows.sort(key=lambda a: a[0])
    # newline="" is what the csv module expects when writing files.
    with open(sys.argv[2], "w", newline="") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["Title", "ID", "Audio", "Subtitles"])
        writer.writerows(rows)


if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
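For a single locale, gencsv.py can also be invoked directly. A sketch, assuming the per-locale directory layout that runscrape.sh creates (the output filename is arbitrary):

```
python3 gencsv.py scrape_outdir/en-US en-US.csv
```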
"disneyplus_series"]: 42 | for filename in os.listdir(foldername + "/" + subfolder): 43 | row = doone(foldername + "/" + subfolder + "/" + filename, 44 | filename) 45 | if row is None: 46 | continue 47 | rows.append(row) 48 | return rows 49 | 50 | 51 | def main(): 52 | rows = doall(sys.argv[1]) 53 | rows.sort(key=lambda a: a[0]) 54 | with open(sys.argv[2], "w") as csvfile: 55 | writer = csv.writer(csvfile) 56 | writer.writerow(["Title", "ID", "Audio", "Subtitles"]) 57 | writer.writerows(rows) 58 | 59 | 60 | if __name__ == "__main__": 61 | main() 62 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Scripts for scraping metadata from Disney+. 2 | 3 | Usage: 4 | 5 | ``` 6 | ./runscrape.sh scrape_outdir 7 | ./gencsv_all_lang.sh scrape_outdir 8 | ``` 9 | 10 | Scraped output (2023-05-27): 11 | 12 | - Locales: de-DE, en-US, es-CL, fr-CA, fr-FR, nl-NL 13 | - [JSON/CSVs for all locales](https://drive.google.com/file/d/1hPMKYcz8SCdasmBE4UfB80QzNL6ohYtY/view?usp=sharing) 14 | - [en-US Spreadsheet](https://docs.google.com/spreadsheets/d/1L4pBwOAa9QXQx6zhnrtkqITf1M65xRbbw3n2x6cWeMI/edit?usp=sharing) 15 | 16 | Scraped output (2023-05-24): 17 | 18 | - Locales: de-DE, en-US, es-CL, fr-CA, fr-FR, nl-NL 19 | - [JSON/CSVs for all locales](https://drive.google.com/file/d/13ETsBlA_y_sHy7gUdRnpEqqftYmooXqy/view?usp=sharing) 20 | - [en-US Spreadsheet](https://docs.google.com/spreadsheets/d/1Rd5B-TyfAIpx9ChhuIje6xlwilm-CB1uqMcrEOWZ6Mg/edit?usp=sharing) 21 | 22 | Scraped output (2022-10-14): 23 | 24 | - Locales: de-DE, en-US, es-CL, fr-CA, fr-FR, nl-NL 25 | - [JSON/CSVs for all locales](https://drive.google.com/file/d/1LTNYt8DiSov3D8AQh56syvEzmq3l6FBH/view?usp=sharing) 26 | - [en-US Spreadsheet](https://docs.google.com/spreadsheets/d/1JMh5OPK8MLVyL9bUk39lN2A0A-jjI_QbSvHO4aBXB80/edit?usp=sharing) 27 | 28 | Scraped output (2022-09-10): 29 | 30 | - Locales: de-DE, en-US, es-CL, fr-CA, fr-FR, nl-NL 31 | - [JSON/CSVs for all locales](https://drive.google.com/file/d/1aZrSjxisn_YOL30GSMVxDhbapyZaWSpb/view?usp=sharing) 32 | - [en-US Spreadsheet](https://docs.google.com/spreadsheets/d/1X7hNw8fCgSD6HScdkaPoCDOpFAkNd6_RQYjBTEi00Qs/edit?usp=sharing) 33 | 34 | Scraped output (2022-08-07): 35 | 36 | - Locales: de-DE, en-US, es-CL, fr-CA, fr-FR, nl-NL 37 | - [JSON/CSVs for all locales](https://drive.google.com/file/d/1nNBv1IFJrQAAyKgaUD-nU7wQGkBVcNGU/view?usp=sharing) 38 | - [en-US Spreadsheet](https://docs.google.com/spreadsheets/d/1Xp7LkLFihrQck0yulAgjS0aoM7Pt3PkyIID5Kbzlmvo/edit?usp=sharing) 39 | 40 | Scraped output (2022-07-09): 41 | - Locales: de-DE, en-US, es-CL, fr-CA, fr-FR, nl-NL 42 | - [JSON/CSVs for all locales](https://drive.google.com/file/d/1XFXy94Ekr76NtiUj4iQU_VF3qFr2uUph/view?usp=sharing) 43 | - [en-US Spreadsheet](https://docs.google.com/spreadsheets/d/1j45A_hLvRBAQ06poayvEran5YzXVtJMdN_ytq8EC6R4/edit?usp=sharing) 44 | 45 | 46 | Scraped output (2022-06-27): 47 | 48 | - Locales: de-DE, en-US, es-CL, fr-CA, fr-FR, nl-NL 49 | - Some series are missing 50 | - [JSON/CSVs for all locales](https://drive.google.com/file/d/1fNH61zl_t89zRJfw48zkzT0PNDarNBuH/view?usp=sharing) 51 | - [en-US Spreadsheet](https://docs.google.com/spreadsheets/d/1A01y1cFijAONLyOSQKp_MOawqMvNyulPH079LOVXegM/edit) 52 | 53 | 54 | Scraped output (en-US, 2022-01-30): 55 | 56 | - [JSON](https://drive.google.com/file/d/1fNH61zl_t89zRJfw48zkzT0PNDarNBuH/view?usp=sharing) 57 | - 
Scraped output (2023-05-27):

- Locales: de-DE, en-US, es-CL, fr-CA, fr-FR, nl-NL
- [JSON/CSVs for all locales](https://drive.google.com/file/d/1hPMKYcz8SCdasmBE4UfB80QzNL6ohYtY/view?usp=sharing)
- [en-US Spreadsheet](https://docs.google.com/spreadsheets/d/1L4pBwOAa9QXQx6zhnrtkqITf1M65xRbbw3n2x6cWeMI/edit?usp=sharing)

Scraped output (2023-05-24):

- Locales: de-DE, en-US, es-CL, fr-CA, fr-FR, nl-NL
- [JSON/CSVs for all locales](https://drive.google.com/file/d/13ETsBlA_y_sHy7gUdRnpEqqftYmooXqy/view?usp=sharing)
- [en-US Spreadsheet](https://docs.google.com/spreadsheets/d/1Rd5B-TyfAIpx9ChhuIje6xlwilm-CB1uqMcrEOWZ6Mg/edit?usp=sharing)

Scraped output (2022-10-14):

- Locales: de-DE, en-US, es-CL, fr-CA, fr-FR, nl-NL
- [JSON/CSVs for all locales](https://drive.google.com/file/d/1LTNYt8DiSov3D8AQh56syvEzmq3l6FBH/view?usp=sharing)
- [en-US Spreadsheet](https://docs.google.com/spreadsheets/d/1JMh5OPK8MLVyL9bUk39lN2A0A-jjI_QbSvHO4aBXB80/edit?usp=sharing)

Scraped output (2022-09-10):

- Locales: de-DE, en-US, es-CL, fr-CA, fr-FR, nl-NL
- [JSON/CSVs for all locales](https://drive.google.com/file/d/1aZrSjxisn_YOL30GSMVxDhbapyZaWSpb/view?usp=sharing)
- [en-US Spreadsheet](https://docs.google.com/spreadsheets/d/1X7hNw8fCgSD6HScdkaPoCDOpFAkNd6_RQYjBTEi00Qs/edit?usp=sharing)

Scraped output (2022-08-07):

- Locales: de-DE, en-US, es-CL, fr-CA, fr-FR, nl-NL
- [JSON/CSVs for all locales](https://drive.google.com/file/d/1nNBv1IFJrQAAyKgaUD-nU7wQGkBVcNGU/view?usp=sharing)
- [en-US Spreadsheet](https://docs.google.com/spreadsheets/d/1Xp7LkLFihrQck0yulAgjS0aoM7Pt3PkyIID5Kbzlmvo/edit?usp=sharing)

Scraped output (2022-07-09):

- Locales: de-DE, en-US, es-CL, fr-CA, fr-FR, nl-NL
- [JSON/CSVs for all locales](https://drive.google.com/file/d/1XFXy94Ekr76NtiUj4iQU_VF3qFr2uUph/view?usp=sharing)
- [en-US Spreadsheet](https://docs.google.com/spreadsheets/d/1j45A_hLvRBAQ06poayvEran5YzXVtJMdN_ytq8EC6R4/edit?usp=sharing)

Scraped output (2022-06-27):

- Locales: de-DE, en-US, es-CL, fr-CA, fr-FR, nl-NL
- Some series are missing
- [JSON/CSVs for all locales](https://drive.google.com/file/d/1fNH61zl_t89zRJfw48zkzT0PNDarNBuH/view?usp=sharing)
- [en-US Spreadsheet](https://docs.google.com/spreadsheets/d/1A01y1cFijAONLyOSQKp_MOawqMvNyulPH079LOVXegM/edit)

Scraped output (en-US, 2022-01-30):

- [JSON](https://drive.google.com/file/d/1fNH61zl_t89zRJfw48zkzT0PNDarNBuH/view?usp=sharing)
- [Spreadsheet](https://docs.google.com/spreadsheets/d/1A01y1cFijAONLyOSQKp_MOawqMvNyulPH079LOVXegM/edit)
--------------------------------------------------------------------------------
/pullit_atoz.py:
--------------------------------------------------------------------------------
import sys

import requests

if len(sys.argv) != 4:
    print("usage: ./pullit_atoz.py en-US allurls.txt allurls_series.txt")
    sys.exit(1)

# For series, pulling from the A-Z collection works better than pulling from
# the sitemap: we get ~460 series instead of ~300.
# For movies, A-Z + Shorts collections mostly match the sitemap. We miss
# unreleased movies that are in the sitemap but only have a trailer, and you
# can't watch those anyway.

lang = sys.argv[1]

language_name, region_name = lang.split("-")
moviejsonbase = "https://disney.content.edge.bamgrid.com/svc/content/DmcVideoBundle/version/5.1/region/" + region_name + "/audience/false/maturity/1830/language/" + language_name + "/encodedFamilyId/"
seriesjsonbase = "https://disney.content.edge.bamgrid.com/svc/content/DmcSeriesBundle/version/5.1/region/" + region_name + "/audience/false/maturity/1830/language/" + language_name + "/encodedSeriesId/"

# A-Z collection
base_movies_collection = "https://disney.content.edge.bamgrid.com/svc/content/CuratedSet/version/5.1/region/{}/audience/k-false,l-true/maturity/1830/language/{}/setId/9f7c38e5-41c3-47b4-b99e-b5b3d2eb95d4/pageSize/30/page/"
# Shorts collection
base_shorts_collection = "https://disney.content.edge.bamgrid.com/svc/content/CuratedSet/version/5.1/region/{}/audience/k-false,l-true/maturity/1830/language/{}/setId/34c856af-325c-4603-8d6b-dd9dc4695a69/pageSize/30/page/"
base_series_collection = "https://disney.content.edge.bamgrid.com/svc/content/CuratedSet/version/5.1/region/{}/audience/k-false,l-true/maturity/1830/language/{}/setId/53adf843-491b-40ae-9b46-bccbceed863b/pageSize/30/page/"


def graballpages(base_url):
    # Walk the paged CuratedSet endpoint until an empty page comes back.
    page = 1
    retval = []
    while True:
        page_url = base_url + str(page)
        print(page_url)
        resp = requests.get(page_url, timeout=60)
        j = resp.json()
        if len(j["data"]["CuratedSet"]["items"]) == 0:
            break
        retval.append(j)
        page += 1
    return retval


def grabone(base_url, output_base_url):
    allpages = graballpages(base_url)
    allout = []
    for page in allpages:
        for item in page["data"]["CuratedSet"]["items"]:
            # Series bundles are keyed by encodedSeriesId; movie bundles by
            # the family's encodedFamilyId.
            if "encodedSeriesId" in output_base_url:
                itemid = item["encodedSeriesId"]
            else:
                itemid = item["family"]["encodedFamilyId"]
            allout.append(output_base_url + itemid)
    return allout


urls_movies = grabone(
    base_movies_collection.format(region_name, language_name), moviejsonbase)
urls_movies += grabone(
    base_shorts_collection.format(region_name, language_name), moviejsonbase)
urls_series = grabone(
    base_series_collection.format(region_name, language_name), seriesjsonbase)
with open(sys.argv[2], "w") as outfile:
    outfile.write("\n".join(urls_movies))
with open(sys.argv[3], "w") as outfile:
    outfile.write("\n".join(urls_series))
--------------------------------------------------------------------------------
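A quick way to spot-check one page of the A-Z movie collection that pullit_atoz.py walks (a sketch, not part of the repo; assumes curl and jq are installed, and reads the same encodedFamilyId field the script uses):

```
curl -s "https://disney.content.edge.bamgrid.com/svc/content/CuratedSet/version/5.1/region/US/audience/k-false,l-true/maturity/1830/language/en/setId/9f7c38e5-41c3-47b4-b99e-b5b3d2eb95d4/pageSize/30/page/1" \
    | jq -r '.data.CuratedSet.items[].family.encodedFamilyId'
```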