├── requirement.txt
├── download_screencasts.py
├── download_courses.py
├── README.md
├── .gitignore
├── parsing_scrreencasts_to_json.py
└── parsing_courses_to_json.py


/requirement.txt:
--------------------------------------------------------------------------------
1 | beautifulsoup4==4.5.1
2 | lxml==3.6.4
3 | requests==2.12.1
4 | selenium==3.5.0
5 | 


--------------------------------------------------------------------------------
/download_screencasts.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import os
 3 | import urllib
 4 | 
 5 | ROOT = "download/Screencasts/"
 6 | 
 7 | def creatDir(directory):
 8 |     if not os.path.exists(directory):
 9 |         os.makedirs(directory)
10 | 
def downloadAFileWithPath(url, path, filename, ex):
    """Download `url` to ROOT + path + '/' + filename + '.' + ex unless it already exists.

    The data is first written to "<target>.downloading" and renamed to the
    final name only after the transfer has finished, so an interrupted run
    does not leave a partial file under the final name.

    Args:
        url: Address of the file to fetch.
        path: Sub-directory below ROOT that receives the file.
        filename: File name without extension.
        ex: File extension without the leading dot.

    Returns:
        None. A failed download is reported on stdout and the function
        returns normally so the caller can continue with the next file.
    """
    creatDir(ROOT + path)
    pathName = ROOT + path + '/' + filename + '.' + ex
    pathDownLoading = pathName + '.downloading'

    if os.path.isfile(pathName):
        print("already have the file: " + pathName)
        return
    # A leftover temporary file comes from an earlier, unfinished attempt.
    if os.path.isfile(pathDownLoading):
        os.remove(pathDownLoading)
    try:
        urllib.urlretrieve(url, pathDownLoading)
        os.rename(pathDownLoading, pathName)
        print("add path: " + pathName)
    except Exception as error:
        # `except Exception` instead of a bare `except` keeps Ctrl-C and
        # SystemExit working while a download is in progress.
        print("error in downloading %r: %r" % (pathName, error))
28 | 
# Read the screencast manifest (screencasts.json in the current directory).
with open('screencasts.json') as json_file:
    screencasts = json.loads(json_file.read())

# Fetch every entry into the sub-directory named by its "path" value.
for screencast in screencasts:
    MAIN_PATH = screencast['path']
    video_url = screencast['video']
    downloadAFileWithPath(video_url, MAIN_PATH, screencast['name'], "mp4")
35 | 


--------------------------------------------------------------------------------
/download_courses.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import os
 3 | import urllib
 4 | 
 5 | ROOT = "download/Courses/"
 6 | 
 7 | def creatDir(directory):
 8 |     if not os.path.exists(directory):
 9 |         os.makedirs(directory)
10 | 
def downloadAFileWithPath(url, path, filename, ex):
    """Download `url` to ROOT + path + '/' + filename + '.' + ex unless it already exists.

    The data is first written to "<target>.downloading" and renamed to the
    final name only after the transfer has finished, so an interrupted run
    does not leave a partial file under the final name.

    Args:
        url: Address of the file to fetch.
        path: Sub-directory below ROOT that receives the file.
        filename: File name without extension.
        ex: File extension without the leading dot.

    Returns:
        None. A failed download is reported on stdout and the function
        returns normally so the caller can continue with the next file.
    """
    creatDir(ROOT + path)
    pathName = ROOT + path + '/' + filename + '.' + ex
    pathDownLoading = pathName + '.downloading'

    if os.path.isfile(pathName):
        print("already have the file: " + pathName)
        return
    # A leftover temporary file comes from an earlier, unfinished attempt.
    if os.path.isfile(pathDownLoading):
        os.remove(pathDownLoading)
    try:
        urllib.urlretrieve(url, pathDownLoading)
        os.rename(pathDownLoading, pathName)
        print("add path: " + pathName)
    except Exception as error:
        # `except Exception` instead of a bare `except` keeps Ctrl-C and
        # SystemExit working while a download is in progress.
        print("error in downloading %r: %r" % (pathName, error))
28 | 
# Load the course manifest (courses.json in the current directory).
with open('courses.json') as json_file:
    courses = json.load(json_file)

# Target layout below ROOT: <path>/<course name>/<level name>/<video name>.mp4
for course in courses:
    MAIN_PATH = course['path']
    COURSE_NAME = course['name']
    for level in course['levels']:
        FOLDER_PATH = level['name']
        PATH = MAIN_PATH + "/" + COURSE_NAME + "/" + FOLDER_PATH
        for video in level['videos']:
            downloadAFileWithPath(video['url'], PATH, video['name'], "mp4")
41 | 
42 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # CodeSchool-Downloader
 2 | This is a set of Python scripts that helps you download all videos from [Code School](https://www.codeschool.com/).
 3 | 
 4 | If you have any questions, feel free to contact me at `sudo.liyang [at] gmail.com`.
 5 | 
 6 | # Donations
 7 | Bitcoin address: `1N5EoxpR9w5mdG7WBbThA96hx6bu6QoNbh`
 8 | 
 9 | ## Demo 
10 | 
11 | [![See video](http://i.imgur.com/bLdUpUq.png)](https://www.youtube.com/watch?v=TqvQNQtWhdU)
12 | 
13 | # Requirements
14 | 
15 | *   [Code School Membership](https://www.codeschool.com/pricing) 
16 | *   Python 2.7
17 | *   [pip](https://pypi.python.org/pypi/pip) 
18 | *   Firefox
19 | 
20 | 
21 | # Usage 
22 | 
23 | ### Prepare 
24 | Install the dependencies:
25 | 
26 |         $ pip install -r requirement.txt
27 | 
28 | Configure your credentials: edit parsing_courses_to_json.py or parsing_scrreencasts_to_json.py and set your username and password near the top of the file.
29 | ```python
30 |         Username = "your_username"
31 |         Password = "your_password"
32 | ```
33 | 
34 | ### Parsing
35 | Parse the direct video URLs of the courses or screencasts into JSON.
36 | Each script opens Firefox and collects the video URLs automatically; when it finishes, it writes courses.json or screencasts.json to the current directory.
37 | 
38 |         $ python parsing_courses_to_json.py
39 |         $ python parsing_scrreencasts_to_json.py
40 | 
41 | 
42 | ### Downloading
43 | When parsing has finished, run download_courses.py or download_screencasts.py to download the videos.
44 | 
45 |         $ python download_courses.py
46 |         $ python download_screencasts.py
47 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | courses.json
  2 | geckodriver.log
  3 | download/
  4 | # Created by https://www.gitignore.io/api/python
  5 | 
  6 | ### Python ###
  7 | # Byte-compiled / optimized / DLL files
  8 | __pycache__/
  9 | *.py[cod]
 10 | *$py.class
 11 | 
 12 | # C extensions
 13 | *.so
 14 | 
 15 | # Distribution / packaging
 16 | .Python
 17 | build/
 18 | develop-eggs/
 19 | dist/
 20 | downloads/
 21 | eggs/
 22 | .eggs/
 23 | lib/
 24 | lib64/
 25 | parts/
 26 | sdist/
 27 | var/
 28 | wheels/
 29 | *.egg-info/
 30 | .installed.cfg
 31 | *.egg
 32 | 
 33 | # PyInstaller
 34 | #  Usually these files are written by a python script from a template
 35 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 36 | *.manifest
 37 | *.spec
 38 | 
 39 | # Installer logs
 40 | pip-log.txt
 41 | pip-delete-this-directory.txt
 42 | 
 43 | # Unit test / coverage reports
 44 | htmlcov/
 45 | .tox/
 46 | .coverage
 47 | .coverage.*
 48 | .cache
 49 | nosetests.xml
 50 | coverage.xml
 51 | *.cover
 52 | .hypothesis/
 53 | 
 54 | # Translations
 55 | *.mo
 56 | *.pot
 57 | 
 58 | # Django stuff:
 59 | *.log
 60 | local_settings.py
 61 | 
 62 | # Flask stuff:
 63 | instance/
 64 | .webassets-cache
 65 | 
 66 | # Scrapy stuff:
 67 | .scrapy
 68 | 
 69 | # Sphinx documentation
 70 | docs/_build/
 71 | 
 72 | # PyBuilder
 73 | target/
 74 | 
 75 | # Jupyter Notebook
 76 | .ipynb_checkpoints
 77 | 
 78 | # pyenv
 79 | .python-version
 80 | 
 81 | # celery beat schedule file
 82 | celerybeat-schedule
 83 | 
 84 | # SageMath parsed files
 85 | *.sage.py
 86 | 
 87 | # Environments
 88 | .env
 89 | .venv
 90 | env/
 91 | venv/
 92 | ENV/
 93 | env.bak/
 94 | venv.bak/
 95 | 
 96 | # Spyder project settings
 97 | .spyderproject
 98 | .spyproject
 99 | 
100 | # Rope project settings
101 | .ropeproject
102 | 
103 | # mkdocs documentation
104 | /site
105 | 
106 | # mypy
107 | .mypy_cache/
108 | 
109 | # End of https://www.gitignore.io/api/python
110 | 


--------------------------------------------------------------------------------
/parsing_scrreencasts_to_json.py:
--------------------------------------------------------------------------------
  1 | import requests
  2 | import json
  3 | from bs4 import BeautifulSoup
  4 | from selenium import webdriver
  5 | from selenium.webdriver.common.keys import Keys
  6 | import os
  7 | import urllib
  8 | from time import sleep
  9 | import atexit
 10 | 
 11 | Username = ""
 12 | Password = ""
 13 | browser = webdriver.Firefox()
 14 | 
 15 | def saveToJSON():
 16 |     global browser
 17 |     prettyScreencast = json.dumps(Screencasts, sort_keys=True, indent=2, separators=(',', ': '))
 18 |     with open("screencasts.json", "w") as json_file:
 19 |         json_file.write(prettyScreencast)
 20 | 
 21 | def cleanPathName(name):
 22 |     if name == "HTML/CSS":
 23 |         name = "HTML&CSS"
 24 |     elif name == ".NET":
 25 |         name = "dot NET"
 26 |     return name
 27 | 
 28 | atexit.register(saveToJSON)
 29 | 
 30 | def sign_in():
 31 |     global browser,Username,Password
 32 |     sign_in_url = "http://www.codeschool.com/users/sign_in"
 33 |     browser.get(sign_in_url)
 34 |     browser.find_element_by_id("user_login").clear()
 35 |     browser.find_element_by_id("user_login").send_keys(Username)
 36 |     browser.find_element_by_id("user_password").clear()
 37 |     browser.find_element_by_id("user_password").send_keys(Password)
 38 |     browser.find_element_by_xpath("//div[@id='sign-in-form']/form/div/div/button").click()
 39 | 
 40 | def parsePageScreenCastLinks():
 41 |     html = browser.page_source
 42 |     soup = BeautifulSoup(html, 'lxml')
 43 |     articles = soup.select("a.db.has-play")
 44 |     page_links = []
 45 |     for article in articles:
 46 |         page_links.append(article["href"])
 47 |     return page_links
 48 | 
 49 | def generateScreenCastsLinks():
 50 |     links = []
 51 |     browser.get("https://www.codeschool.com/screencasts")
 52 |     sleep(5)
 53 |     page_links = parsePageScreenCastLinks()
 54 |     links.extend(page_links)
 55 |     changePage = '''
 56 |     function changePage(page_number){
 57 |         var links = document.querySelectorAll("a.video-page-link")
 58 |         for(var i = links.length - 1; i > -1; i--){
 59 |             var link = links[i]
 60 |             if(link.dataset.page == page_number){
 61 |                 console.log(link)
 62 |                 link.click()
 63 |                 break
 64 |             }
 65 |         }
 66 |     };
 67 |     '''
 68 |     for i in range(2,10):
 69 |         browser.execute_script(changePage + "changePage(%s)" % i)
 70 |         sleep(10)
 71 |         page_links = parsePageScreenCastLinks()
 72 |         links.extend(page_links)
 73 | 
 74 |     # remove duplicates
 75 |     links = set(links)
 76 |     return links
 77 | 
 78 | def getVideoDirectURL(url):
 79 |     global browser
 80 | 
 81 |     isException = True
 82 |     reTryCount = 0
 83 | 
 84 |     while(isException and reTryCount < 3):
 85 |         try:
 86 |             browser.get(url)
 87 |             html  = browser.page_source
 88 |             soup = BeautifulSoup(html, 'lxml')
 89 |             url =  soup.select_one("video")["src"]
 90 |             path =  soup.select_one(".tag--heading").text
 91 |             name =  soup.select_one(".tci").text
 92 |             isException = False
 93 |         except KeyError:
 94 |             print "KeyError"
 95 |             sleep(2)
 96 |             reTryCount += 1
 97 | 
 98 |     return name, path, url
 99 | 
100 | 
# Log in first, then gather the relative links of all screencast pages.
sign_in()
Links = generateScreenCastsLinks()

# One dict per screencast; saveToJSON() dumps this list when the script exits.
Screencasts = []
for index, link in enumerate(Links):
    print("%s %s" % (index, link))
    page_url = "https://www.codeschool.com" + link
    name, path, video = getVideoDirectURL(page_url)
    screencast = {
        "name": name,
        "url": page_url,
        "video": video,
        "path": cleanPathName(path),
    }
    print(name)
    Screencasts.append(screencast)
116 | 
117 | 


--------------------------------------------------------------------------------
/parsing_courses_to_json.py:
--------------------------------------------------------------------------------
  1 | import requests
  2 | import json
  3 | from bs4 import BeautifulSoup
  4 | from selenium import webdriver
  5 | import os
  6 | import urllib
  7 | from time import sleep
  8 | import atexit
  9 | from selenium.webdriver.common.keys import Keys
 10 | 
 11 | Username = ""
 12 | Password = ""
 13 | browser = webdriver.Firefox()
 14 | 
 15 | def saveToJSON():
 16 |     global courses
 17 |     prettyCourses = json.dumps(courses, sort_keys=True, indent=2, separators=(',', ': '))
 18 |     with open("courses.json", "w") as json_file:
 19 |         json_file.write(prettyCourses)
 20 | 
 21 | atexit.register(saveToJSON)
 22 | 
 23 | def sign_in():
 24 |     global browser,Username,Password
 25 |     sign_in_url = "http://www.codeschool.com/users/sign_in"
 26 |     browser.get(sign_in_url)
 27 |     browser.find_element_by_id("user_login").clear()
 28 |     browser.find_element_by_id("user_login").send_keys(Username)
 29 |     browser.find_element_by_id("user_password").clear()
 30 |     browser.find_element_by_id("user_password").send_keys(Password)
 31 |     browser.find_element_by_xpath("//div[@id='sign-in-form']/form/div/div/button").click()
 32 | 
 33 | def LinkGenerator():
 34 |     response = requests.get('https://www.codeschool.com/courses/')
 35 |     soup = BeautifulSoup(response.text,'lxml')
 36 |     list = []
 37 |     for item in soup.findAll('a','course-title-link'):
 38 |         list.append(item['href'])
 39 |     return list
 40 | 
 41 | def cleanPathName(name):
 42 |     if name == "HTML/CSS":
 43 |         name = "HTML&CSS"
 44 |     elif name == ".NET":
 45 |         name = "dot NET"
 46 |     return name
 47 | 
 48 | def readACourse(link):
 49 |     global browser
 50 |     course = {
 51 |         "name": "",
 52 |         "path": "",
 53 |         "url" : "",
 54 |         "levels": []
 55 |     }
 56 |     course_link = "https://www.codeschool.com" + link + "/videos"
 57 |     browser.get(course_link)
 58 |     html = browser.page_source
 59 |     soup = BeautifulSoup(html, 'lxml')
 60 | 
 61 | 
 62 |     course_name = soup.find('h1',{'class','courseBanner-title'}).text
 63 |     course_path = soup.find('p',{'class':'mbf tss ttu'}).find('a').text
 64 |     course_path = cleanPathName(course_path)
 65 | 
 66 |     course['name'] = course_name
 67 |     course['path'] = course_path
 68 |     course['url'] = course_link
 69 | 
 70 |     print course_path
 71 |     print course_name
 72 | 
 73 |     ls = soup.select("div.level")
 74 | 
 75 |     levels = []
 76 |     for l in ls:
 77 |         level_name = l.select_one("p.tss.level-title strong").text
 78 |         videos = l.select("li.list-item.video-title")
 79 |         level = {
 80 |             "name":"",
 81 |             "videos":[]
 82 |         }
 83 |         level["name"] = level_name
 84 |         print "  " + level_name
 85 |         for v in videos:
 86 |             video = {
 87 |                 "name":"",
 88 |                 "url":""
 89 |             }
 90 |             video_title = v.select_one("strong.tct").text
 91 |             click_url = v.select_one("a.bdrn.js-level-open")["href"]
 92 | 
 93 |             print "   " + video_title
 94 |             direct_url = clickTitle(click_url)
 95 |             video["name"] = video_title
 96 |             video["url"] = direct_url
 97 |             level["videos"].append(video)
 98 | 
 99 |             # print direct_url
100 |         course["levels"].append(level)
101 |     return course
102 | 
def readVideoDirectURL():
    """Return the `src` attribute of the first <video> element in the browser's current page source."""
    soup = BeautifulSoup(browser.page_source, 'lxml')
    video_tag = soup.find('video')
    return video_tag['src']
109 | 
def clickTitle(href):
    """Click the anchor with the given href, read the video URL, then send ESCAPE.

    Args:
        href: Value of the href attribute of the anchor to click on the
            page the browser currently shows.

    Returns:
        The value returned by readVideoDirectURL() two seconds after the
        click.
    """
    # Pass the selector as a script argument instead of splicing it into the
    # JavaScript source, so characters in href cannot break the script text.
    selector = "a[href='%s']" % href
    browser.execute_script("document.querySelector(arguments[0]).click()", selector)
    sleep(2)  # fixed pause before the page source is read
    URL = readVideoDirectURL()
    # NOTE(review): ESCAPE presumably closes the video overlay - confirm.
    browser.find_element_by_tag_name("body").send_keys(Keys.ESCAPE)
    sleep(2)
    return URL
118 | 
# Fetch the course list, then log in with the browser.
Links = LinkGenerator()
sign_in()
print("Signed !")

# One dict per course; saveToJSON() dumps this list when the script exits.
courses = []
for index, link in enumerate(Links):
    print("%s  https://www.codeschool.com%s/videos" % (index, link))
    course = readACourse(link)
    courses.append(course)
128 | 


--------------------------------------------------------------------------------