├── requirement.txt ├── download_screencasts.py ├── download_courses.py ├── README.md ├── .gitignore ├── parsing_scrreencasts_to_json.py └── parsing_courses_to_json.py /requirement.txt: -------------------------------------------------------------------------------- 1 | beautifulsoup4==4.5.1 2 | lxml==3.6.4 3 | requests==2.12.1 4 | selenium==3.5.0 5 | -------------------------------------------------------------------------------- /download_screencasts.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import urllib 4 | 5 | ROOT = "download/Screencasts/" 6 | 7 | def creatDir(directory): 8 | if not os.path.exists(directory): 9 | os.makedirs(directory) 10 | 11 | def downloadAFileWithPath(url , path ,filename, ex): 12 | creatDir(ROOT + path) 13 | pathName = ROOT + path + '/' + filename + '.' + ex 14 | pathDownLoading = pathName + '.downloading' 15 | 16 | if os.path.isfile(pathName): 17 | print "already have the file: " + pathName 18 | return 19 | if os.path.isfile(pathDownLoading): 20 | os.path.isfile(pathDownLoading) 21 | os.remove(pathDownLoading) 22 | try: 23 | urllib.urlretrieve(url,pathDownLoading) 24 | os.rename(pathDownLoading,pathName) 25 | print "add path: " + pathName 26 | except: 27 | print "error in downloadning" 28 | 29 | with open('screencasts.json') as json_file: 30 | screencasts = json.load(json_file) 31 | 32 | for screencast in screencasts: 33 | MAIN_PATH = screencast['path'] 34 | downloadAFileWithPath(screencast['video'], MAIN_PATH, screencast['name'], "mp4") 35 | -------------------------------------------------------------------------------- /download_courses.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import urllib 4 | 5 | ROOT = "download/Courses/" 6 | 7 | def creatDir(directory): 8 | if not os.path.exists(directory): 9 | os.makedirs(directory) 10 | 11 | def downloadAFileWithPath(url , path ,filename, ex): 12 | creatDir(ROOT + path) 13 | pathName = ROOT + path + '/' + filename + '.' + ex 14 | pathDownLoading = pathName + '.downloading' 15 | 16 | if os.path.isfile(pathName): 17 | print "already have the file: " + pathName 18 | return 19 | if os.path.isfile(pathDownLoading): 20 | os.path.isfile(pathDownLoading) 21 | os.remove(pathDownLoading) 22 | try: 23 | urllib.urlretrieve(url,pathDownLoading) 24 | os.rename(pathDownLoading,pathName) 25 | print "add path: " + pathName 26 | except: 27 | print "error in downloadning" 28 | 29 | with open('courses.json') as json_file: 30 | courses = json.load(json_file) 31 | 32 | for course in courses: 33 | MAIN_PATH = course['path'] 34 | COURSE_NAME = course['name'] 35 | for level in course['levels']: 36 | FOLDER_PATH = level['name'] 37 | PATH = MAIN_PATH + "/" + COURSE_NAME + "/" + FOLDER_PATH 38 | for video in level['videos']: 39 | video['name'] 40 | downloadAFileWithPath(video['url'], PATH, video['name'], "mp4") 41 | 42 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CodeSchool-Downloader 2 | This is a python code that helps you to download all videos from [Code School](https://www.codeschool.com/). 3 | 4 | If you have any question feel free to contact me at `sudo.liyang [at] gmail.com`. 5 | 6 | # Donations 7 | Bitcoin address: `1N5EoxpR9w5mdG7WBbThA96hx6bu6QoNbh` 8 | 9 | ## Demo 10 | 11 | [![See video](http://i.imgur.com/bLdUpUq.png)](https://www.youtube.com/watch?v=TqvQNQtWhdU) 12 | 13 | # Requirement 14 | 15 | * [Code School Membership](https://www.codeschool.com/pricing) 16 | * Python 2.7 17 | * [pip](https://pypi.python.org/pypi/pip) 18 | * Firefox 19 | 20 | 21 | # Usage 22 | 23 | ### Prepare 24 | Install dependency 25 | 26 | $ pip install -r requirement.txt 27 | 28 | Configure user and password, edit parsing_courses_to_json.py or parsing_screencasts_to_json.py file, add your Username and Password in the code. 29 | ```python 30 | Username = "your_username" 31 | Password = "your_password" 32 | ``` 33 | 34 | ### Parsing 35 | Parsing courses or screencasts video direct url to json 36 | and it will open Firefox and auto parse videos, when it finished it will output in current path as courses.json or screencasts.json. 37 | 38 | $ python parsing_courses_to_json.py 39 | $ python parsing_screencasts_to_json.py 40 | 41 | 42 | ### Downloading 43 | When you finished parsing, just run download_courses.py or download_screencasts.py to downloading videos 44 | 45 | $ python download_courses.py 46 | $ python download_screencasts.py 47 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | courses.json 2 | geckodriver.log 3 | download/ 4 | # Created by https://www.gitignore.io/api/python 5 | 6 | ### Python ### 7 | # Byte-compiled / optimized / DLL files 8 | __pycache__/ 9 | *.py[cod] 10 | *$py.class 11 | 12 | # C extensions 13 | *.so 14 | 15 | # Distribution / packaging 16 | .Python 17 | build/ 18 | develop-eggs/ 19 | dist/ 20 | downloads/ 21 | eggs/ 22 | .eggs/ 23 | lib/ 24 | lib64/ 25 | parts/ 26 | sdist/ 27 | var/ 28 | wheels/ 29 | *.egg-info/ 30 | .installed.cfg 31 | *.egg 32 | 33 | # PyInstaller 34 | # Usually these files are written by a python script from a template 35 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 36 | *.manifest 37 | *.spec 38 | 39 | # Installer logs 40 | pip-log.txt 41 | pip-delete-this-directory.txt 42 | 43 | # Unit test / coverage reports 44 | htmlcov/ 45 | .tox/ 46 | .coverage 47 | .coverage.* 48 | .cache 49 | nosetests.xml 50 | coverage.xml 51 | *.cover 52 | .hypothesis/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | 62 | # Flask stuff: 63 | instance/ 64 | .webassets-cache 65 | 66 | # Scrapy stuff: 67 | .scrapy 68 | 69 | # Sphinx documentation 70 | docs/_build/ 71 | 72 | # PyBuilder 73 | target/ 74 | 75 | # Jupyter Notebook 76 | .ipynb_checkpoints 77 | 78 | # pyenv 79 | .python-version 80 | 81 | # celery beat schedule file 82 | celerybeat-schedule 83 | 84 | # SageMath parsed files 85 | *.sage.py 86 | 87 | # Environments 88 | .env 89 | .venv 90 | env/ 91 | venv/ 92 | ENV/ 93 | env.bak/ 94 | venv.bak/ 95 | 96 | # Spyder project settings 97 | .spyderproject 98 | .spyproject 99 | 100 | # Rope project settings 101 | .ropeproject 102 | 103 | # mkdocs documentation 104 | /site 105 | 106 | # mypy 107 | .mypy_cache/ 108 | 109 | # End of https://www.gitignore.io/api/python 110 | -------------------------------------------------------------------------------- /parsing_scrreencasts_to_json.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | from bs4 import BeautifulSoup 4 | from selenium import webdriver 5 | from selenium.webdriver.common.keys import Keys 6 | import os 7 | import urllib 8 | from time import sleep 9 | import atexit 10 | 11 | Username = "" 12 | Password = "" 13 | browser = webdriver.Firefox() 14 | 15 | def saveToJSON(): 16 | global browser 17 | prettyScreencast = json.dumps(Screencasts, sort_keys=True, indent=2, separators=(',', ': ')) 18 | with open("screencasts.json", "w") as json_file: 19 | json_file.write(prettyScreencast) 20 | 21 | def cleanPathName(name): 22 | if name == "HTML/CSS": 23 | name = "HTML&CSS" 24 | elif name == ".NET": 25 | name = "dot NET" 26 | return name 27 | 28 | atexit.register(saveToJSON) 29 | 30 | def sign_in(): 31 | global browser,Username,Password 32 | sign_in_url = "http://www.codeschool.com/users/sign_in" 33 | browser.get(sign_in_url) 34 | browser.find_element_by_id("user_login").clear() 35 | browser.find_element_by_id("user_login").send_keys(Username) 36 | browser.find_element_by_id("user_password").clear() 37 | browser.find_element_by_id("user_password").send_keys(Password) 38 | browser.find_element_by_xpath("//div[@id='sign-in-form']/form/div/div/button").click() 39 | 40 | def parsePageScreenCastLinks(): 41 | html = browser.page_source 42 | soup = BeautifulSoup(html, 'lxml') 43 | articles = soup.select("a.db.has-play") 44 | page_links = [] 45 | for article in articles: 46 | page_links.append(article["href"]) 47 | return page_links 48 | 49 | def generateScreenCastsLinks(): 50 | links = [] 51 | browser.get("https://www.codeschool.com/screencasts") 52 | sleep(5) 53 | page_links = parsePageScreenCastLinks() 54 | links.extend(page_links) 55 | changePage = ''' 56 | function changePage(page_number){ 57 | var links = document.querySelectorAll("a.video-page-link") 58 | for(var i = links.length - 1; i > -1; i--){ 59 | var link = links[i] 60 | if(link.dataset.page == page_number){ 61 | console.log(link) 62 | link.click() 63 | break 64 | } 65 | } 66 | }; 67 | ''' 68 | for i in range(2,10): 69 | browser.execute_script(changePage + "changePage(%s)" % i) 70 | sleep(10) 71 | page_links = parsePageScreenCastLinks() 72 | links.extend(page_links) 73 | 74 | # remove duplicates 75 | links = set(links) 76 | return links 77 | 78 | def getVideoDirectURL(url): 79 | global browser 80 | 81 | isException = True 82 | reTryCount = 0 83 | 84 | while(isException and reTryCount < 3): 85 | try: 86 | browser.get(url) 87 | html = browser.page_source 88 | soup = BeautifulSoup(html, 'lxml') 89 | url = soup.select_one("video")["src"] 90 | path = soup.select_one(".tag--heading").text 91 | name = soup.select_one(".tci").text 92 | isException = False 93 | except KeyError: 94 | print "KeyError" 95 | sleep(2) 96 | reTryCount += 1 97 | 98 | return name, path, url 99 | 100 | 101 | sign_in() 102 | Links = generateScreenCastsLinks() 103 | 104 | Screencasts = [] 105 | for index, link in enumerate(Links): 106 | screencast = { "name":"", 107 | "url":"", 108 | "video":"", 109 | "path":""} 110 | print index, link 111 | screencast["url"] = "https://www.codeschool.com" + link 112 | screencast["name"], screencast["path"], screencast["video"] = getVideoDirectURL(screencast["url"]) 113 | screencast["path"] = cleanPathName(screencast["path"]) 114 | print screencast["name"] 115 | Screencasts.append(screencast) 116 | 117 | -------------------------------------------------------------------------------- /parsing_courses_to_json.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | from bs4 import BeautifulSoup 4 | from selenium import webdriver 5 | import os 6 | import urllib 7 | from time import sleep 8 | import atexit 9 | from selenium.webdriver.common.keys import Keys 10 | 11 | Username = "" 12 | Password = "" 13 | browser = webdriver.Firefox() 14 | 15 | def saveToJSON(): 16 | global courses 17 | prettyCourses = json.dumps(courses, sort_keys=True, indent=2, separators=(',', ': ')) 18 | with open("courses.json", "w") as json_file: 19 | json_file.write(prettyCourses) 20 | 21 | atexit.register(saveToJSON) 22 | 23 | def sign_in(): 24 | global browser,Username,Password 25 | sign_in_url = "http://www.codeschool.com/users/sign_in" 26 | browser.get(sign_in_url) 27 | browser.find_element_by_id("user_login").clear() 28 | browser.find_element_by_id("user_login").send_keys(Username) 29 | browser.find_element_by_id("user_password").clear() 30 | browser.find_element_by_id("user_password").send_keys(Password) 31 | browser.find_element_by_xpath("//div[@id='sign-in-form']/form/div/div/button").click() 32 | 33 | def LinkGenerator(): 34 | response = requests.get('https://www.codeschool.com/courses/') 35 | soup = BeautifulSoup(response.text,'lxml') 36 | list = [] 37 | for item in soup.findAll('a','course-title-link'): 38 | list.append(item['href']) 39 | return list 40 | 41 | def cleanPathName(name): 42 | if name == "HTML/CSS": 43 | name = "HTML&CSS" 44 | elif name == ".NET": 45 | name = "dot NET" 46 | return name 47 | 48 | def readACourse(link): 49 | global browser 50 | course = { 51 | "name": "", 52 | "path": "", 53 | "url" : "", 54 | "levels": [] 55 | } 56 | course_link = "https://www.codeschool.com" + link + "/videos" 57 | browser.get(course_link) 58 | html = browser.page_source 59 | soup = BeautifulSoup(html, 'lxml') 60 | 61 | 62 | course_name = soup.find('h1',{'class','courseBanner-title'}).text 63 | course_path = soup.find('p',{'class':'mbf tss ttu'}).find('a').text 64 | course_path = cleanPathName(course_path) 65 | 66 | course['name'] = course_name 67 | course['path'] = course_path 68 | course['url'] = course_link 69 | 70 | print course_path 71 | print course_name 72 | 73 | ls = soup.select("div.level") 74 | 75 | levels = [] 76 | for l in ls: 77 | level_name = l.select_one("p.tss.level-title strong").text 78 | videos = l.select("li.list-item.video-title") 79 | level = { 80 | "name":"", 81 | "videos":[] 82 | } 83 | level["name"] = level_name 84 | print " " + level_name 85 | for v in videos: 86 | video = { 87 | "name":"", 88 | "url":"" 89 | } 90 | video_title = v.select_one("strong.tct").text 91 | click_url = v.select_one("a.bdrn.js-level-open")["href"] 92 | 93 | print " " + video_title 94 | direct_url = clickTitle(click_url) 95 | video["name"] = video_title 96 | video["url"] = direct_url 97 | level["videos"].append(video) 98 | 99 | # print direct_url 100 | course["levels"].append(level) 101 | return course 102 | 103 | def readVideoDirectURL(): 104 | global browser 105 | html = browser.page_source 106 | soup = BeautifulSoup(html, 'lxml') 107 | URL = soup.find('video')['src'] 108 | return URL 109 | 110 | def clickTitle(href): 111 | global browsr 112 | browser.execute_script('''document.querySelector("a[href='%s']").click()''' % href) 113 | sleep(2) 114 | URL = readVideoDirectURL() 115 | browser.find_element_by_tag_name("body").send_keys(Keys.ESCAPE) 116 | sleep(2) 117 | return URL 118 | 119 | Links = LinkGenerator() 120 | sign_in() 121 | print "Signed !" 122 | 123 | courses = [] 124 | for index, link in enumerate(Links): 125 | print str(index) + " " + "https://www.codeschool.com" + link + '/videos' 126 | course = readACourse(link) 127 | courses.append(course) 128 | --------------------------------------------------------------------------------