├── requirements.txt ├── .gitignore ├── config.py.example ├── README.md └── linkedin_learning.py /requirements.txt: -------------------------------------------------------------------------------- 1 | aiohttp 2 | lxml -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | __pycache__/ 3 | downloads/ 4 | config.py -------------------------------------------------------------------------------- /config.py.example: -------------------------------------------------------------------------------- 1 | # copy this file as `config.py` 2 | 3 | import os 4 | 5 | # EDIT 6 | USERNAME = '...' 7 | PASSWORD = '...' 8 | COURSES = [ 9 | ] 10 | 11 | # EDIT IF YOU NEED TO 12 | BASE_DOWNLOAD_PATH = os.path.join(os.path.dirname(__file__), "downloads") 13 | USE_PROXY = False 14 | PROXY = "http://127.0.0.1:8888" if USE_PROXY else None 15 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # LinkedIn Learning Downloader 2 | 3 | #### Based on [mclmza's linkedin-learning-downloader](https://github.com/mclmza/linkedin-learning-downloader) 4 | 5 | Asynchronous scraping tool to fetch LinkedIn-learning's courses videos. 6 | 7 | Dependencies: 8 | - Python 3.6 9 | - aiohttp 10 | - lxml 11 | 12 | #### Info 13 | 14 | Please use this script for your own purposes. 15 | 16 | This script was written for educational usage only. 
17 | 18 | Make sure your LinkedIn account is **NOT** protected with 2FA 19 | 20 | #### Usage 21 | > pip install -r requirements.txt 22 | 23 | Copy and edit `config.py.example` (username, password, and courses slugs) 24 | 25 | ```Course's slug can be obtained using its url 26 | e.g: 27 | COURSE URL: https://www.linkedin.com/learning/python-advanced-design-pattern 28 | -> 29 | SLUG: python-advanced-design-pattern 30 | ``` 31 | 32 | > python linkedin_learning.py 33 | 34 | #### TODO 35 | 36 | - ~~Add Subtitles~~ 37 | - Add Description 38 | - Use argparser 39 | - Fetch courses from bookmarks 40 | -------------------------------------------------------------------------------- /linkedin_learning.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import aiohttp 3 | import aiohttp.cookiejar 4 | import lxml.html 5 | import re 6 | import os 7 | import logging 8 | 9 | from itertools import chain, filterfalse, starmap 10 | from collections import namedtuple 11 | from urllib.parse import urljoin 12 | from config import USERNAME, PASSWORD, COURSES, PROXY, BASE_DOWNLOAD_PATH 13 | 14 | logging.basicConfig(level=logging.DEBUG, format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s') 15 | 16 | MAX_DOWNLOADS_SEMAPHORE = asyncio.Semaphore(10) 17 | HEADERS = { 18 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36", 19 | "Accept": "*/*", 20 | } 21 | URL = "https://www.linkedin.com" 22 | LOGIN_URL = f"{URL}/login" 23 | FILE_TYPE_VIDEO = ".mp4" 24 | FILE_TYPE_SUBTITLE = ".srt" 25 | COOKIE_JAR = aiohttp.cookiejar.CookieJar() 26 | EXERCISE_FOLDER_PATH = "exercises" 27 | 28 | Course = namedtuple("Course", ["name", "slug", "description", "unlocked", "chapters", "exercises"]) 29 | Chapter = namedtuple("Chapter", ["name", "videos", "index"]) 30 | Video = namedtuple("Video", ["name", "slug", "index", "filename"]) 31 | Exercise = namedtuple("Exercise", 
["name", "url", "course", "index"]) 32 | 33 | 34 | def sub_format_time(ms): 35 | seconds, milliseconds = divmod(ms, 1000) 36 | minutes, seconds = divmod(seconds, 60) 37 | hours, minutes = divmod(minutes, 60) 38 | return f'{hours:02}:{minutes:02}:{seconds:02},{milliseconds:02}' 39 | 40 | 41 | def clean_dir_name(dir_name): 42 | # Remove starting digit and dot (e.g '1. A' -> 'A') 43 | # Remove bad characters (e.g 'A: B' -> 'A B') 44 | no_digit = re.sub(r'^\d+\.', "", dir_name) 45 | no_bad_chars = re.sub(r'[\\:<>"/|?*]', "", no_digit) 46 | return no_bad_chars.strip() 47 | 48 | 49 | def build_course(course_element: dict): 50 | chapters = [ 51 | Chapter(name=chapter['title'], 52 | videos=[ 53 | Video(name=video['title'], 54 | slug=video['slug'], 55 | index=idx, 56 | filename=f"{str(idx).zfill(2)} - {clean_dir_name(video['title'])}{FILE_TYPE_VIDEO}" 57 | ) 58 | for idx, video in enumerate(chapter['videos'], start=1) 59 | ], 60 | index=idx) 61 | for idx, chapter in enumerate(course_element['chapters'], start=1) 62 | ] 63 | exercises = [ 64 | Exercise(name=exercise['name'], 65 | url=exercise['url'], 66 | course=course_element['title'], 67 | index=idx) 68 | for idx, exercise in enumerate(course_element['exerciseFiles'], start=1) 69 | ] 70 | course = Course(name=course_element['title'], 71 | slug=course_element['slug'], 72 | description=course_element['description'], 73 | unlocked=course_element['fullCourseUnlocked'], 74 | chapters=chapters, 75 | exercises=exercises) 76 | return course 77 | 78 | 79 | def chapter_dir(course: Course, chapter: Chapter): 80 | folder_name = f"{str(chapter.index).zfill(2)} - {clean_dir_name(chapter.name)}" 81 | chapter_path = os.path.join(BASE_DOWNLOAD_PATH, clean_dir_name(course.name), folder_name) 82 | return chapter_path 83 | 84 | 85 | def exercises_dir(exercise: Exercise): 86 | folder_name = EXERCISE_FOLDER_PATH 87 | exercise_path = os.path.join(BASE_DOWNLOAD_PATH, clean_dir_name(exercise.course), folder_name) 88 | return exercise_path 89 | 90 
async def login(username, password):
    """Authenticate against LinkedIn and store session cookies in COOKIE_JAR.

    Also copies the JSESSIONID cookie value into HEADERS as 'Csrf-Token',
    which the learning API requires on every subsequent request.

    Raises:
        RuntimeError: when the 'li_at' auth cookie is missing after the POST,
            i.e. the credentials were rejected (or 2FA intervened).
    """
    async with aiohttp.ClientSession(headers=HEADERS, cookie_jar=COOKIE_JAR) as session:
        logging.info("[*] Login step 1 - Getting CSRF token...")
        resp = await session.get(LOGIN_URL, proxy=PROXY)
        body = await resp.text()

        # The login form embeds a one-time CSRF token that must be echoed back.
        html = lxml.html.fromstring(body)
        csrf = html.xpath("//input[@name='loginCsrfParam']/@value").pop()
        logging.debug(f"[*] CSRF: {csrf}")
        data = {
            "session_key": username,
            "session_password": password,
            "loginCsrfParam": csrf,
            "isJsEnabled": False
        }
        logging.info("[*] Login step 1 - Done")
        logging.info("[*] Login step 2 - Logging In...")
        await session.post(urljoin(URL, 'uas/login-submit'), proxy=PROXY, data=data)

        # 'li_at' is only set on a successful login.
        if not next((x.value for x in session.cookie_jar if x.key.lower() == 'li_at'), False):
            raise RuntimeError("[!] Could not login. Please check your credentials")

        HEADERS['Csrf-Token'] = next(x.value for x in session.cookie_jar if x.key.lower() == 'jsessionid')
        logging.info("[*] Login step 2 - Done")


async def fetch_courses():
    """Fetch every course listed in config.COURSES concurrently."""
    return await asyncio.gather(*map(fetch_course, COURSES))


async def fetch_course(course_slug):
    """Resolve one course slug via the learning API and download its content."""
    # NOTE(review): the '??' below looks like a typo, but it may be load-
    # bearing: the malformed '?fields' name is apparently ignored server-side
    # and the full course object (incl. title/chapters) is returned. A "fixed"
    # single '?' would request only the listed fields — confirm before changing.
    url = f"{URL}/learning-api/detailedCourses??fields=fullCourseUnlocked,releasedOn,exerciseFileUrls,exerciseFiles&" \
          f"addParagraphsToTranscript=true&courseSlug={course_slug}&q=slugs"

    async with aiohttp.ClientSession(headers=HEADERS, cookie_jar=COOKIE_JAR) as session:
        resp = await session.get(url, proxy=PROXY, headers=HEADERS)
        data = await resp.json()
        course = build_course(data['elements'][0])

        logging.info(f'[*] Fetching course {course.name}')

        await fetch_chapters(course)
        await fetch_exercises(course)
        logging.info(f'[*] Finished fetching course "{course.name}"')


async def fetch_chapters(course: Course):
    """Create all chapter directories, then download every video concurrently."""
    for chapter in course.chapters:
        # exist_ok avoids the check-then-create race the old
        # filterfalse(os.path.exists, ...) loop had.
        os.makedirs(chapter_dir(course, chapter), exist_ok=True)

    await asyncio.gather(*chain.from_iterable(fetch_chapter(course, chapter) for chapter in course.chapters))


async def fetch_exercises(course: Course):
    """Download the course's exercise archives, if it has any."""
    if not course.exercises:
        return

    # All exercises of a course share one target directory.
    os.makedirs(exercises_dir(course.exercises[0]), exist_ok=True)

    return await asyncio.gather(*map(fetch_zip_or_wait, course.exercises))


def fetch_chapter(course: Course, chapter: Chapter):
    """Lazily yield one download coroutine per video in the chapter."""
    return (
        fetch_video_or_wait(course, chapter, video)
        for video in chapter.videos
    )


async def fetch_video_or_wait(course: Course, chapter: Chapter, video: Video):
    """Download a video once a slot in the global download semaphore frees up."""
    async with MAX_DOWNLOADS_SEMAPHORE:
        await fetch_video(course, chapter, video)


async def fetch_zip_or_wait(exercise: Exercise):
    """Download an exercise archive once a download slot frees up."""
    async with MAX_DOWNLOADS_SEMAPHORE:
        await fetch_zip(exercise)


async def fetch_video(course: Course, chapter: Chapter, video: Video):
    """Download one video plus its subtitles, skipping files already on disk."""
    subtitles_filename = os.path.splitext(video.filename)[0] + FILE_TYPE_SUBTITLE
    video_file_path = os.path.join(chapter_dir(course, chapter), video.filename)
    subtitle_file_path = os.path.join(chapter_dir(course, chapter), subtitles_filename)
    video_exists = os.path.exists(video_file_path)
    subtitle_exists = os.path.exists(subtitle_file_path)
    if video_exists and subtitle_exists:
        return

    logging.info(f"[~] Fetching course '{course.name}' Chapter no. {chapter.index} Video no. {video.index}")
    async with aiohttp.ClientSession(headers=HEADERS, cookie_jar=COOKIE_JAR) as session:
        video_url = f'{URL}/learning-api/detailedCourses?addParagraphsToTranscript=false&courseSlug={course.slug}&' \
                    f'q=slugs&resolution=_720&videoSlug={video.slug}'
        data = None
        tries = 3
        for _ in range(tries):
            try:
                resp = await session.get(video_url, proxy=PROXY, headers=HEADERS)
                # Check the HTTP status BEFORE parsing: an error page is not
                # JSON, and the old order (json() first) raised an unhandled
                # ContentTypeError instead of retrying.
                resp.raise_for_status()
                data = await resp.json()
                break
            except aiohttp.client_exceptions.ClientResponseError:
                pass

        if data is None:
            # All retries failed; the old code crashed on data['elements'].
            logging.error(f"[!] Could not fetch metadata for Video no. {video.index} "
                          f"of Chapter no. {chapter.index}, skipping")
            return

        selected_video = data['elements'][0]['selectedVideo']
        subtitles = selected_video.get('transcript')
        duration_in_ms = int(selected_video['durationInSeconds']) * 1000

        try:
            # Missing when the course is locked for this account.
            download_url = selected_video['url']['progressiveUrl']
        except (KeyError, TypeError):
            # The old code fell through here and saved the API JSON as .mp4.
            logging.error(f"[!] Video no. {video.index} of course '{course.name}' is locked, skipping")
            return

        if not video_exists:
            logging.info(f"[~] Writing {video.filename}")
            await download_file(download_url, video_file_path)

        # Don't rewrite a subtitle file that already exists.
        if subtitles is not None and not subtitle_exists:
            logging.info(f"[~] Writing {subtitles_filename}")
            await write_subtitles(subtitles['lines'], subtitle_file_path, duration_in_ms)

        logging.info(f"[~] Done fetching course '{course.name}' Chapter no. {chapter.index} Video no. {video.index}")


async def fetch_zip(exercise: Exercise):
    """Download one exercise archive, skipping it when already on disk."""
    zip_file_path = os.path.join(exercises_dir(exercise), f"{str(exercise.index).zfill(2)} - {exercise.name}")
    if os.path.exists(zip_file_path):
        return

    logging.info(f"[~] Fetching zip '{exercise.name}' Exercise no. {exercise.index}")
    await download_file(exercise.url, zip_file_path)
    logging.info(f"[~] Done fetching zip '{exercise.name}' Exercise no. {exercise.index}")


async def write_subtitles(subs, output_path, video_duration):
    """Write transcript entries to an SRT file.

    Each cue ends where the next one starts; the last cue ends at
    video_duration. All times are milliseconds.
    """
    def subs_to_lines(idx, sub):
        starts_at = sub['transcriptStartAt']
        # subs[idx] is the NEXT entry because enumerate starts at 1.
        ends_at = subs[idx]['transcriptStartAt'] if idx < len(subs) else video_duration
        caption = sub['caption']
        return f"{idx}\n" \
               f"{sub_format_time(starts_at)} --> {sub_format_time(ends_at)}\n" \
               f"{caption}\n\n"

    with open(output_path, 'wb') as f:
        for line in starmap(subs_to_lines, enumerate(subs, start=1)):
            f.write(line.encode('utf8'))


# Large videos over slow links can legitimately take a long time.
DOWNLOAD_TIMEOUT = aiohttp.ClientTimeout(total=60 * 60)


async def download_file(url, output):
    """Stream `url` to the `output` path in 1 KiB chunks.

    On any error the partial file is removed so a later run retries the
    download instead of treating a truncated file as complete; the error is
    logged but not re-raised, keeping sibling downloads alive.
    """
    async with aiohttp.ClientSession(headers=HEADERS, cookie_jar=COOKIE_JAR) as session:
        async with session.get(url, proxy=PROXY, headers=HEADERS, timeout=DOWNLOAD_TIMEOUT) as r:
            try:
                with open(output, 'wb') as f:
                    while True:
                        chunk = await r.content.read(1024)
                        if not chunk:
                            break
                        f.write(chunk)
            except Exception as e:
                logging.exception(f"[!] Error while downloading: '{e}'")
                if os.path.exists(output):
                    os.remove(output)


async def process():
    """Top-level workflow: login once, then fetch every configured course."""
    try:
        logging.info("[*] -------------Login-------------")
        await login(USERNAME, PASSWORD)
        logging.info("[*] -------------Done-------------")

        logging.info("[*] -------------Fetching Course-------------")
        await fetch_courses()
        logging.info("[*] -------------Done-------------")

    except aiohttp.client_exceptions.ClientProxyConnectionError as e:
        logging.error(f"Proxy Error: {e}")

    except aiohttp.client_exceptions.ClientConnectionError as e:
        logging.error(f"Connection Error: {e}")


if __name__ == "__main__":
    # get_event_loop()/run_until_complete keeps Python 3.6 compatibility
    # (asyncio.run only exists from 3.7).
    loop = asyncio.get_event_loop()
    loop.run_until_complete(process())
    loop.close()