├── requirements.txt ├── .gitignore ├── config.py.example ├── README.md └── linkedin_learning.py /requirements.txt: -------------------------------------------------------------------------------- 1 | aiohttp 2 | lxml -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | __pycache__/ 3 | downloads/ 4 | config.py -------------------------------------------------------------------------------- /config.py.example: -------------------------------------------------------------------------------- 1 | # copy this file as `config.py` 2 | 3 | import os 4 | 5 | # EDIT 6 | USERNAME = '...' 7 | PASSWORD = '...' 8 | COURSES = [ 9 | ] 10 | 11 | # EDIT IF YOU NEED TO 12 | BASE_DOWNLOAD_PATH = os.path.join(os.path.dirname(__file__), "downloads") 13 | USE_PROXY = False 14 | PROXY = "http://127.0.0.1:8888" if USE_PROXY else None 15 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # LinkedIn Learning Downloader 2 | 3 | #### Based on [mclmza's linkedin-learning-downloader](https://github.com/mclmza/linkedin-learning-downloader) 4 | 5 | Asynchronous scraping tool to fetch LinkedIn-learning's courses videos. 6 | 7 | Dependencies: 8 | - Python 3.6 9 | - aiohttp 10 | - lxml 11 | 12 | #### Info 13 | 14 | Please use this script for your own purposes. 15 | 16 | This script was written for educational usage only. 
17 | 18 | Make sure your LinkedIn account is **NOT** protected with 2FA 19 | 20 | #### Usage 21 | > pip install -r requirements.txt 22 | 23 | Copy and edit `config.py.example` (username, password, and courses slugs) 24 | 25 | ```Course's slug can be obtained using its url 26 | e.g: 27 | COURSE URL: https://www.linkedin.com/learning/python-advanced-design-pattern 28 | -> 29 | SLUG: python-advanced-design-pattern 30 | ``` 31 | 32 | > python linkedin_learning.py 33 | 34 | #### TODO 35 | 36 | - ~~Add Subtitles~~ 37 | - Add Description 38 | - Use argparser 39 | - Fetch courses from bookmarks 40 | -------------------------------------------------------------------------------- /linkedin_learning.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import aiohttp 3 | import aiohttp.cookiejar 4 | import lxml.html 5 | import re 6 | import os 7 | import logging 8 | 9 | from itertools import chain, filterfalse, starmap 10 | from collections import namedtuple 11 | from urllib.parse import urljoin 12 | from config import USERNAME, PASSWORD, COURSES, PROXY, BASE_DOWNLOAD_PATH 13 | 14 | logging.basicConfig(level=logging.DEBUG, format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s') 15 | 16 | MAX_DOWNLOADS_SEMAPHORE = asyncio.Semaphore(10) 17 | HEADERS = { 18 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36", 19 | "Accept": "*/*", 20 | } 21 | URL = "https://www.linkedin.com" 22 | LOGIN_URL = f"{URL}/login" 23 | FILE_TYPE_VIDEO = ".mp4" 24 | FILE_TYPE_SUBTITLE = ".srt" 25 | COOKIE_JAR = aiohttp.cookiejar.CookieJar() 26 | EXERCISE_FOLDER_PATH = "exercises" 27 | 28 | Course = namedtuple("Course", ["name", "slug", "description", "unlocked", "chapters", "exercises"]) 29 | Chapter = namedtuple("Chapter", ["name", "videos", "index"]) 30 | Video = namedtuple("Video", ["name", "slug", "index", "filename"]) 31 | Exercise = namedtuple("Exercise", 
["name", "url", "course", "index"]) 32 | 33 | 34 | def sub_format_time(ms): 35 | seconds, milliseconds = divmod(ms, 1000) 36 | minutes, seconds = divmod(seconds, 60) 37 | hours, minutes = divmod(minutes, 60) 38 | return f'{hours:02}:{minutes:02}:{seconds:02},{milliseconds:02}' 39 | 40 | 41 | def clean_dir_name(dir_name): 42 | # Remove starting digit and dot (e.g '1. A' -> 'A') 43 | # Remove bad characters (e.g 'A: B' -> 'A B') 44 | no_digit = re.sub(r'^\d+\.', "", dir_name) 45 | no_bad_chars = re.sub(r'[\\:<>"/|?*]', "", no_digit) 46 | return no_bad_chars.strip() 47 | 48 | 49 | def build_course(course_element: dict): 50 | chapters = [ 51 | Chapter(name=chapter['title'], 52 | videos=[ 53 | Video(name=video['title'], 54 | slug=video['slug'], 55 | index=idx, 56 | filename=f"{str(idx).zfill(2)} - {clean_dir_name(video['title'])}{FILE_TYPE_VIDEO}" 57 | ) 58 | for idx, video in enumerate(chapter['videos'], start=1) 59 | ], 60 | index=idx) 61 | for idx, chapter in enumerate(course_element['chapters'], start=1) 62 | ] 63 | exercises = [ 64 | Exercise(name=exercise['name'], 65 | url=exercise['url'], 66 | course=course_element['title'], 67 | index=idx) 68 | for idx, exercise in enumerate(course_element['exerciseFiles'], start=1) 69 | ] 70 | course = Course(name=course_element['title'], 71 | slug=course_element['slug'], 72 | description=course_element['description'], 73 | unlocked=course_element['fullCourseUnlocked'], 74 | chapters=chapters, 75 | exercises=exercises) 76 | return course 77 | 78 | 79 | def chapter_dir(course: Course, chapter: Chapter): 80 | folder_name = f"{str(chapter.index).zfill(2)} - {clean_dir_name(chapter.name)}" 81 | chapter_path = os.path.join(BASE_DOWNLOAD_PATH, clean_dir_name(course.name), folder_name) 82 | return chapter_path 83 | 84 | 85 | def exercises_dir(exercise: Exercise): 86 | folder_name = EXERCISE_FOLDER_PATH 87 | exercise_path = os.path.join(BASE_DOWNLOAD_PATH, clean_dir_name(exercise.course), folder_name) 88 | return exercise_path 89 | 90 
async def login(username, password):
    """Authenticate against LinkedIn and store session cookies in COOKIE_JAR.

    Also copies the JSESSIONID cookie value into HEADERS as 'Csrf-Token',
    which the learning API requires on every subsequent request.

    Raises:
        RuntimeError: when the 'li_at' auth cookie is missing after the POST,
            i.e. the credentials were rejected (or 2FA intervened).
    """
    async with aiohttp.ClientSession(headers=HEADERS, cookie_jar=COOKIE_JAR) as session:
        logging.info("[*] Login step 1 - Getting CSRF token...")
        resp = await session.get(LOGIN_URL, proxy=PROXY)
        body = await resp.text()

        # The login form embeds a one-time CSRF token that must be echoed back.
        html = lxml.html.fromstring(body)
        csrf = html.xpath("//input[@name='loginCsrfParam']/@value").pop()
        logging.debug(f"[*] CSRF: {csrf}")
        data = {
            "session_key": username,
            "session_password": password,
            "loginCsrfParam": csrf,
            "isJsEnabled": False
        }
        logging.info("[*] Login step 1 - Done")
        logging.info("[*] Login step 2 - Logging In...")
        await session.post(urljoin(URL, 'uas/login-submit'), proxy=PROXY, data=data)

        # 'li_at' is only set on a successful login.
        if not next((x.value for x in session.cookie_jar if x.key.lower() == 'li_at'), False):
            raise RuntimeError("[!] Could not login. Please check your credentials")

        HEADERS['Csrf-Token'] = next(x.value for x in session.cookie_jar if x.key.lower() == 'jsessionid')
        logging.info("[*] Login step 2 - Done")


async def fetch_courses():
    """Fetch every course listed in config.COURSES concurrently."""
    return await asyncio.gather(*map(fetch_course, COURSES))


async def fetch_course(course_slug):
    """Resolve one course slug via the learning API and download its content."""
    # NOTE(review): the '??' below looks like a typo, but it may be load-
    # bearing: the malformed '?fields' name is apparently ignored server-side
    # and the full course object (incl. title/chapters) is returned. A "fixed"
    # single '?' would request only the listed fields — confirm before changing.
    url = f"{URL}/learning-api/detailedCourses??fields=fullCourseUnlocked,releasedOn,exerciseFileUrls,exerciseFiles&" \
          f"addParagraphsToTranscript=true&courseSlug={course_slug}&q=slugs"

    async with aiohttp.ClientSession(headers=HEADERS, cookie_jar=COOKIE_JAR) as session:
        resp = await session.get(url, proxy=PROXY, headers=HEADERS)
        data = await resp.json()
        course = build_course(data['elements'][0])

        logging.info(f'[*] Fetching course {course.name}')

        await fetch_chapters(course)
        await fetch_exercises(course)
        logging.info(f'[*] Finished fetching course "{course.name}"')


async def fetch_chapters(course: Course):
    """Create all chapter directories, then download every video concurrently."""
    for chapter in course.chapters:
        # exist_ok avoids the check-then-create race the old
        # filterfalse(os.path.exists, ...) loop had.
        os.makedirs(chapter_dir(course, chapter), exist_ok=True)

    await asyncio.gather(*chain.from_iterable(fetch_chapter(course, chapter) for chapter in course.chapters))


async def fetch_exercises(course: Course):
    """Download the course's exercise archives, if it has any."""
    if not course.exercises:
        return

    # All exercises of a course share one target directory.
    os.makedirs(exercises_dir(course.exercises[0]), exist_ok=True)

    return await asyncio.gather(*map(fetch_zip_or_wait, course.exercises))


def fetch_chapter(course: Course, chapter: Chapter):
    """Lazily yield one download coroutine per video in the chapter."""
    return (
        fetch_video_or_wait(course, chapter, video)
        for video in chapter.videos
    )


async def fetch_video_or_wait(course: Course, chapter: Chapter, video: Video):
    """Download a video once a slot in the global download semaphore frees up."""
    async with MAX_DOWNLOADS_SEMAPHORE:
        await fetch_video(course, chapter, video)


async def fetch_zip_or_wait(exercise: Exercise):
    """Download an exercise archive once a download slot frees up."""
    async with MAX_DOWNLOADS_SEMAPHORE:
        await fetch_zip(exercise)


async def fetch_video(course: Course, chapter: Chapter, video: Video):
    """Download one video plus its subtitles, skipping files already on disk."""
    subtitles_filename = os.path.splitext(video.filename)[0] + FILE_TYPE_SUBTITLE
    video_file_path = os.path.join(chapter_dir(course, chapter), video.filename)
    subtitle_file_path = os.path.join(chapter_dir(course, chapter), subtitles_filename)
    video_exists = os.path.exists(video_file_path)
    subtitle_exists = os.path.exists(subtitle_file_path)
    if video_exists and subtitle_exists:
        return

    logging.info(f"[~] Fetching course '{course.name}' Chapter no. {chapter.index} Video no. {video.index}")
    async with aiohttp.ClientSession(headers=HEADERS, cookie_jar=COOKIE_JAR) as session:
        video_url = f'{URL}/learning-api/detailedCourses?addParagraphsToTranscript=false&courseSlug={course.slug}&' \
                    f'q=slugs&resolution=_720&videoSlug={video.slug}'
        data = None
        tries = 3
        for _ in range(tries):
            try:
                resp = await session.get(video_url, proxy=PROXY, headers=HEADERS)
                # Check the HTTP status BEFORE parsing: an error page is not
                # JSON, and the old order (json() first) raised an unhandled
                # ContentTypeError instead of retrying.
                resp.raise_for_status()
                data = await resp.json()
                break
            except aiohttp.client_exceptions.ClientResponseError:
                pass

        if data is None:
            # All retries failed; the old code crashed on data['elements'].
            logging.error(f"[!] Could not fetch metadata for Video no. {video.index} "
                          f"of Chapter no. {chapter.index}, skipping")
            return

        selected_video = data['elements'][0]['selectedVideo']
        subtitles = selected_video.get('transcript')
        duration_in_ms = int(selected_video['durationInSeconds']) * 1000

        try:
            # Missing when the course is locked for this account.
            download_url = selected_video['url']['progressiveUrl']
        except (KeyError, TypeError):
            # The old code fell through here and saved the API JSON as .mp4.
            logging.error(f"[!] Video no. {video.index} of course '{course.name}' is locked, skipping")
            return

        if not video_exists:
            logging.info(f"[~] Writing {video.filename}")
            await download_file(download_url, video_file_path)

        # Don't rewrite a subtitle file that already exists.
        if subtitles is not None and not subtitle_exists:
            logging.info(f"[~] Writing {subtitles_filename}")
            await write_subtitles(subtitles['lines'], subtitle_file_path, duration_in_ms)

        logging.info(f"[~] Done fetching course '{course.name}' Chapter no. {chapter.index} Video no. {video.index}")


async def fetch_zip(exercise: Exercise):
    """Download one exercise archive, skipping it when already on disk."""
    zip_file_path = os.path.join(exercises_dir(exercise), f"{str(exercise.index).zfill(2)} - {exercise.name}")
    if os.path.exists(zip_file_path):
        return

    logging.info(f"[~] Fetching zip '{exercise.name}' Exercise no. {exercise.index}")
    await download_file(exercise.url, zip_file_path)
    logging.info(f"[~] Done fetching zip '{exercise.name}' Exercise no. {exercise.index}")


async def write_subtitles(subs, output_path, video_duration):
    """Write transcript entries to an SRT file.

    Each cue ends where the next one starts; the last cue ends at
    video_duration. All times are milliseconds.
    """
    def subs_to_lines(idx, sub):
        starts_at = sub['transcriptStartAt']
        # subs[idx] is the NEXT entry because enumerate starts at 1.
        ends_at = subs[idx]['transcriptStartAt'] if idx < len(subs) else video_duration
        caption = sub['caption']
        return f"{idx}\n" \
               f"{sub_format_time(starts_at)} --> {sub_format_time(ends_at)}\n" \
               f"{caption}\n\n"

    with open(output_path, 'wb') as f:
        for line in starmap(subs_to_lines, enumerate(subs, start=1)):
            f.write(line.encode('utf8'))


# Large videos over slow links can legitimately take a long time.
DOWNLOAD_TIMEOUT = aiohttp.ClientTimeout(total=60 * 60)


async def download_file(url, output):
    """Stream `url` to the `output` path in 1 KiB chunks.

    On any error the partial file is removed so a later run retries the
    download instead of treating a truncated file as complete; the error is
    logged but not re-raised, keeping sibling downloads alive.
    """
    async with aiohttp.ClientSession(headers=HEADERS, cookie_jar=COOKIE_JAR) as session:
        async with session.get(url, proxy=PROXY, headers=HEADERS, timeout=DOWNLOAD_TIMEOUT) as r:
            try:
                with open(output, 'wb') as f:
                    while True:
                        chunk = await r.content.read(1024)
                        if not chunk:
                            break
                        f.write(chunk)
            except Exception as e:
                logging.exception(f"[!] Error while downloading: '{e}'")
                if os.path.exists(output):
                    os.remove(output)


async def process():
    """Top-level workflow: login once, then fetch every configured course."""
    try:
        logging.info("[*] -------------Login-------------")
        await login(USERNAME, PASSWORD)
        logging.info("[*] -------------Done-------------")

        logging.info("[*] -------------Fetching Course-------------")
        await fetch_courses()
        logging.info("[*] -------------Done-------------")

    except aiohttp.client_exceptions.ClientProxyConnectionError as e:
        logging.error(f"Proxy Error: {e}")

    except aiohttp.client_exceptions.ClientConnectionError as e:
        logging.error(f"Connection Error: {e}")


if __name__ == "__main__":
    # get_event_loop()/run_until_complete keeps Python 3.6 compatibility
    # (asyncio.run only exists from 3.7).
    loop = asyncio.get_event_loop()
    loop.run_until_complete(process())
    loop.close()