├── .gitignore ├── AUTHORS ├── LICENSE ├── README.md ├── build_exe.bat ├── dist └── infoq_downloader.exe ├── infoq_downloader.py └── requirements.txt /.gitignore: -------------------------------------------------------------------------------- 1 | build/** 2 | infoq_downloader.spec -------------------------------------------------------------------------------- /AUTHORS: -------------------------------------------------------------------------------- 1 | * Mohammad Tayseer (@mtayseer) 2 | * Mosab Ahmad (@mos3abof) 3 | * Alex Varju (@varju) 4 | * @namuan -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2013 Mohammad Tayseer 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of 6 | this software and associated documentation files (the "Software"), to deal in 7 | the Software without restriction, including without limitation the rights to 8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 9 | the Software, and to permit persons to whom the Software is furnished to do so, 10 | subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 17 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 18 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 19 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 20 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # InfoQ downloader 2 | 3 | [InfoQ](http://www.infoq.com/) is a great resource for many useful sessions. The way they show presentations synced with slides is cool. Unfortunately, I have a slow internet connection, which makes my viewing experience suck. To solve this, I made this script, which downloads the page, video & slides. 4 | 5 | ## Installation 6 | On Windows, just download [this file](dist/infoq_downloader.exe?raw=true) 7 | 8 | On Linux, run the following 9 | 10 | ```sh 11 | git clone https://github.com/mtayseer/infoq-downloader.git 12 | cd infoq-downloader 13 | pip install -r requirements.txt 14 | ``` 15 | 16 | ## Usage 17 | `python infoq_downloader.py http://www.infoq.com/presentations/github-evolution` 18 | 19 | On Windows 20 | `infoq_downloader.exe http://www.infoq.com/presentations/github-evolution` 21 | 22 | ## Features 23 | 1. Console app 24 | 2. Supports download resuming of slides & videos 25 | 3. The generated HTML is clean 26 | 27 | ## License 28 | MIT. 
#!/usr/bin/env python
"""Download an InfoQ presentation (page, slides and video) for offline viewing.

Usage::

    python infoq_downloader.py http://www.infoq.com/presentations/NAME

Everything is stored under ``downloads/TITLE/``: the cleaned-up page as
``index.html``, the slides in ``slides/`` and the video next to the page.
Slides that already exist are skipped, and a partially downloaded video
(``VIDEO.part``) is resumed with an HTTP ``Range`` request.
"""

from __future__ import division, print_function

import argparse
import io
import os
import re
import sys
import unicodedata

# Not referenced directly: lxml's ``cssselect()`` support relies on this
# package being installed.
import cssselect  # noqa: F401
import lxml.html
import requests

if sys.version_info.major == 3:
    text_type = str
else:
    text_type = unicode  # noqa: F821 -- only evaluated on Python 2

# Some settings
download_directory = 'downloads'
cleanup_elements = [
    '#footer', '#header', '#topInfo', '.share_this', '.random_links',
    '.vendor_vs_popular', '.bottomContent', '#id_300x250_banner_top',
    '.presentation_type', '#conference', '#imgPreload', '#text_height_fix_box',
    '.download_presentation', '.recorded', 'script[async]',
    'script[src*=addthis]'
]

# Tell infoq that I'm an iPad, so it gives me simpler HTML to parse & an mp4
# file to download
user_agent = (
    "Mozilla/5.0 (iPad; U; CPU OS 3_2 like Mac OS X; en-us) "
    "AppleWebKit/531.21.10 (KHTML, like Gecko) Version/4.0.4 Mobile/7B334b "
    "Safari/531.21.10"
)

# Slide image paths as they appear (single-quoted) in the page source
_SLIDES_RE = re.compile(r"'(/resource/presentations/[^']*?/en/slides/[^']*?)'")

# Path separators, characters Windows rejects in file names, control chars
_UNSAFE_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|\x00-\x1f]')


def _safe_dirname(title):
    """Turn the page title into a name usable as a single directory.

    The title is NFKD-normalized, surrounding whitespace is stripped and every
    character matched by ``_UNSAFE_FILENAME_CHARS`` is replaced with ``_`` so
    the title can neither create nested directories nor be rejected by
    Windows. Falls back to ``'presentation'`` when nothing usable is left.
    """
    if title is None:
        title = ''
    if not isinstance(title, text_type):
        title = text_type(title)
    name = unicodedata.normalize('NFKD', title).strip()
    # Windows also refuses names ending in a dot or a space
    name = _UNSAFE_FILENAME_CHARS.sub('_', name).rstrip('. ')
    return name or 'presentation'


def _download_slides(slides, slides_dir):
    """Download every slide in *slides* that is not yet in *slides_dir*.

    Each slide is fetched before any file is created and is written to a
    ``.part`` file that is renamed once complete, so a failed or interrupted
    download never leaves a file that a later run would skip as finished.

    Raises ``requests.HTTPError`` if the server answers with an error status.
    """
    total = len(slides)
    for index, slide in enumerate(slides, 1):
        full_path = os.path.join(slides_dir, os.path.split(slide)[1])
        if os.path.exists(full_path):
            continue
        print('\rDownloading slide {0} of {1}'.format(index, total), end='')
        sys.stdout.flush()  # Hack for Python 2
        response = requests.get('http://www.infoq.com{0}'.format(slide))
        response.raise_for_status()
        part_path = '{0}.part'.format(full_path)
        with open(part_path, 'wb') as f:
            f.write(response.content)
        os.rename(part_path, full_path)
    print()


def _download_video(video_url, final_video_name):
    """Download *video_url* to *final_video_name*, resuming a ``.part`` file.

    The data is written to ``final_video_name + '.part'`` and renamed only
    after the download has finished. The server's status code decides how an
    existing partial file is used:

    * 206: the server honoured the ``Range`` header; append to the file.
    * 200: the server ignored ``Range`` and sends everything; start over.
    * 416: the range starts at or past the end of the video; the partial file
      is accepted as complete only if its size matches the total reported in
      ``Content-Range``.

    Raises ``requests.HTTPError`` for any other error status.
    """
    part_file = '{0}.part'.format(final_video_name)
    if os.path.exists(part_file):
        bytes_downloaded = os.stat(part_file).st_size
    else:
        bytes_downloaded = 0

    # stream=True here is important to allow me to iterate over content
    response = requests.get(
        video_url, stream=True,
        headers={'Range': 'bytes={0}-'.format(bytes_downloaded)})
    try:
        if response.status_code == 416:
            # "Content-Range: bytes */TOTAL" carries the real size
            total = response.headers.get('content-range', '').rpartition('/')[2]
            if total == str(bytes_downloaded):
                os.rename(part_file, final_video_name)
                return
            sys.exit('Cannot resume {0}; delete it and try again'.format(
                part_file))
        response.raise_for_status()
        if response.status_code != 206:
            bytes_downloaded = 0

        remaining = response.headers.get('content-length')
        content_length = int(remaining) + bytes_downloaded if remaining else None

        with open(part_file, 'ab' if bytes_downloaded else 'wb') as f:
            for chunk in response.iter_content(10 * 1024):
                f.write(chunk)
                bytes_downloaded += len(chunk)
                if content_length:
                    progress = '{0:.2f}%'.format(
                        bytes_downloaded / content_length * 100)
                else:
                    progress = '{0} bytes'.format(bytes_downloaded)
                # \r returns the cursor to the beginning of the line and
                # end='' suppresses the newline, so the progress is
                # rewritten on a single line.
                print('\rDownloading video {0}'.format(progress), end='')
                sys.stdout.flush()  # Hack for Python 2
    finally:
        response.close()
    print()

    if content_length and bytes_downloaded < content_length:
        # The stream ended early; keep the .part file so the next run resumes
        sys.exit('Video download incomplete; run again to resume')
    os.rename(part_file, final_video_name)


def main():
    """Parse the command line and download the requested presentation."""
    # Set argparse to parse the parameters
    parser = argparse.ArgumentParser(
        description='Download InfoQ presentations.')
    parser.add_argument('url', metavar='URL', type=str,
                        help='URL of the presentation to download')
    args = parser.parse_args()

    # Start downloading
    print('Downloading HTML file')
    response = requests.get(args.url, headers={'User-Agent': user_agent})
    response.raise_for_status()
    html_doc = lxml.html.fromstring(response.content)
    title = html_doc.findtext('.//title')

    # Make the page play the local copy of the video
    sources = html_doc.cssselect('video > source')
    if not sources:
        sys.exit('No video found at {0}'.format(args.url))
    video_url = sources[0].attrib['src']
    video_file = os.path.split(video_url)[1]
    sources[0].attrib['src'] = video_file

    # Clean the page
    for elt in html_doc.cssselect(', '.join(cleanup_elements)):
        elt.getparent().remove(elt)
    wrappers = html_doc.cssselect('#wrapper')
    if wrappers:
        wrappers[0].attrib['style'] = 'background: none'
    content = lxml.html.tostring(html_doc).decode('utf-8')

    # Collect the slides first, then make their links point to local copies
    slides = _SLIDES_RE.findall(content)
    content = re.sub(r"/resource/presentations/[^']*?/en/", '', content)

    # Create downloads/TITLE/slides (makedirs also creates the parents)
    presentation_directory = os.path.join(
        download_directory, _safe_dirname(title))
    slides_dir = os.path.join(presentation_directory, 'slides')
    if not os.path.isdir(slides_dir):
        os.makedirs(slides_dir)

    # Write content
    index_path = os.path.join(presentation_directory, 'index.html')
    with io.open(index_path, 'w', encoding='utf-8') as f:
        f.write(content)

    _download_slides(slides, slides_dir)

    # If the video file is already downloaded successfully, don't do anything
    # else. The finished video lives inside the presentation directory.
    final_video_name = os.path.join(presentation_directory, video_file)
    if os.path.exists(final_video_name):
        print('Video file already exists')
        return

    _download_video(video_url, final_video_name)


if __name__ == '__main__':
    main()