├── .gitignore
├── AUTHORS
├── LICENSE
├── README.md
├── build_exe.bat
├── dist
    └── infoq_downloader.exe
├── infoq_downloader.py
└── requirements.txt


/.gitignore:
--------------------------------------------------------------------------------
1 | build/**
2 | infoq_downloader.spec


--------------------------------------------------------------------------------
/AUTHORS:
--------------------------------------------------------------------------------
1 | * Mohammad Tayseer (@mtayseer)
2 | * Mosab Ahmad (@mos3abof)
3 | * Alex Varju (@varju)
4 | * @namuan


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2013 Mohammad Tayseer
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of
 6 | this software and associated documentation files (the "Software"), to deal in
 7 | the Software without restriction, including without limitation the rights to
 8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
 9 | the Software, and to permit persons to whom the Software is furnished to do so,
10 | subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
17 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
18 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
19 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
20 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # InfoQ downloader
 2 | 
 3 | [InfoQ](http://www.infoq.com/) is a great resource for many useful sessions. The way they show presentations synced with the slides is cool. Unfortunately, I have a slow internet connection, which makes my viewing experience suck. To solve this, I made this script, which downloads their page, video & slides.
 4 | 
 5 | ## Installation
 6 | On Windows, just download [this file](dist/infoq_downloader.exe?raw=true)
 7 | 
 8 | On Linux, run the following
 9 | 
10 | ```sh
11 | git clone https://github.com/mtayseer/infoq-downloader.git
12 | cd infoq-downloader
13 | pip install -r requirements.txt
14 | ```
15 | 
16 | ## Usage
17 | `python infoq_downloader.py http://www.infoq.com/presentations/github-evolution`
18 | 
19 | On Windows
20 | `infoq_downloader.exe http://www.infoq.com/presentations/github-evolution`
21 | 
22 | ## Features
23 | 1. Console app
24 | 2. Supports download resuming of slides & videos
25 | 3. The generated HTML is clean
26 | 
27 | ## License
28 | MIT. See [LICENSE](LICENSE)


--------------------------------------------------------------------------------
/build_exe.bat:
--------------------------------------------------------------------------------
1 | pyinstaller infoq_downloader.py --onefile --upx-dir="F:\tayseer\tools\upx"


--------------------------------------------------------------------------------
/dist/infoq_downloader.exe:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mtayseer/infoq-downloader/dead91c45e3abea1756559a118d4d6df91a0a502/dist/infoq_downloader.exe


--------------------------------------------------------------------------------
/infoq_downloader.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | from __future__ import division, print_function
  4 | import os
  5 | import sys
  6 | import re
  7 | import argparse
  8 | import requests
  9 | import cssselect
 10 | import lxml.html
 11 | import unicodedata
 12 | 
 13 | if sys.version_info.major == 3:
 14 |     text_type = str
 15 | else:
 16 |     text_type = unicode
 17 | 
 18 | # Some settings
 19 | download_directory = 'downloads'
 20 | cleanup_elements = [
 21 |     '#footer', '#header', '#topInfo', '.share_this', '.random_links',
 22 |     '.vendor_vs_popular', '.bottomContent', '#id_300x250_banner_top',
 23 |     '.presentation_type', '#conference', '#imgPreload', '#text_height_fix_box',
 24 |     '.download_presentation', '.recorded', 'script[async]',
 25 |     'script[src*=addthis]'
 26 | ]
 27 | 
 28 | # Set argparse to parse the paramaters
 29 | parser = argparse.ArgumentParser(description='Download InfoQ presentations.')
 30 | parser.add_argument('url', metavar='URL', type=str,
 31 |                     help='URL of the presentation to download')
 32 | 
 33 | # Parse the arguments passed to the script
 34 | args = parser.parse_args()
 35 | url = args.url
 36 | 
 37 | # Tell infoq that I'm an iPad, so it gives me simpler HTML to parse & mp4 file
 38 | # qto download
 39 | user_agent = (
 40 |     "Mozilla/5.0 (iPad; U; CPU OS 3_2 like Mac OS X; en-us) "
 41 |     "AppleWebKit/531.21.10 (KHTML, like Gecko) Version/4.0.4 Mobile/7B334b "
 42 |     "Safari/531.21.10')"
 43 | )
 44 | 
 45 | # Start downloading
 46 | print('Downloading HTML file')
 47 | 
 48 | content = requests.get(url, headers={'User-Agent': user_agent}).content
 49 | html_doc = lxml.html.fromstring(content)
 50 | title = html_doc.find(".//title").text
 51 | video_url = html_doc.cssselect('video > source')[0].attrib['src']
 52 | video_file = os.path.split(video_url)[1]
 53 | html_doc.cssselect('video > source')[0].attrib['src'] = video_file
 54 | 
 55 | # Clean the page
 56 | for elt in html_doc.cssselect(', '.join(e for e in cleanup_elements)):
 57 |     elt.getparent().remove(elt)
 58 | html_doc.cssselect('#wrapper')[0].attrib['style'] = 'background: none'
 59 | content = lxml.html.tostring(html_doc).decode('utf-8')
 60 | 
 61 | # Make slides links point to local copies
 62 | slides_re = re.compile(r"'(/resource/presentations/[^']*?/en/slides/[^']*?)'")
 63 | slides = slides_re.findall(content)
 64 | 
 65 | # Create a directory for the downloaded presentation if it doesn't exist
 66 | if not os.path.exists(download_directory):
 67 |     os.makedirs(download_directory)
 68 | 
 69 | # presentation folder path
 70 | if isinstance(title, text_type):
 71 |     normalized_title = unicodedata.normalize('NFKD', title)
 72 | else:
 73 |     normalized_title = text_type(title)
 74 | presentation_directory = os.path.join(download_directory, normalized_title)
 75 | # Create a folder with the name of the presentation
 76 | if not os.path.exists(presentation_directory):
 77 |     os.makedirs(presentation_directory)
 78 | 
 79 | # Create a slides folder inside the presentation folder
 80 | if not os.path.exists('{}/slides'.format(presentation_directory)):
 81 |     os.makedirs('{}/slides'.format(presentation_directory))
 82 | 
 83 | #Write content
 84 | content = re.sub(r"/resource/presentations/[^']*?/en/", '', content)
 85 | with open('{}/index.html'.format(presentation_directory), 'w') as f:
 86 |     f.write(content)
 87 |     f.flush()
 88 | 
 89 | # Download slides
 90 | slides_dir = os.path.join(presentation_directory, 'slides')
 91 | if not os.path.isdir(slides_dir):
 92 |     os.makedirs(slides_dir)
 93 | for i, slide in enumerate(slides):
 94 |     filename = os.path.split(slide)[1]
 95 |     full_path = os.path.join(slides_dir, '{0}'.format(filename))
 96 |     if os.path.exists(full_path):
 97 |         continue
 98 |     print('\rDownloading slide {0} of {1}'.format(i+1, len(slides)), end='')
 99 |     sys.stdout.flush()  # Hack for Python 2
100 |     url = 'http://www.infoq.com{0}'.format(slide)
101 |     with open(full_path, 'wb') as f:
102 |         f.write(requests.get(url).content)
103 | 
104 | print()
105 | 
106 | # If the video file is already downloaded successfully, don't do anything else
107 | if os.path.exists(video_file):
108 |     print('Video file already exists')
109 |     sys.exit()
110 | 
111 | # Download the video file. stream=True here is important to allow me to iterate
112 | # over content
113 | downloaded_file = os.path.join(
114 |     presentation_directory, '{}.part'.format(video_file)
115 | )
116 | 
117 | if os.path.exists(downloaded_file):
118 |     bytes_downloaded = os.stat(downloaded_file).st_size
119 | else:
120 |     bytes_downloaded = 0
121 | 
122 | r = requests.get(video_url, stream=True,
123 |                  headers={'Range': 'bytes={0}-'.format(bytes_downloaded)})
124 | content_length = int(r.headers['content-length']) + bytes_downloaded
125 | 
126 | with open(downloaded_file, 'ab') as f:
127 |     for chunk in r.iter_content(10 * 1024):
128 |         f.write(chunk)
129 |         f.flush()
130 |         # \r used to return the cursor to beginning of line, so I can write
131 |         # progress on a single line.
132 |         # The comma at the end of line is important, to stop the 'print' command
133 |         # from printing an additional new line
134 |         percent = f.tell() / content_length * 100
135 |         print('\rDownloading video {0:.2f}%'.format(percent), end='')
136 |         sys.stdout.flush()  # Hack for Python 2
137 | 
138 | final_video_name = os.path.join(presentation_directory, video_file)
139 | os.rename(downloaded_file, final_video_name)
140 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | requests
2 | lxml
3 | cssselect


--------------------------------------------------------------------------------