├── README.md
├── .gitignore
├── MsdnMagazine2013.py
└── InfoQ.py


/README.md:
--------------------------------------------------------------------------------
calibre-recipes
===============

Recipes for calibre
--------------------------------------------------------------------------------


/.gitignore:
--------------------------------------------------------------------------------
*.py[cod]

# C extensions
*.so

# Packages
*.egg
*.egg-info
dist
build
eggs
parts
bin
var
sdist
develop-eggs
.installed.cfg
lib
lib64

# Installer logs
pip-log.txt

# Unit test / coverage reports
.coverage
.tox
nosetests.xml

# Translations
*.mo

# Mr Developer
.mr.developer.cfg
.project
.pydevproject
--------------------------------------------------------------------------------


/MsdnMagazine2013.py:
--------------------------------------------------------------------------------
import re

from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import NavigableString

issue_page = 'jj883946.aspx'    # January
# issue_page = 'jj891014.aspx'  # February
# issue_page = 'jj991969.aspx'  # March
# issue_page = 'dn166920.aspx'  # April
# issue_page = 'dn198231.aspx'  # May
# issue_page = 'dn201737.aspx'  # June

issue_prefix = 'http://msdn.microsoft.com/en-us/magazine/'
cover_prefix = 'http://i.msdn.microsoft.com/'

class MsdnMagazine2013(BasicNewsRecipe):
    title = u'MSDN Magazine 2013'
    auto_cleanup = True
    auto_cleanup_keep = '//*[@class="FeatureTitle"]'
    remove_tags_before = { 'class' : 'MagazineStyle' }
    remove_tags_after = { 'class' : 'MagazineStyle' }
    no_stylesheets = True
    publication_type = 'magazine'
    cover_url = cover_prefix + issue_page.replace('.aspx', '.cover_lrg(en-us,MSDN.10).jpg')

    def get_text(self, ele):
        # Flatten the text of a tag, turning <br> into ': ' separators.
        title = ''

        for c in ele.contents:
            if isinstance(c, NavigableString):
                title = title + str(c)
            elif c.name == 'br':
                title = title + ': '
            else:
                title = title + self.get_text(c)

        return title.strip()

    def get_description(self, link):
        # The description is the free text that follows the article link.
        description = ''

        ele = link.nextSibling

        while ele:
            if isinstance(ele, NavigableString):
                description = description + str(ele)
            elif ele.name == 'br':
                description = description + ' '

            ele = ele.nextSibling

        return description.strip()

    def parse_index(self):
        soup = self.index_to_soup(issue_prefix + issue_page)

        self.title = soup.html.head.title.string.strip()

        mainContent = soup.find('div', { 'id' : "MainContent" })

        articles_dict = {}
        article_list = []

        for link in mainContent.findAll('a'):
            if link.img:
                continue

            href = link['href']
            if href == issue_page:
                continue

            if not re.match(r"\w\w\d+\.aspx", href):
                continue

            title = self.get_text(link)

            # Several links may point at the same article; merge their texts into one title.
            a = articles_dict.get(href)
            if a:
                old_title = a['title']
                if not old_title.endswith(':'):
                    old_title = old_title + ':'

                a['title'] = old_title + ' ' + title
            else:
                a = { 'title' : title, 'url' : issue_prefix + href.replace('.aspx', '(printer).aspx') }
                article_list.append(a)
                articles_dict[href] = a

            if link.parent.name == 'p':
                a['description'] = self.get_description(link)
            elif link.parent.name == 'strong' and len(link.contents) > 1:
                a['description'] = self.get_description(link.parent)

        return [('Default', article_list)]

    def postprocess_html(self, soup, first_fetch):
        # The printer-friendly pages append ' [ url ] ' after each link; strip that text.
        for link in soup.findAll('a'):
            s = link.nextSibling

            if not (s and isinstance(s, NavigableString)):
                continue

            text = ' [ ' + link['href']
            if not s.startswith(text):
                continue

            index = s.find(' ] ', len(text)) + 3
            if index > 0:
                s.replaceWith(s[index:])

        return soup
--------------------------------------------------------------------------------


/InfoQ.py:
--------------------------------------------------------------------------------
import re, urlparse, itertools

from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import NavigableString, Tag
from datetime import date, timedelta

language = 'en'

site_url = 'http://www.infoq.com/'

title_prefix = 'InfoQ'

date_regexes = [
    r'Jan\s+(?P<day>\d{2}),\s+(?P<year>\d{4})',
    r'Feb\s+(?P<day>\d{2}),\s+(?P<year>\d{4})',
    r'Mar\s+(?P<day>\d{2}),\s+(?P<year>\d{4})',
    r'Apr\s+(?P<day>\d{2}),\s+(?P<year>\d{4})',
    r'May\s+(?P<day>\d{2}),\s+(?P<year>\d{4})',
    r'Jun\s+(?P<day>\d{2}),\s+(?P<year>\d{4})',
    r'Jul\s+(?P<day>\d{2}),\s+(?P<year>\d{4})',
    r'Aug\s+(?P<day>\d{2}),\s+(?P<year>\d{4})',
    r'Sep\s+(?P<day>\d{2}),\s+(?P<year>\d{4})',
    r'Oct\s+(?P<day>\d{2}),\s+(?P<year>\d{4})',
    r'Nov\s+(?P<day>\d{2}),\s+(?P<year>\d{4})',
    r'Dec\s+(?P<day>\d{2}),\s+(?P<year>\d{4})'
]

'''
language = 'zh'

site_url = 'http://www.infoq.com/cn/'

title_prefix = 'InfoQ中国站'

date_regexes = [
    r'一月\s+(?P<day>\d{2}),\s+(?P<year>\d{4})',
    r'二月\s+(?P<day>\d{2}),\s+(?P<year>\d{4})',
    r'三月\s+(?P<day>\d{2}),\s+(?P<year>\d{4})',
    r'四月\s+(?P<day>\d{2}),\s+(?P<year>\d{4})',
    r'五月\s+(?P<day>\d{2}),\s+(?P<year>\d{4})',
    r'六月\s+(?P<day>\d{2}),\s+(?P<year>\d{4})',
    r'七月\s+(?P<day>\d{2}),\s+(?P<year>\d{4})',
    r'八月\s+(?P<day>\d{2}),\s+(?P<year>\d{4})',
    r'九月\s+(?P<day>\d{2}),\s+(?P<year>\d{4})',
    r'十月\s+(?P<day>\d{2}),\s+(?P<year>\d{4})',
    r'十一月\s+(?P<day>\d{2}),\s+(?P<year>\d{4})',
    r'十二月\s+(?P<day>\d{2}),\s+(?P<year>\d{4})'
]
'''

# the sections to download
sections = [ 'news', 'articles', 'interviews' ]

# the range of dates (both ends inclusive) to download
date_range = (date(2013, 6, 20), date(2013, 6, 22))

# per-section date ranges that override date_range above
section_date_ranges = {
    # 'news': (date(2013, 6, 21), date(2013, 6, 22)),
    # 'articles': (date(2013, 6, 5), date(2013, 6, 10)),
    # 'interviews': (date(2013, 1, 1), date(2013, 3, 1))
}

# do NOT touch the code below unless you know what you are doing

def range2str(range, shorten):
    year_fmt = '%Y%m%d'
    month_fmt = '%m%d'
    day_fmt = '%d'

    begin, end = range
    if begin == end:
        return begin.strftime(year_fmt)
    else:
        text = begin.strftime(year_fmt) + "~"
        if not shorten:
            return text + end.strftime(year_fmt)

        if begin.year == end.year and begin.month == end.month:
            return text + end.strftime(day_fmt)

        if begin.year == end.year:
            return text + end.strftime(month_fmt)

        return text + end.strftime(year_fmt)

def generate_title(prefix):
    text = prefix + ' ' + range2str(date_range, True)

    for sec in sections:
        range = section_date_ranges.get(sec)
        if range:
            text = text + ' ' + sec[0].upper() + range2str(range, True)

    return text

def parse_date(text):
    # The index of the matching regex gives the month number.
    for i in xrange(len(date_regexes)):
        m = re.search(date_regexes[i], text)
        if not m:
            continue

        year = int(m.group('year'))
        month = i + 1
        day = int(m.group('day'))

        return date(year, month, day)

def get_text(tag):
    text = ''
    for c in tag.contents:
        if isinstance(c, NavigableString):
            text = text + str(c)
        else:
            text = text + get_text(c)

    return text.strip()

def find_by_class(tag, name, cls):
    # Yield descendant tags of the given name whose class attribute contains cls.
    for c in tag.findAll(name):
        c_cls = c.get('class')
        if not c_cls: continue
        if cls not in c_cls: continue

        yield c

_section_texts = {}
_section_item_classes = {
    'news': ['news_type_block'],
    'articles': ['news_type1', 'news_type2'],
    'interviews': ['news_type_video']
}

class InfoQ(BasicNewsRecipe):
    title = title_prefix

    language = language

    no_stylesheets = True

    keep_only_tags = [ { 'id': 'content' } ]

    remove_tags = [
        { 'id': 'noOfComments' },
        { 'class': 'share_this' },
        { 'class': 'article_page_right' }
    ]

    def get_items(self, section):
        # Walk the paginated section index (e.g. news/0, news/1, ...), newest items first.
        print '>>> Retrieving items for section: ', section

        text_retrieved = False
        count = 0

        while True:
            print '>>> Loading items from ' + section + '/' + str(count)

            root = self.index_to_soup(site_url + section + '/' + str(count))
            content_div = root.find('div', { 'id': 'content' })

            if not text_retrieved:
                text_retrieved = True
                text = content_div.h2.string.strip()
                _section_texts[section] = text
                print '>>> Text for section "' + section + '": ' + text

            for item_class in _section_item_classes[section]:
                for item_div in find_by_class(content_div, 'div', item_class):
                    item = {}
                    link = item_div.h2.a
                    item['title'] = link.string.strip()
                    item['url'] = urlparse.urljoin(site_url, link['href'])
                    item['description'] = get_text(item_div.p)

                    author_span = item_div.find('span', { 'class': 'author' })
                    date_text = str(author_span.contents[-1])
                    item['date'] = parse_date(date_text)

                    print '>>> Item parsed: ', item

                    yield item
            count = count + 1

    def parse_index(self):
        self.title = generate_title(self.title)

        index = []

        for sec in sections:
            item_list = []

            range = section_date_ranges.get(sec)
            if not range: range = date_range

            begin, end = range
            # Items arrive newest first: skip those after the range, stop at the first one before it.
            for item in self.get_items(sec):
                date = item['date']

                if date > end: continue
                if date < begin: break

                item_list.append(item)

            index.append((_section_texts[sec] + ' (' + range2str(range, False) + ')', item_list))

        return index

    def postprocess_html(self, soup, first_fetch):
        author_general = soup.find('span', { 'class': 'author_general' })
        author_general.em.extract()

        # the complete content
        full_div = None

        transcript_div = soup.find('div', { 'id': 'transcript' })
        if transcript_div: # that's an interview
            # get all the <div class="qa"> blocks of the transcript
            qa_div_list = list(find_by_class(transcript_div, 'div', 'qa'))
            for qa_div in qa_div_list:
                qa_div.extract()

                # replace each <a class="question_link">...</a> with <strong>...</strong>
                question_link = qa_div.find('a', { 'class': 'question_link' })
                question_strong = Tag(soup, 'strong')
                question_strong.append(question_link.string)
                question_link.replaceWith(question_strong)

            full_div = find_by_class(soup.find('div', { 'id': 'content' }), 'div', 'presentation_full').next()

            # clean up the <h1> title
            full_div.h1.span.extract()
            title_div = full_div.h1.div
            title_div.replaceWith(title_div.string)

            # clear the presentation area
            for div in full_div.findAll('div'):
                div.extract()

            # add the qa list back to the presentation area
            for qa_div in qa_div_list:
                full_div.append(qa_div)
        else:
            # text only, without the title
            text_div = find_by_class(soup, 'div', 'text_info').next()
            text_div.extract()

            for other in text_div.findAll('div'):
                other.extract()

            # full_div contains the title
            full_div = soup.find('div', { 'id': 'content' })
            for other in full_div.findAll('div'):
                other.extract()

            full_div.append(text_div)

        full_div.extract()

        nav_div = soup.body.div
        nav_div.extract()

        # keep only nav_div and full_div in <body>
        # (iterate over a copy of the contents, since extract() mutates the list)
        for other in list(soup.body.contents):
            other.extract()

        soup.body.append(nav_div)
        soup.body.append(full_div)

        return soup
--------------------------------------------------------------------------------
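

A quick way to try either recipe, assuming the file is saved with a .recipe extension, is calibre's
ebook-convert command line; the --test switch fetches only a couple of articles while you tweak
issue_page, sections, or date_range:

    ebook-convert InfoQ.recipe InfoQ.epub --test

The same source can also be pasted into calibre's custom news source dialog under "Fetch news".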