├── README.md
├── .gitignore
├── MsdnMagazine2013.py
└── InfoQ.py


/README.md:
--------------------------------------------------------------------------------
calibre-recipes
===============

Recipes for calibre
--------------------------------------------------------------------------------


/.gitignore:
--------------------------------------------------------------------------------
*.py[cod]

# C extensions
*.so

# Packages
*.egg
*.egg-info
dist
build
eggs
parts
bin
var
sdist
develop-eggs
.installed.cfg
lib
lib64

# Installer logs
pip-log.txt

# Unit test / coverage reports
.coverage
.tox
nosetests.xml

# Translations
*.mo

# Mr Developer
.mr.developer.cfg
.project
.pydevproject
--------------------------------------------------------------------------------


/MsdnMagazine2013.py:
--------------------------------------------------------------------------------
import re

from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import NavigableString

issue_page = 'jj883946.aspx'    # January
# issue_page = 'jj891014.aspx'  # February
# issue_page = 'jj991969.aspx'  # March
# issue_page = 'dn166920.aspx'  # April
# issue_page = 'dn198231.aspx'  # May
# issue_page = 'dn201737.aspx'  # June

issue_prefix = 'http://msdn.microsoft.com/en-us/magazine/'
cover_prefix = 'http://i.msdn.microsoft.com/'

class MsdnMagazine2013(BasicNewsRecipe):
    title = u'MSDN Magazine 2013'
    auto_cleanup = True
    auto_cleanup_keep = '//*[@class="FeatureTitle"]'
    remove_tags_before = { 'class' : 'MagazineStyle' }
    remove_tags_after = { 'class' : 'MagazineStyle' }
    no_stylesheets = True
    publication_type = 'magazine'
    cover_url = cover_prefix + issue_page.replace('.aspx', '.cover_lrg(en-us,MSDN.10).jpg')

    def get_text(self, ele):
        # Flatten the text of a tag, turning <br> into ': ' separators.
        title = ''

        for c in ele.contents:
            if isinstance(c, NavigableString):
                title = title + str(c)
            elif c.name == 'br':
                title = title + ': '
            else:
                title = title + self.get_text(c)

        return title.strip()

    def get_description(self, link):
        # The description is the free text that follows the article link.
        description = ''

        ele = link.nextSibling

        while ele:
            if isinstance(ele, NavigableString):
                description = description + str(ele)
            elif ele.name == 'br':
                description = description + ' '

            ele = ele.nextSibling

        return description.strip()

    def parse_index(self):
        soup = self.index_to_soup(issue_prefix + issue_page)

        self.title = soup.html.head.title.string.strip()

        mainContent = soup.find('div', { 'id' : "MainContent" })

        articles_dict = {}
        article_list = []

        for link in mainContent.findAll('a'):
            if link.img:
                continue

            href = link['href']
            if href == issue_page:
                continue

            if not re.match(r"\w\w\d+\.aspx", href):
                continue

            title = self.get_text(link)

            # Several links may point at the same article; merge their texts into one title.
            a = articles_dict.get(href)
            if a:
                old_title = a['title']
                if not old_title.endswith(':'):
                    old_title = old_title + ':'

                a['title'] = old_title + ' ' + title
            else:
                a = { 'title' : title, 'url' : issue_prefix + href.replace('.aspx', '(printer).aspx') }
                article_list.append(a)
                articles_dict[href] = a

            if link.parent.name == 'p':
                a['description'] = self.get_description(link)
            elif link.parent.name == 'strong' and len(link.contents) > 1:
                a['description'] = self.get_description(link.parent)

        return [('Default', article_list)]

    def postprocess_html(self, soup, first_fetch):
        # The printer-friendly pages append ' [ url ] ' after each link; strip that text.
        for link in soup.findAll('a'):
            s = link.nextSibling

            if not (s and isinstance(s, NavigableString)):
                continue

            text = ' [ ' + link['href']
            if not s.startswith(text):
                continue

            index = s.find(' ] ', len(text)) + 3
            if index > 0:
                s.replaceWith(s[index:])

        return soup
--------------------------------------------------------------------------------


/InfoQ.py:
--------------------------------------------------------------------------------
import re, urlparse, itertools

from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import NavigableString, Tag
from datetime import date, timedelta

language = 'en'

site_url = 'http://www.infoq.com/'

title_prefix = 'InfoQ'

date_regexes = [
    r'Jan\s+(?P<day>\d{2}),\s+(?P<year>\d{4})',
    r'Feb\s+(?P<day>\d{2}),\s+(?P<year>\d{4})',
    r'Mar\s+(?P<day>\d{2}),\s+(?P<year>\d{4})',
    r'Apr\s+(?P<day>\d{2}),\s+(?P<year>\d{4})',
    r'May\s+(?P<day>\d{2}),\s+(?P<year>\d{4})',
    r'Jun\s+(?P<day>\d{2}),\s+(?P<year>\d{4})',
    r'Jul\s+(?P<day>\d{2}),\s+(?P<year>\d{4})',
    r'Aug\s+(?P<day>\d{2}),\s+(?P<year>\d{4})',
    r'Sep\s+(?P<day>\d{2}),\s+(?P<year>\d{4})',
    r'Oct\s+(?P<day>\d{2}),\s+(?P<year>\d{4})',
    r'Nov\s+(?P<day>\d{2}),\s+(?P<year>\d{4})',
    r'Dec\s+(?P<day>\d{2}),\s+(?P<year>\d{4})'
]

'''
language = 'zh'

site_url = 'http://www.infoq.com/cn/'

title_prefix = 'InfoQ中国站'

date_regexes = [
    r'一月\s+(?P<day>\d{2}),\s+(?P<year>\d{4})',
    r'二月\s+(?P<day>\d{2}),\s+(?P<year>\d{4})',
    r'三月\s+(?P<day>\d{2}),\s+(?P<year>\d{4})',
    r'四月\s+(?P<day>\d{2}),\s+(?P<year>\d{4})',
    r'五月\s+(?P<day>\d{2}),\s+(?P<year>\d{4})',
    r'六月\s+(?P<day>\d{2}),\s+(?P<year>\d{4})',
    r'七月\s+(?P<day>\d{2}),\s+(?P<year>\d{4})',
    r'八月\s+(?P<day>\d{2}),\s+(?P<year>\d{4})',
    r'九月\s+(?P<day>\d{2}),\s+(?P<year>\d{4})',
    r'十月\s+(?P<day>\d{2}),\s+(?P<year>\d{4})',
    r'十一月\s+(?P<day>\d{2}),\s+(?P<year>\d{4})',
    r'十二月\s+(?P<day>\d{2}),\s+(?P<year>\d{4})'
]
'''

# the sections to download
sections = [ 'news', 'articles', 'interviews' ]

# the range of dates (both ends inclusive) to download
date_range = (date(2013, 6, 20), date(2013, 6, 22))

# per-section date ranges that override date_range above
section_date_ranges = {
    # 'news': (date(2013, 6, 21), date(2013, 6, 22)),
    # 'articles': (date(2013, 6, 5), date(2013, 6, 10)),
    # 'interviews': (date(2013, 1, 1), date(2013, 3, 1))
}

# do NOT touch the code below unless you know what you are doing

def range2str(range, shorten):
    year_fmt = '%Y%m%d'
    month_fmt = '%m%d'
    day_fmt = '%d'

    begin, end = range
    if begin == end:
        return begin.strftime(year_fmt)
    else:
        text = begin.strftime(year_fmt) + "~"
        if not shorten:
            return text + end.strftime(year_fmt)

        if begin.year == end.year and begin.month == end.month:
            return text + end.strftime(day_fmt)

        if begin.year == end.year:
            return text + end.strftime(month_fmt)

        return text + end.strftime(year_fmt)

def generate_title(prefix):
    text = prefix + ' ' + range2str(date_range, True)

    for sec in sections:
        range = section_date_ranges.get(sec)
        if range:
            text = text + ' ' + sec[0].upper() + range2str(range, True)

    return text

def parse_date(text):
    # The index of the matching regex gives the month number.
    for i in xrange(len(date_regexes)):
        m = re.search(date_regexes[i], text)
        if not m:
            continue

        year = int(m.group('year'))
        month = i + 1
        day = int(m.group('day'))

        return date(year, month, day)

def get_text(tag):
    text = ''
    for c in tag.contents:
        if isinstance(c, NavigableString):
            text = text + str(c)
        else:
            text = text + get_text(c)

    return text.strip()

def find_by_class(tag, name, cls):
    # Yield descendant tags of the given name whose class attribute contains cls.
    for c in tag.findAll(name):
        c_cls = c.get('class')
        if not c_cls: continue
        if cls not in c_cls: continue

        yield c

_section_texts = {}
_section_item_classes = {
    'news': ['news_type_block'],
    'articles': ['news_type1', 'news_type2'],
    'interviews': ['news_type_video']
}

class InfoQ(BasicNewsRecipe):
    title = title_prefix

    language = language

    no_stylesheets = True

    keep_only_tags = [ { 'id': 'content' } ]

    remove_tags = [
        { 'id': 'noOfComments' },
        { 'class': 'share_this' },
        { 'class': 'article_page_right' }
    ]

    def get_items(self, section):
        # Walk the paginated section index (e.g. news/0, news/1, ...), newest items first.
        print '>>> Retrieving items for section: ', section

        text_retrieved = False
        count = 0

        while True:
            print '>>> Loading items from ' + section + '/' + str(count)

            root = self.index_to_soup(site_url + section + '/' + str(count))
            content_div = root.find('div', { 'id': 'content' })

            if not text_retrieved:
                text_retrieved = True
                text = content_div.h2.string.strip()
                _section_texts[section] = text
                print '>>> Text for section "' + section + '": ' + text

            for item_class in _section_item_classes[section]:
                for item_div in find_by_class(content_div, 'div', item_class):
                    item = {}
                    link = item_div.h2.a
                    item['title'] = link.string.strip()
                    item['url'] = urlparse.urljoin(site_url, link['href'])
                    item['description'] = get_text(item_div.p)

                    author_span = item_div.find('span', { 'class': 'author' })
                    date_text = str(author_span.contents[-1])
                    item['date'] = parse_date(date_text)

                    print '>>> Item parsed: ', item

                    yield item
            count = count + 1

    def parse_index(self):
        self.title = generate_title(self.title)

        index = []

        for sec in sections:
            item_list = []

            range = section_date_ranges.get(sec)
            if not range: range = date_range

            begin, end = range
            # Items arrive newest first: skip those after the range, stop at the first one before it.
            for item in self.get_items(sec):
                date = item['date']

                if date > end: continue
                if date < begin: break

                item_list.append(item)

            index.append((_section_texts[sec] + ' (' + range2str(range, False) + ')', item_list))

        return index

    def postprocess_html(self, soup, first_fetch):
        author_general = soup.find('span', { 'class': 'author_general' })
        author_general.em.extract()

        # the complete content
        full_div = None

        transcript_div = soup.find('div', { 'id': 'transcript' })
        if transcript_div: # that's an interview
            # get all the <div class="qa"> blocks of the transcript
            qa_div_list = list(find_by_class(transcript_div, 'div', 'qa'))
            for qa_div in qa_div_list:
                qa_div.extract()

                # replace each <a class="question_link">...</a> with <strong>...</strong>
                question_link = qa_div.find('a', { 'class': 'question_link' })
                question_strong = Tag(soup, 'strong')
                question_strong.append(question_link.string)
                question_link.replaceWith(question_strong)

            full_div = find_by_class(soup.find('div', { 'id': 'content' }), 'div', 'presentation_full').next()

            # clean up the <h1> title
            full_div.h1.span.extract()
            title_div = full_div.h1.div
            title_div.replaceWith(title_div.string)

            # clear the presentation area
            for div in full_div.findAll('div'):
                div.extract()

            # add the qa list back to the presentation area
            for qa_div in qa_div_list:
                full_div.append(qa_div)
        else:
            # text only, without the title
            text_div = find_by_class(soup, 'div', 'text_info').next()
            text_div.extract()

            for other in text_div.findAll('div'):
                other.extract()

            # full_div contains the title
            full_div = soup.find('div', { 'id': 'content' })
            for other in full_div.findAll('div'):
                other.extract()

            full_div.append(text_div)

        full_div.extract()

        nav_div = soup.body.div
        nav_div.extract()

        # keep only nav_div and full_div in <body>
        # (iterate over a copy of the contents, since extract() mutates the list)
        for other in list(soup.body.contents):
            other.extract()

        soup.body.append(nav_div)
        soup.body.append(full_div)

        return soup
--------------------------------------------------------------------------------
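

A quick way to try either recipe, assuming the file is saved with a .recipe extension, is calibre's
ebook-convert command line; the --test switch fetches only a couple of articles while you tweak
issue_page, sections, or date_range:

    ebook-convert InfoQ.recipe InfoQ.epub --test

The same source can also be pasted into calibre's custom news source dialog under "Fetch news".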