├── LICENSE ├── README.md ├── greader.py ├── rss2mobi.config.example └── rss2mobi.py /LICENSE: -------------------------------------------------------------------------------- 1 | This software is provided 'as-is', without any express or implied 2 | warranty. In no event will the author be held liable for any damages 3 | arising from the use of this software. 4 | 5 | Permission is granted to anyone to use this software for any purpose, 6 | including commercial applications, and to alter it and redistribute it 7 | freely, subject to the following restrictions: 8 | 9 | 1. The origin of this software must not be misrepresented; you must not 10 | claim that you wrote the original software. If you use this software 11 | in a product, an acknowledgment in the product documentation would be 12 | appreciated but is not required. 13 | 2. Altered source versions must be plainly marked as such, and must not be 14 | misrepresented as being the original software. 15 | 3. This notice may not be removed or altered from any source distribution. 16 | 17 | Copyright (c) 2010 Greg Hewgill http://hewgill.com 18 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # rss2mobi 2 | 3 | Greg Hewgill 4 | http://hewgill.com 5 | 6 | `rss2mobi` is a script that downloads new feeds from Google Reader and uses `kindlegen` to create a Mobipocket e-book output file. 7 | 8 | ## Requirements 9 | 10 | - Python 3.1 11 | - [`kindlegen`](http://www.amazon.com/gp/feature.html?ie=UTF8&docId=1000234621) 12 | 13 | ## Configuration 14 | 15 | Copy the supplied `rss2mobi.config.example` to `rss2mobi.config`. 16 | Edit this file (it is a Python dictionary literal parsed with `ast.literal_eval`, not strict JSON) and supply your Google Reader username and password. 17 | If `kindlegen` is not in your `PATH`, set the full pathname of `kindlegen` here too. 
18 | 19 | ## Usage 20 | 21 | To run `rss2mobi`: 22 | 23 | python3.1 rss2mobi.py 24 | 25 | The output file is `tmp/reader-YYYY-MM-DD.mobi` where `YYYY-MM-DD` is the current date. 26 | -------------------------------------------------------------------------------- /greader.py: -------------------------------------------------------------------------------- 1 | import json 2 | import pprint 3 | import urllib.request 4 | 5 | class GoogleReader: 6 | def __init__(self, account, password): 7 | self.account = account 8 | self.password = password 9 | 10 | def login(self): 11 | r = urllib.request.urlopen("https://www.google.com/accounts/ClientLogin?service=reader&Email={0}&Passwd={1}".format(self.account, self.password)) 12 | page = r.read().decode("ascii") 13 | self.auth = dict(x.split("=") for x in page.split("\n") if x)["Auth"] 14 | print("Auth:", self.auth) 15 | r = urllib.request.urlopen(urllib.request.Request("http://www.google.com/reader/api/0/token", headers={"Authorization": "GoogleLogin auth={0}".format(self.auth)})) 16 | self.token = r.read().decode("utf-8") 17 | print("Token:", self.token) 18 | 19 | def reading_list(self, label=None): 20 | retval = None 21 | continuation = None 22 | while True: 23 | r = urllib.request.urlopen(urllib.request.Request("http://www.google.com/reader/api/0/stream/contents/user/-/{items}?xt=user/-/state/com.google/read{cont}".format( 24 | items="label/{0}".format(label) if label else "state/com.google/reading-list", 25 | cont="&c={0}".format(continuation) if continuation else "", 26 | ), headers={"Authorization": "GoogleLogin auth={0}".format(self.auth)})) 27 | data = json.loads(r.read().decode("utf-8")) 28 | if retval is None: 29 | retval = data 30 | else: 31 | retval['items'].extend(data['items']) 32 | print(len(data['items'])) 33 | if 'continuation' not in data: 34 | break 35 | continuation = data['continuation'] 36 | return retval 37 | 38 | def mark_read(self, feed, id): 39 | tries = 0 40 | while True: 41 | try: 42 | r = 
urllib.request.urlopen(urllib.request.Request("http://www.google.com/reader/api/0/edit-tag", headers={"Authorization": "GoogleLogin auth={0}".format(self.auth)}), "a=user/-/state/com.google/read&s={0}&i={1}&T={2}".format(feed, id, self.token).encode("utf-8")) 43 | assert r.read().decode("utf-8") == "OK" 44 | break 45 | except urllib.error.URLError as x: 46 | if tries == 0: 47 | print("mark_read id={0}".format(id)) 48 | tries += 1 49 | if tries < 3: 50 | print(" retry: {}".format(x)) 51 | else: 52 | raise 53 | -------------------------------------------------------------------------------- /rss2mobi.config.example: -------------------------------------------------------------------------------- 1 | { 2 | 'account': "username", 3 | 'password': "password", 4 | 'kindlegen': "kindlegen", 5 | } 6 | -------------------------------------------------------------------------------- /rss2mobi.py: -------------------------------------------------------------------------------- 1 | import ast 2 | import cgi 3 | import hashlib 4 | import html.parser 5 | import os 6 | import pprint 7 | import re 8 | import shutil 9 | import subprocess 10 | import sys 11 | import tempfile 12 | import time 13 | import urllib 14 | 15 | import greader 16 | 17 | g_Images = set() 18 | 19 | def nameify(s): 20 | return s.replace("/", "_").replace(":", "_") 21 | 22 | def fetch_retry(url): 23 | tries = 0 24 | while True: 25 | try: 26 | return urllib.request.urlopen(url) 27 | except urllib.error.URLError as x: 28 | tries += 1 29 | if tries < 3: 30 | print(" retry: {}".format(x)) 31 | else: 32 | raise 33 | 34 | def lget(attrs, name): 35 | try: 36 | i = [x[0] for x in attrs].index(name) 37 | return attrs[i][1] 38 | except ValueError: 39 | return None 40 | 41 | class ImageRewriter(html.parser.HTMLParser): 42 | def __init__(self): 43 | super().__init__() 44 | self.output = "" 45 | self.images = [] 46 | def handle_starttag(self, tag, attrs): 47 | if tag == "img": 48 | if lget(attrs, "height") != "1" or 
lget(attrs, "width") != "1": 49 | i = [x[0] for x in attrs].index("src") 50 | src = attrs[i][1] 51 | id = hashlib.sha1(src.encode("utf-8")).hexdigest() 52 | 53 | print("fetching {}".format(src)) 54 | try: 55 | r = fetch_retry(src) 56 | ct = r.getheader("Content-Type") 57 | if ';' in ct: 58 | # handle bizarre Content-Type: image/png; charset=UTF-8 59 | ct = ct[:ct.index(';')] 60 | ext = { 61 | "image/gif": ".gif", 62 | "image/jpeg": ".jpg", 63 | "image/png": ".png", 64 | }.get(ct) 65 | if ext is not None: 66 | with open(os.path.join(dir, id + ext), "wb") as f: 67 | f.write(r.read()) 68 | if SizeLimit: 69 | p = subprocess.Popen(["identify", os.path.join(dir, id + ext)], stdout=subprocess.PIPE) 70 | stdout, _ = p.communicate() 71 | m = re.search(b"(\\d+)x(\\d+)", stdout) 72 | if m is not None: 73 | width = int(m.group(1)) 74 | if width > SizeLimit: 75 | subprocess.call(["mogrify", "-geometry", str(SizeLimit), os.path.join(dir, id + ext)]) 76 | attrs[i] = (attrs[i][0], id + ext) 77 | g_Images.add((id, id + ext, ct)) 78 | else: 79 | print(" {}".format(ct)) 80 | self.output += "<{0}{1} />".format(tag, "".join((' {0}="{1}"'.format(x[0], cgi.escape(x[1], True)) if x[1] else " {0}".format(x[0])) for x in attrs)) 81 | except Exception as x: 82 | print("oh well:", x) 83 | self.output += self.get_starttag_text() 84 | return 85 | else: 86 | self.output += self.get_starttag_text() 87 | def handle_endtag(self, tag): 88 | self.output += "".format(tag) 89 | def handle_data(self, data): 90 | self.output += data 91 | def handle_charref(self, name): 92 | self.output += "&#{};".format(name) 93 | def handle_entityref(self, name): 94 | self.output += "&{};".format(name) 95 | 96 | KeepUnread = False 97 | Label = None 98 | PostLimit = 200 99 | SizeLimit = None 100 | 101 | i = 1 102 | while i < len(sys.argv): 103 | a = sys.argv[i] 104 | if a in ("--keep-unread", "-k"): 105 | KeepUnread = True 106 | elif a in ("--label", "-l"): 107 | i += 1 108 | Label = sys.argv[i] 109 | elif a in 
("--post-limit", "-p"): 110 | i += 1 111 | PostLimit = int(sys.argv[i]) 112 | elif a in ("--size-limit", "-s"): 113 | i += 1 114 | SizeLimit = int(sys.argv[i]) 115 | else: 116 | print("Unknown command line option: {}".format(a)) 117 | sys.exit(1) 118 | i += 1 119 | 120 | with open("rss2mobi.config") as f: 121 | Config = ast.literal_eval(f.read()) 122 | 123 | today = "{0.tm_year:04}-{0.tm_mon:02}-{0.tm_mday:02}".format(time.localtime(time.time())) 124 | 125 | reader = greader.GoogleReader(Config['account'], Config['password']) 126 | reader.login() 127 | feed = reader.reading_list(label=Label) 128 | 129 | feed['items'].reverse() 130 | feed['items'] = [x for x in feed['items'] if 'alternate' in x] 131 | feed['items'] = [x for i, x in enumerate(feed['items']) if x['id'] not in [y['id'] for y in feed['items'][:i]]] 132 | if len(feed['items']) > PostLimit: 133 | print("Posts limited to maximum of {} (from {})".format(PostLimit, len(feed['items']))) 134 | feed['items'] = feed['items'][:PostLimit] 135 | 136 | if os.access("tmp", os.F_OK): 137 | shutil.rmtree("tmp") 138 | dir = "tmp" #tempfile.mkdtemp(dir=".") 139 | os.mkdir(dir) 140 | try: 141 | with open(os.path.join(dir, "reading_list.out"), "w", encoding="utf-8") as f: 142 | pprint.pprint(feed, stream=f) 143 | for e in feed['items']: 144 | #pprint.pprint(e) 145 | fname = nameify(e['id']) + ".html" 146 | e['fname'] = fname 147 | if 'title' not in e: 148 | e['title'] = "(no title)" 149 | if 'title' not in e['origin']: 150 | e['origin']['title'] = "(no name)" 151 | f = open(os.path.join(dir, fname), "w", encoding="utf-8") 152 | print("""""", file=f) 153 | print("""""", file=f) 154 | print("", file=f) 155 | print("

{0}

".format(e['origin']['title']), file=f) 156 | print("""

{1}

""".format(e['alternate'][0]['href'], e['title']), file=f) 157 | which = "content" if "content" in e else "summary" 158 | if which in e: 159 | content = e[which]['content'] 160 | else: 161 | content = "" 162 | rw = ImageRewriter() 163 | rw.feed(content) 164 | print(rw.output, file=f) 165 | print("", file=f) 166 | print("", file=f) 167 | f.close() 168 | 169 | with open(os.path.join(dir, "contents.html"), "w", encoding="utf-8") as f: 170 | print("""""", file=f) 171 | print("""", file=f) 173 | print("

Google Reader {}

".format(today), file=f) 174 | print("

{0}

".format(len(feed['items'])), file=f) 175 | for e in feed['items']: 176 | print("""

{2} ({1})

""".format(e['fname'], e['origin']['title'], e['title']), file=f) 177 | print("", file=f) 178 | print("", file=f) 179 | 180 | with open(os.path.join(dir, "toc.ncx"), "w", encoding="utf-8") as f: 181 | print("""""", file=f) 182 | print("""""", file=f) 183 | print("""Google Reader {}""".format(today), file=f) 184 | print("", file=f) 185 | print("""""", file=f) 186 | print(""" Table of Contents""", file=f) 187 | print(""" """, file=f) 188 | print("""""", file=f) 189 | order = 2 190 | for e in feed['items']: 191 | print("""""".format(e['fname'], order), file=f) 192 | print(""" {}""".format(e['title']), file=f) 193 | print(""" """.format(e['fname']), file=f) 194 | print("""""", file=f) 195 | order += 1 196 | print("", file=f) 197 | print("", file=f) 198 | 199 | opfn = "reader-{0}.opf".format(today) 200 | f = open(os.path.join(dir, opfn), "w") 201 | print(""" 202 | 203 | 204 | 205 | reader-{0} 206 | Reader {0} ({1}) 207 | EN 208 | {0} 209 | 210 | 211 | """.format(today, len(feed['items'])), file=f) 212 | print("""""", file=f) 213 | for e in feed['items']: 214 | print("""""".format(nameify(e['id'])), file=f) 215 | for i in g_Images: 216 | print("""""".format(*i), file=f) 217 | print("""""", file=f) 218 | print(""" 219 | """, file=f) 220 | print("""""", file=f) 221 | for e in feed['items']: 222 | print("""""".format(nameify(e['id'])), file=f) 223 | print(""" 224 | """, file=f) 225 | f.close() 226 | os.system("{0} {1}".format(Config['kindlegen'], os.path.join(dir, opfn))) 227 | assert os.access(os.path.join(dir, "reader-{0}.mobi".format(today)), os.F_OK) 228 | if not KeepUnread: 229 | for e in feed['items']: 230 | reader.mark_read(e['origin']['streamId'], e['id']) 231 | finally: 232 | pass #shutil.rmtree(dir) 233 | --------------------------------------------------------------------------------