├── LICENSE
├── README.md
├── greader.py
├── rss2mobi.config.example
└── rss2mobi.py


/LICENSE:
--------------------------------------------------------------------------------
 1 | This software is provided 'as-is', without any express or implied
 2 | warranty.  In no event will the author be held liable for any damages
 3 | arising from the use of this software.
 4 | 
 5 | Permission is granted to anyone to use this software for any purpose,
 6 | including commercial applications, and to alter it and redistribute it
 7 | freely, subject to the following restrictions:
 8 | 
 9 | 1. The origin of this software must not be misrepresented; you must not
10 |    claim that you wrote the original software. If you use this software
11 |    in a product, an acknowledgment in the product documentation would be
12 |    appreciated but is not required.
13 | 2. Altered source versions must be plainly marked as such, and must not be
14 |    misrepresented as being the original software.
15 | 3. This notice may not be removed or altered from any source distribution.
16 | 
17 | Copyright (c) 2010 Greg Hewgill http://hewgill.com
18 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # rss2mobi
 2 | 
 3 | Greg Hewgill  
 4 | http://hewgill.com
 5 | 
 6 | `rss2mobi` is a script that downloads new feeds from Google Reader and uses `kindlegen` to create a Mobipocket e-book output file.
 7 | 
 8 | ## Requirements
 9 | 
10 | - Python 3.1
11 | - [`kindlegen`](http://www.amazon.com/gp/feature.html?ie=UTF8&docId=1000234621)
12 | 
13 | ## Configuration
14 | 
15 | Copy the supplied `rss2mobi.config.example` to `rss2mobi.config`.
16 | Edit this file (it is a Python dictionary literal, similar to but not strict JSON) and supply your Google Reader username and password.
17 | If `kindlegen` is not in your `PATH`, set the full pathname of `kindlegen` here too.
18 | 
19 | ## Usage
20 | 
21 | To run `rss2mobi`:
22 | 
23 |     python3.1 rss2mobi.py
24 | 
25 | The output file is `tmp/reader-YYYY-MM-DD.mobi` where `YYYY-MM-DD` is the current date.
26 | 


--------------------------------------------------------------------------------
/greader.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import pprint
 3 | import urllib.request
 4 | 
 5 | class GoogleReader:
 6 |     def __init__(self, account, password):
 7 |         self.account = account
 8 |         self.password = password
 9 | 
10 |     def login(self):
11 |         r = urllib.request.urlopen("https://www.google.com/accounts/ClientLogin?service=reader&Email={0}&Passwd={1}".format(self.account, self.password))
12 |         page = r.read().decode("ascii")
13 |         self.auth = dict(x.split("=") for x in page.split("\n") if x)["Auth"]
14 |         print("Auth:", self.auth)
15 |         r = urllib.request.urlopen(urllib.request.Request("http://www.google.com/reader/api/0/token", headers={"Authorization": "GoogleLogin auth={0}".format(self.auth)}))
16 |         self.token = r.read().decode("utf-8")
17 |         print("Token:", self.token)
18 | 
19 |     def reading_list(self, label=None):
20 |         retval = None
21 |         continuation = None
22 |         while True:
23 |             r = urllib.request.urlopen(urllib.request.Request("http://www.google.com/reader/api/0/stream/contents/user/-/{items}?xt=user/-/state/com.google/read{cont}".format(
24 |                 items="label/{0}".format(label) if label else "state/com.google/reading-list",
25 |                 cont="&c={0}".format(continuation) if continuation else "",
26 |             ), headers={"Authorization": "GoogleLogin auth={0}".format(self.auth)}))
27 |             data = json.loads(r.read().decode("utf-8"))
28 |             if retval is None:
29 |                 retval = data
30 |             else:
31 |                 retval['items'].extend(data['items'])
32 |             print(len(data['items']))
33 |             if 'continuation' not in data:
34 |                 break
35 |             continuation = data['continuation']
36 |         return retval
37 | 
38 |     def mark_read(self, feed, id):
39 |         tries = 0
40 |         while True:
41 |             try:
42 |                 r = urllib.request.urlopen(urllib.request.Request("http://www.google.com/reader/api/0/edit-tag", headers={"Authorization": "GoogleLogin auth={0}".format(self.auth)}), "a=user/-/state/com.google/read&s={0}&i={1}&T={2}".format(feed, id, self.token).encode("utf-8"))
43 |                 assert r.read().decode("utf-8") == "OK"
44 |                 break
45 |             except urllib.error.URLError as x:
46 |                 if tries == 0:
47 |                     print("mark_read id={0}".format(id))
48 |                 tries += 1
49 |                 if tries < 3:
50 |                     print("  retry: {}".format(x))
51 |                 else:
52 |                     raise
53 | 


--------------------------------------------------------------------------------
/rss2mobi.config.example:
--------------------------------------------------------------------------------
1 | {
2 |     'account': "username",
3 |     'password': "password",
4 |     'kindlegen': "kindlegen",
5 | }
6 | 


--------------------------------------------------------------------------------
/rss2mobi.py:
--------------------------------------------------------------------------------
  1 | import ast
  2 | import cgi
  3 | import hashlib
  4 | import html.parser
  5 | import os
  6 | import pprint
  7 | import re
  8 | import shutil
  9 | import subprocess
 10 | import sys
 11 | import tempfile
 12 | import time
 13 | import urllib
 14 | 
 15 | import greader
 16 | 
# Images downloaded so far, as (id, filename, content-type) tuples. Filled in
# by ImageRewriter and later listed in the OPF manifest by the main script.
g_Images = set()
 18 | 
 19 | def nameify(s):
 20 |     return s.replace("/", "_").replace(":", "_")
 21 | 
 22 | def fetch_retry(url):
 23 |     tries = 0
 24 |     while True:
 25 |         try:
 26 |             return urllib.request.urlopen(url)
 27 |         except urllib.error.URLError as x:
 28 |             tries += 1
 29 |             if tries < 3:
 30 |                 print("  retry: {}".format(x))
 31 |             else:
 32 |                 raise
 33 | 
 34 | def lget(attrs, name):
 35 |     try:
 36 |         i = [x[0] for x in attrs].index(name)
 37 |         return attrs[i][1]
 38 |     except ValueError:
 39 |         return None
 40 | 
 41 | class ImageRewriter(html.parser.HTMLParser):
 42 |     def __init__(self):
 43 |         super().__init__()
 44 |         self.output = ""
 45 |         self.images = []
 46 |     def handle_starttag(self, tag, attrs):
 47 |         if tag == "img":
 48 |             if lget(attrs, "height") != "1" or lget(attrs, "width") != "1":
 49 |                 i = [x[0] for x in attrs].index("src")
 50 |                 src = attrs[i][1]
 51 |                 id = hashlib.sha1(src.encode("utf-8")).hexdigest()
 52 | 
 53 |                 print("fetching {}".format(src))
 54 |                 try:
 55 |                     r = fetch_retry(src)
 56 |                     ct = r.getheader("Content-Type")
 57 |                     if ';' in ct:
 58 |                         # handle bizarre Content-Type: image/png; charset=UTF-8
 59 |                         ct = ct[:ct.index(';')]
 60 |                     ext = {
 61 |                         "image/gif":  ".gif",
 62 |                         "image/jpeg": ".jpg",
 63 |                         "image/png":  ".png",
 64 |                     }.get(ct)
 65 |                     if ext is not None:
 66 |                         with open(os.path.join(dir, id + ext), "wb") as f:
 67 |                             f.write(r.read())
 68 |                         if SizeLimit:
 69 |                             p = subprocess.Popen(["identify", os.path.join(dir, id + ext)], stdout=subprocess.PIPE)
 70 |                             stdout, _ = p.communicate()
 71 |                             m = re.search(b"(\\d+)x(\\d+)", stdout)
 72 |                             if m is not None:
 73 |                                 width = int(m.group(1))
 74 |                                 if width > SizeLimit:
 75 |                                     subprocess.call(["mogrify", "-geometry", str(SizeLimit), os.path.join(dir, id + ext)])
 76 |                         attrs[i] = (attrs[i][0], id + ext)
 77 |                         g_Images.add((id, id + ext, ct))
 78 |                     else:
 79 |                         print("  {}".format(ct))
 80 |                     self.output += "<{0}{1} />".format(tag, "".join((' {0}="{1}"'.format(x[0], cgi.escape(x[1], True)) if x[1] else " {0}".format(x[0])) for x in attrs))
 81 |                 except Exception as x:
 82 |                     print("oh well:", x)
 83 |                     self.output += self.get_starttag_text()
 84 |                     return
 85 |         else:
 86 |             self.output += self.get_starttag_text()
 87 |     def handle_endtag(self, tag):
 88 |         self.output += "</{}>".format(tag)
 89 |     def handle_data(self, data):
 90 |         self.output += data
 91 |     def handle_charref(self, name):
 92 |         self.output += "&#{};".format(name)
 93 |     def handle_entityref(self, name):
 94 |         self.output += "&{};".format(name)
 95 | 
 96 | KeepUnread = False
 97 | Label = None
 98 | PostLimit = 200
 99 | SizeLimit = None
100 | 
101 | i = 1
102 | while i < len(sys.argv):
103 |     a = sys.argv[i]
104 |     if a in ("--keep-unread", "-k"):
105 |         KeepUnread = True
106 |     elif a in ("--label", "-l"):
107 |         i += 1
108 |         Label = sys.argv[i]
109 |     elif a in ("--post-limit", "-p"):
110 |         i += 1
111 |         PostLimit = int(sys.argv[i])
112 |     elif a in ("--size-limit", "-s"):
113 |         i += 1
114 |         SizeLimit = int(sys.argv[i])
115 |     else:
116 |         print("Unknown command line option: {}".format(a))
117 |         sys.exit(1)
118 |     i += 1
119 | 
# The config file holds a Python dict literal (see rss2mobi.config.example);
# ast.literal_eval evaluates literals only and never runs code.
with open("rss2mobi.config") as f:
    Config = ast.literal_eval(f.read())

# Local date as YYYY-MM-DD, used in titles and output file names.
today = time.strftime("%Y-%m-%d")

reader = greader.GoogleReader(Config['account'], Config['password'])
reader.login()
feed = reader.reading_list(label=Label)

# Walk the items in reverse of the order the API returned them, keep only
# those with an 'alternate' link, and keep the first occurrence of each id.
# A set makes the duplicate check O(1) per item.
seen_ids = set()
unique_items = []
for item in reversed(feed['items']):
    if 'alternate' not in item or item['id'] in seen_ids:
        continue
    seen_ids.add(item['id'])
    unique_items.append(item)
feed['items'] = unique_items

if len(feed['items']) > PostLimit:
    print("Posts limited to maximum of {} (from {})".format(PostLimit, len(feed['items'])))
    feed['items'] = feed['items'][:PostLimit]
135 | 
# Working directory for all generated files. ImageRewriter reads this
# module-level name, so it must stay ``dir`` even though it shadows a builtin.
dir = "tmp"
if os.path.exists(dir):
    shutil.rmtree(dir)
os.mkdir(dir)

# Dump of the raw feed, kept for debugging.
with open(os.path.join(dir, "reading_list.out"), "w", encoding="utf-8") as f:
    pprint.pprint(feed, stream=f)

# One HTML file per item.
for e in feed['items']:
    fname = nameify(e['id']) + ".html"
    e['fname'] = fname
    e.setdefault('title', "(no title)")
    e['origin'].setdefault('title', "(no name)")

    which = "content" if "content" in e else "summary"
    content = e[which]['content'] if which in e else ""
    rw = ImageRewriter()
    rw.feed(content)
    rw.close()  # flush any text the parser is still buffering

    # NOTE(review): titles and hrefs are written as received; presumably the
    # API delivers them already HTML-escaped - confirm before adding escaping.
    with open(os.path.join(dir, fname), "w", encoding="utf-8") as f:
        print("""<?xml version="1.0" encoding="UTF-8"?>""", file=f)
        print("""<html xmlns="http://www.w3.org/1999/xhtml">""", file=f)
        print("<body>", file=f)
        print("<h1>{0}</h1>".format(e['origin']['title']), file=f)
        print("""<h2><a href="{0}">{1}</a></h2>""".format(e['alternate'][0]['href'], e['title']), file=f)
        print(rw.output, file=f)
        print("</body>", file=f)
        print("</html>", file=f)

# Human-readable table of contents.
with open(os.path.join(dir, "contents.html"), "w", encoding="utf-8") as f:
    print("""<?xml version="1.0" encoding="UTF-8"?>""", file=f)
    # The closing quote of the xmlns attribute was missing here.
    print("""<html xmlns="http://www.w3.org/1999/xhtml">""", file=f)
    print("<body>", file=f)
    print("<h1>Google Reader {}</h1>".format(today), file=f)
    print("<h2>{0}</h2>".format(len(feed['items'])), file=f)
    for e in feed['items']:
        print("""<p><a href="{0}">{2} ({1})</a></p>""".format(e['fname'], e['origin']['title'], e['title']), file=f)
    print("</body>", file=f)
    print("</html>", file=f)

# NCX navigation file: the contents page first, then one entry per item.
with open(os.path.join(dir, "toc.ncx"), "w", encoding="utf-8") as f:
    print("""<?xml version="1.0" encoding="UTF-8"?>""", file=f)
    print("""<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1" xml:lang="en-US">""", file=f)
    print("""<docTitle><text>Google Reader {}</text></docTitle>""".format(today), file=f)
    print("<navMap>", file=f)
    print("""<navPoint class="toc" id="toc" playOrder="1">""", file=f)
    print("""  <navLabel><text>Table of Contents</text></navLabel>""", file=f)
    print("""  <content src="contents.html" />""", file=f)
    print("""</navPoint>""", file=f)
    for order, e in enumerate(feed['items'], start=2):
        print("""<navPoint class="chapter" id="{0}" playOrder="{1}">""".format(e['fname'], order), file=f)
        print("""  <navLabel><text>{}</text></navLabel>""".format(e['title']), file=f)
        print("""  <content src="{}" />""".format(e['fname']), file=f)
        print("""</navPoint>""", file=f)
    print("</navMap>", file=f)
    print("</ncx>", file=f)

# OPF package file: the manifest lists every file, the spine the reading order.
opfn = "reader-{0}.opf".format(today)
with open(os.path.join(dir, opfn), "w", encoding="utf-8") as f:
    print("""<?xml version="1.0" encoding="UTF-8"?>
    <package unique-identifier="uid" xmlns:dc="Dublin Core">
        <metadata>
            <dc-metadata>
                <dc:Identifier id="uid">reader-{0}</dc:Identifier>
                <dc:Title>Reader {0} ({1})</dc:Title>
                <dc:Language>EN</dc:Language>
                <dc:Date>{0}</dc:Date>
            </dc-metadata>
        </metadata>
        <manifest>""".format(today, len(feed['items'])), file=f)
    print("""<item id="contents" href="contents.html" media-type="text/html" />""", file=f)
    for e in feed['items']:
        print("""<item id="{0}" href="{0}.html" media-type="text/html" />""".format(nameify(e['id'])), file=f)
    for image in g_Images:
        print("""<item id="{0}" href="{1}" media-type="{2}" />""".format(*image), file=f)
    print("""<item id="toc" media-type="application/x-dtbncx+xml" href="toc.ncx" />""", file=f)
    print("""</manifest>
        <spine toc="toc">""", file=f)
    print("""<itemref idref="contents" />""", file=f)
    for e in feed['items']:
        print("""<itemref idref="{0}" />""".format(nameify(e['id'])), file=f)
    print("""</spine>
    </package>""", file=f)

# Run kindlegen with an argument list (no shell), so a kindlegen path that
# contains spaces works. The exit status is not inspected; success is judged
# by whether the .mobi file exists, as before.
subprocess.call([Config['kindlegen'], os.path.join(dir, opfn)])
mobi_path = os.path.join(dir, "reader-{0}.mobi".format(today))
if not os.path.exists(mobi_path):
    # Explicit check rather than assert: under -O an assert would vanish and
    # the items would be marked read although no book was produced.
    sys.exit("kindlegen did not produce {0}; items were not marked read".format(mobi_path))

if not KeepUnread:
    for e in feed['items']:
        reader.mark_read(e['origin']['streamId'], e['id'])
233 | 


--------------------------------------------------------------------------------