├── saved
│   └── .gitkeep
├── README.md
└── totext.py

/saved/.gitkeep:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# totext - Convert URL or RSS feed to text with readability

Love plaintext? This script downloads a URL, parses it with readability and
returns the plaintext (as markdown). It supports RSS feeds (every article in
the feed is converted) and saves each article to disk.

My use case is twofold: one is to convert RSS feeds to a [Gopher site][1], the
second is to get the full article text in my RSS reader.

The script contains a few workarounds for so-called cookiewalls. It also pauses
between RSS feed articles to avoid making excessive requests.

The readability part is handled by Python; no external services are used.
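
The conversion itself is a two-step pipeline: readability-lxml extracts the
main article from the HTML, and html2text turns that HTML into markdown-style
plaintext. A minimal sketch of that pipeline (the URL is just an example; the
full script below adds cookie workarounds, RSS handling, fallbacks and saving):

    # Minimal sketch of the readability + html2text pipeline
    import requests
    import html2text
    from readability import Document

    response = requests.get("https://raymii.org/", timeout=30)
    response.encoding = response.apparent_encoding

    doc = Document(response.text)           # readability: extract the main article
    converter = html2text.HTML2Text()
    converter.inline_links = False          # reference style links
    converter.body_width = 72

    print(doc.short_title())
    print(converter.handle(doc.summary()))  # markdown-ish plaintext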

Here's an example of a news article. On the left, the text-only parsed version,
on the right, the webpage:

![example][2]

[Demo site via Gopher][5]

[Github repo with source code][3]

[More info over at raymii.org][4]

## Installation

First install the required libraries.

On Ubuntu:

    apt-get install python python-pip # python 2
    pip install html2text requests readability-lxml feedparser lxml

On other distributions, use the `pip` command above.

Clone the repository:

    git clone https://github.com/RaymiiOrg/to-text.py

## Usage

    usage: totext.py [-h] -u URL [-s SLEEP] [-t TIMEOUT] [-r] [-n] [-o] [-f]

    Convert HTML page to text using readability and html2text.

    arguments:
      -h, --help            show this help message and exit
      -u URL, --url URL     URL to convert (required)
      -s SLEEP, --sleep SLEEP
                            Sleep X seconds between URLs (only in rss)
      -t TIMEOUT, --timeout TIMEOUT
                            Timeout for HTTP requests
      -r, --rss             URL is RSS feed. Parse every item in feed
      -n, --noprint         Don't print converted contents
      -o, --original        Don't parse content with readability
      -f, --forcedownload   Force download even if the content-type header
                            suggests the file is not text

If you want to run the script via a cronjob, use the `-n` option to suppress output.
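
For example, a crontab entry along these lines fetches a feed every hour (the
checkout path is illustrative; run the script from its own directory, since it
writes to the relative `saved/` folder):

    0 * * * * cd /path/to/to-text.py && python totext.py -n -r -u https://raymii.org/s/feed.xml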

If an article doesn't look good (for example, you get comments instead of the
article text), use the `--original` option to convert the page to text directly
instead of running it through readability. You will get extra markup and
navigation elements in the output.

If readability parsing fails, the saved article will contain the text
`Parsing with Readability failed`, followed by the full page converted to text
without readability.

## Examples

    python totext.py --rss --url https://raymii.org/s/feed.xml
    python totext.py --url https://www.rd.nl/vandaag/binnenland/grootste-stijging-verkeersdoden-in-jaren-1.1562067

## Saved text

Every converted article is also saved to the folder `saved/$hostname`. The
filenames are prefixed with the article date (or the fetch date), so they sort
chronologically.
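
The naming scheme mirrors what `save_doc()` in the script does: everything in
the title that is not a letter or digit is replaced by an underscore, and the
timestamp uses the `%Y%m%dT%H%M` format. A condensed sketch (the hostname and
title values are illustrative):

    import re
    from datetime import datetime

    hostname = "www.rd.nl"                     # urlparse(url).hostname
    title = "Grootste stijging verkeersdoden"  # doc.short_title()
    posttime = datetime.now().strftime("%Y%m%dT%H%M")
    safe_title = re.sub(r'[^A-Za-z0-9]', '_', title)
    filename = "saved/" + hostname + "/" + posttime + "_" + safe_title + ".txt"
    # -> saved/www.rd.nl/<timestamp>_Grootste_stijging_verkeersdoden.txt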

## License

GNU GPLv2.

## Pygopherd reverse directory sort

To list the newest saved articles first in pygopherd, reverse the sort order in
its UMN directory handler:

    vim /usr/lib/python2.7/dist-packages/pygopherd/handlers/UMN.py

    class UMNDirHandler(DirHandler):
        """This module strives to be bug-compatible with UMN gopherd."""

        def prepare(self):
            """Override parent to do a few more things and override sort order."""
            # Initialize.
            self.linkentries = []

            # Let the parent do the directory walking for us. Will call
            # prep_initfiles_canaddfile and prep_entriesappend.
            if DirHandler.prepare(self):
                # Returns 1 if it didn't load from the cache.
                # Merge and sort.
                self.MergeLinkFiles()
    -            self.fileentries.sort(self.entrycmp)
    +            self.fileentries.sort(self.entrycmp, reverse=True)

[1]: https://raymii.org/s/blog/Site_updates_raymii.org_now_on_gopher.html
[2]: https://raymii.org/s/inc/img/txtnws.png
[3]: https://github.com/RaymiiOrg/to-text.py
[4]: https://raymii.org/s/software/totext.py-Convert_URL_or_RSS_feed_to_plaintext_with_readability.html
[5]: https://txtn.ws
--------------------------------------------------------------------------------
/totext.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python2
# Author: Remy van Elst
# License: GNU GPLv2

# pip install html2text requests readability-lxml feedparser lxml
import requests
import lxml.html
from readability import Document
import html2text
import argparse
import feedparser
import sys
import time
import re
import os
from datetime import datetime
from time import mktime
from urlparse import urlparse
import urllib3
# The only insecure call we do is on purpose (the retry without verification).
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

parser = argparse.ArgumentParser(description='Convert HTML page to text using readability and html2text.')
parser.add_argument('-u', '--url', help='URL to convert', required=True)
parser.add_argument('-s', '--sleep', help='Sleep X seconds between URLs (only in rss)',
                    type=int, default=3)
parser.add_argument('-t', '--timeout', help='Timeout for HTTP requests',
                    type=int, default=30)
parser.add_argument('-r', '--rss', help="URL is RSS feed. Parse every item in feed",
                    default=False, action="store_true")
parser.add_argument('-n', '--noprint', help="Don't print converted contents",
                    default=False, action="store_true")
parser.add_argument('-o', '--original', help="Don't parse contents with readability.",
                    default=False, action="store_true")
parser.add_argument('-f', '--forcedownload',
                    help="Force download even if file seems to be something else than text based on the content-type header.",
                    default=False, action="store_true")
args = vars(parser.parse_args())

# Default request headers: identify as the Tiny Tiny RSS reader.
headers = {'User-Agent': 'Tiny Tiny RSS/19.2 (1a484ec) (http://tt-rss.org/)'}


class mockResponse(object):
    """Duck-types the only part of a requests response we use (.text),
    so error messages flow through the same code path as real responses."""
    text = ""

    def __init__(self, text):
        super(mockResponse, self).__init__()
        self.text = text


def cookie_workaround_ad(url):
    # ad.nl cookiewall: request the article via the accept URL.
    hostname = urlparse(url).hostname
    if hostname == "ad.nl" or hostname == "www.ad.nl":
        url = "https://www.ad.nl/accept?url=" + url
    return url


def custom_workaround_twitter(url):
    # Twitter: fetch the page for its authenticity token, then get the
    # article from mobile.twitter.com and skip readability parsing.
    hostname = urlparse(url).hostname
    if hostname == "twitter.com" or hostname == "www.twitter.com":
        session_requests = requests.session()
        path = urlparse(url).path
        jar = requests.cookies.RequestsCookieJar()
        content = session_requests.get(url, headers=headers,
                                       timeout=args['timeout'], cookies=jar)
        content.encoding = content.apparent_encoding
        tree = lxml.html.fromstring(content.text)
        auth_token = list(set(
            tree.xpath("//input[@name='authenticity_token']/@value")))[0]
        url = "https://mobile.twitter.com" + path
        payload = {"authenticity_token": auth_token}
        content2 = session_requests.get(url, headers=headers,
                                        timeout=args['timeout'], cookies=jar,
                                        allow_redirects=True)
        content2.encoding = content2.apparent_encoding
        content2.raise_for_status()
        args['original'] = True
        return content2


def custom_workaround_noparse(url):
    # Sites where readability extraction makes no sense; convert them as-is.
    hostname = urlparse(url).hostname
    if hostname == "news.ycombinator.com" or \
            hostname == "youtube.com" or \
            hostname == "www.youtube.com":
        args['original'] = True


def cookie_workaround_tweakers(url):
    hostname = urlparse(url).hostname
    if hostname == "tweakers.net" or hostname == "www.tweakers.net":
        headers['X-Cookies-Accepted'] = '1'


def custom_workaround_verisimilitudes(url):
    # verisimilitudes.net: fetch the page for its title and write a gophermap
    # that points to the article on the site's own Gopher server (port 70).
    hostname = urlparse(url).hostname
    if hostname == "verisimilitudes.net" or hostname == "www.verisimilitudes.net":
        path = urlparse(url).path.replace("/", "", 1)
        if path:
            response = get_url(args['url'], workarounds=False)
            doc = convert_doc(response.text)
            text = convert_doc_to_text(doc.content())
            if not text:
                title = "Parsing failed"
            else:
                title = doc.short_title().encode('utf-8').strip()
            save_gophermap("", title, hostname, 1, path, 70)


def cookie_workaround_rd(url):
    hostname = urlparse(url).hostname
    if hostname == "rd.nl" or hostname == "www.rd.nl":
        headers['cookieInfoV4'] = "1"


def cookie_workaround_geenstijl(url):
    hostname = urlparse(url).hostname
    if hostname == "geenstijl.nl" or hostname == "www.geenstijl.nl":
        headers['Cookie'] = "cpc=10"


def cookie_workarounds_url(url):
    # Workarounds that rewrite the URL itself.
    url = cookie_workaround_ad(url)
    return url


def cookie_workarounds_header(url):
    # Workarounds that only need an extra header or cookie.
    cookie_workaround_tweakers(url)
    cookie_workaround_rd(url)
    cookie_workaround_geenstijl(url)


def custom_content_workaround(url):
    # Workarounds that fetch the content themselves.
    custom_content = custom_workaround_twitter(url)
    custom_workaround_noparse(url)
    custom_workaround_verisimilitudes(url)
    return custom_content


def get_url(url, workarounds=True):
    """Fetch an URL and return the response. Applies the cookiewall
    workarounds and retries once on SSL errors, HTTP errors and timeouts."""
    if workarounds:
        url = cookie_workarounds_url(url)
        cookie_workarounds_header(url)
        custom_content = custom_content_workaround(url)
        if custom_content:
            return custom_content
    try:
        r = requests.get(url, headers=headers,
                         timeout=args['timeout'])
        r.raise_for_status()
    except requests.exceptions.SSLError:
        print("SSL Error. Retrying once")
        time.sleep(5)
        r = requests.get(url, headers=headers,
                         timeout=args['timeout'], verify=False)
        r.raise_for_status()
    except requests.exceptions.HTTPError:
        print("HTTP Error. Retrying once")
        time.sleep(5)
        r = requests.get(url, headers=headers,
                         timeout=args['timeout'])
    except requests.exceptions.ReadTimeout:
        print("ReadTimeout, retrying once")
        time.sleep(5)
        r = requests.get(url, headers=headers,
                         timeout=args['timeout'])
        r.raise_for_status()

    r.encoding = r.apparent_encoding
    try:
        # Refuse to convert content that is clearly not a web page,
        # unless --forcedownload is given.
        if "message" in r.headers['content-type'] or \
                "image" in r.headers['content-type'] or \
                "pdf" in r.headers['content-type'] or \
                "model" in r.headers['content-type'] or \
                "multipart" in r.headers['content-type'] or \
                "audio" in r.headers['content-type'] or \
                "font" in r.headers['content-type'] or \
                "video" in r.headers['content-type']:
            if args['forcedownload']:
                return r
            else:
                return mockResponse("This might not be a html file but something "
                                    "else, like a PDF or an audio file. Use the "
                                    "--forcedownload flag to download and parse "
                                    "this anyway. The content type reported for "
                                    "this file is: %s\n\n" % (r.headers['content-type']))
    except KeyError:
        pass
    if len(r.text) > 5:
        return r
    else:
        return mockResponse("Empty Response")


def convert_doc(html_text):
    # Readability document with hints about what is (not) article content.
    return Document(input=html_text,
                    positive_keywords=["articleColumn", "article", "content",
                                       "category_news"],
                    negative_keywords=["commentColumn", "comment", "comments",
                                       "posting_list reply", "posting reply", "reply"])


def convert_doc_to_text(doc_summary):
    # Convert (readability-cleaned) HTML to plaintext. Anything shorter than
    # 500 characters is treated as a failed conversion.
    h = html2text.HTML2Text()
    h.inline_links = False  # reference style links
    h.wrap_links = False
    h.body_width = 72
    doc = h.handle(doc_summary).encode('utf-8').strip()
    if len(doc) > 500:
        return doc
    return ""


def save_doc(text, title, url, rssDate=0):
    """Save the converted article to saved/$hostname/$date_$title.txt."""
    hostname = urlparse(url).hostname
    if not os.path.exists("saved/" + hostname):
        os.makedirs("saved/" + hostname)
    filename = re.sub(r'[^A-Za-z0-9]', '_', title)
    if rssDate:
        posttime = datetime.fromtimestamp(mktime(rssDate)).strftime("%Y%m%dT%H%M")
    else:
        posttime = datetime.now().strftime("%Y%m%dT%H%M")
    filename = "saved/" + hostname + "/" + posttime + "_" + filename + ".txt"
    if not os.path.exists(filename):
        with open(filename, "w") as textfile:
            textfile.write("# " + title)
            textfile.write("\nSource URL: \t" + url)
            textfile.write("\nDate: \t\t" + posttime)
            textfile.write("\n\n")
            textfile.write(text)
    return filename


def save_gophermap(text, title, server, gophertype, filename, gopherport):
    """Write a gophermap in saved/$server/$date/ linking to the article."""
    if not os.path.exists("saved/" + server):
        os.makedirs("saved/" + server)
    posttime = datetime.now().strftime("%Y%m%dT%H%M")
    if not os.path.exists("saved/" + server + "/" + posttime):
        os.makedirs("saved/" + server + "/" + posttime)
    gmfile = "saved/" + server + "/" + posttime + "/gophermap"
    if not os.path.exists(gmfile):
        with open(gmfile, "w") as gophermap:
            gophermap.write("i\t/\tlocalhost\t70\n")
            gophermap.write("iThis article is available via Gopher. \t/\tlocalhost\t70\n")
            gophermap.write("iPlease follow the link\t/\tlocalhost\t70\n")
            gophermap.write(("%i%s on %s\t/%s\t%s\t%i\n") % (gophertype,
                            title, server, filename, server, gopherport))
            gophermap.write("i\t/\tlocalhost\t70\n")
            gophermap.write(("iLast update: %s\t/\tlocalhost\t70\n") % (posttime))
            gophermap.write(text)
    return gmfile


response = get_url(args['url'])

if args['rss']:
    # RSS mode: fetch and convert every article in the feed.
    feed = feedparser.parse(response.text)
    if feed.bozo:
        print("Invalid XML.")
        sys.exit(1)
    else:
        for post in feed.entries:
            try:
                response = get_url(post['link'])
            except Exception:
                response = mockResponse("Failed to get RSS article.")
            doc = convert_doc(response.text)
            if args['original']:
                text = convert_doc_to_text(doc.content())
            else:
                text = convert_doc_to_text(doc.summary())
            if not text:
                text = "Parsing with Readability failed. Original content:\n\n"
                text += convert_doc_to_text(doc.content())
            title = doc.short_title().encode('utf-8').strip()
            try:
                rssDate = post['published_parsed']
            except KeyError:
                try:
                    rssDate = post['updated_parsed']
                except KeyError:
                    try:
                        rssDate = post['created_parsed']
                    except KeyError:
                        rssDate = 0
            filename = save_doc(text, title[:150], post['link'], rssDate)
            if not args['noprint']:
                print("\n\n========================\n\n")
                print("# " + title)
                print("Source URL: " + post['link'])
                print("\n")
                print(text)
                print("file saved as " + filename)
            if args['sleep']:
                time.sleep(args['sleep'])
else:
    doc = convert_doc(response.text)
    if args['original']:
        text = convert_doc_to_text(doc.content())
    else:
        text = convert_doc_to_text(doc.summary())
    if not text:
        text = "Parsing with Readability failed. Original content:\n\n"
        text += convert_doc_to_text(doc.content())
    title = doc.short_title().encode('utf-8').strip()
    filename = save_doc(text, title[:150], args['url'])
    if not args['noprint']:
        print("\n\n========================\n\n")
        print("# " + title)
        print("Source URL: " + args['url'])
        print("\n")
        print(text)
        print("file saved as " + filename)
--------------------------------------------------------------------------------