├── saved
│   └── .gitkeep
├── README.md
└── totext.py

/saved/.gitkeep:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# totext - Convert URL or RSS feed to text with readability

Love plaintext? This script downloads a URL, parses it with readability and
returns the plaintext (as markdown). It supports RSS feeds (every article in
the feed is converted) and saves each article to disk.

My use case is twofold: one is to convert RSS feeds to a [Gopher site][1], the
second is to get the full article text in my RSS reader.

The script contains a few workarounds for so-called cookiewalls. It also pauses
between RSS feed articles to avoid making excessive requests.

The readability part is handled by Python; no external services are used.
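
The conversion itself is a two-step pipeline: readability-lxml extracts the
main article from the HTML, and html2text turns that HTML into markdown-style
plaintext. A minimal sketch of that pipeline (the URL is just an example; the
full script below adds cookie workarounds, RSS handling, fallbacks and saving):

    # Minimal sketch of the readability + html2text pipeline
    import requests
    import html2text
    from readability import Document

    response = requests.get("https://raymii.org/", timeout=30)
    response.encoding = response.apparent_encoding

    doc = Document(response.text)           # readability: extract the main article
    converter = html2text.HTML2Text()
    converter.inline_links = False          # reference style links
    converter.body_width = 72

    print(doc.short_title())
    print(converter.handle(doc.summary()))  # markdown-ish plaintext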

Here's an example of a news article. On the left, the text-only parsed version,
on the right, the webpage:

![example][2]

[Demo site via Gopher][5]

[Github repo with source code][3]

[More info over at raymii.org][4]

## Installation

First install the required libraries.

On Ubuntu:

    apt-get install python python-pip # python 2
    pip install html2text requests readability-lxml feedparser lxml

On other distributions, use the `pip` command above.

Clone the repository:

    git clone https://github.com/RaymiiOrg/to-text.py

## Usage

    usage: totext.py [-h] -u URL [-s SLEEP] [-t TIMEOUT] [-r] [-n] [-o] [-f]

    Convert HTML page to text using readability and html2text.

    arguments:
      -h, --help            show this help message and exit
      -u URL, --url URL     URL to convert (required)
      -s SLEEP, --sleep SLEEP
                            Sleep X seconds between URLs (only in rss)
      -t TIMEOUT, --timeout TIMEOUT
                            Timeout for HTTP requests
      -r, --rss             URL is RSS feed. Parse every item in feed
      -n, --noprint         Don't print converted contents
      -o, --original        Don't parse content with readability
      -f, --forcedownload   Force download even if the content-type header
                            suggests the file is not text

If you want to run the script via a cronjob, use the `-n` option to suppress output.
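
For example, a crontab entry along these lines fetches a feed every hour (the
checkout path is illustrative; run the script from its own directory, since it
writes to the relative `saved/` folder):

    0 * * * * cd /path/to/to-text.py && python totext.py -n -r -u https://raymii.org/s/feed.xml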

If an article doesn't look good (for example, you get comments instead of the
article text), use the `--original` option to convert the page to text directly
instead of running it through readability. You will get extra markup and
navigation elements in the output.

If readability parsing fails, the saved article will contain the text
`Parsing with Readability failed`, followed by the full page converted to text
without readability.

## Examples

    python totext.py --rss --url https://raymii.org/s/feed.xml
    python totext.py --url https://www.rd.nl/vandaag/binnenland/grootste-stijging-verkeersdoden-in-jaren-1.1562067

## Saved text

Every converted article is also saved to the folder `saved/$hostname`. The
filenames are prefixed with the article date (or the fetch date), so they sort
chronologically.
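
The naming scheme mirrors what `save_doc()` in the script does: everything in
the title that is not a letter or digit is replaced by an underscore, and the
timestamp uses the `%Y%m%dT%H%M` format. A condensed sketch (the hostname and
title values are illustrative):

    import re
    from datetime import datetime

    hostname = "www.rd.nl"                     # urlparse(url).hostname
    title = "Grootste stijging verkeersdoden"  # doc.short_title()
    posttime = datetime.now().strftime("%Y%m%dT%H%M")
    safe_title = re.sub(r'[^A-Za-z0-9]', '_', title)
    filename = "saved/" + hostname + "/" + posttime + "_" + safe_title + ".txt"
    # -> saved/www.rd.nl/<timestamp>_Grootste_stijging_verkeersdoden.txt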

## License

GNU GPLv2.

## Pygopherd reverse directory sort

To list the newest saved articles first in pygopherd, reverse the sort order in
its UMN directory handler:

    vim /usr/lib/python2.7/dist-packages/pygopherd/handlers/UMN.py

    class UMNDirHandler(DirHandler):
        """This module strives to be bug-compatible with UMN gopherd."""

        def prepare(self):
            """Override parent to do a few more things and override sort order."""
            # Initialize.
            self.linkentries = []

            # Let the parent do the directory walking for us. Will call
            # prep_initfiles_canaddfile and prep_entriesappend.
            if DirHandler.prepare(self):
                # Returns 1 if it didn't load from the cache.
                # Merge and sort.
                self.MergeLinkFiles()
    -            self.fileentries.sort(self.entrycmp)
    +            self.fileentries.sort(self.entrycmp, reverse=True)

[1]: https://raymii.org/s/blog/Site_updates_raymii.org_now_on_gopher.html
[2]: https://raymii.org/s/inc/img/txtnws.png
[3]: https://github.com/RaymiiOrg/to-text.py
[4]: https://raymii.org/s/software/totext.py-Convert_URL_or_RSS_feed_to_plaintext_with_readability.html
[5]: https://txtn.ws
--------------------------------------------------------------------------------
/totext.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python2
# Author: Remy van Elst
# License: GNU GPLv2

# pip install html2text requests readability-lxml feedparser lxml
import requests
import lxml.html
from readability import Document
import html2text
import argparse
import feedparser
import sys
import time
import re
import os
from datetime import datetime
from time import mktime
from urlparse import urlparse
import urllib3
# The only insecure call we do is on purpose (the retry without verification).
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

parser = argparse.ArgumentParser(description='Convert HTML page to text using readability and html2text.')
parser.add_argument('-u', '--url', help='URL to convert', required=True)
parser.add_argument('-s', '--sleep', help='Sleep X seconds between URLs (only in rss)',
                    type=int, default=3)
parser.add_argument('-t', '--timeout', help='Timeout for HTTP requests',
                    type=int, default=30)
parser.add_argument('-r', '--rss', help="URL is RSS feed. Parse every item in feed",
                    default=False, action="store_true")
parser.add_argument('-n', '--noprint', help="Don't print converted contents",
                    default=False, action="store_true")
parser.add_argument('-o', '--original', help="Don't parse contents with readability.",
                    default=False, action="store_true")
parser.add_argument('-f', '--forcedownload',
                    help="Force download even if file seems to be something else than text based on the content-type header.",
                    default=False, action="store_true")
args = vars(parser.parse_args())

# Default request headers: identify as the Tiny Tiny RSS reader.
headers = {'User-Agent': 'Tiny Tiny RSS/19.2 (1a484ec) (http://tt-rss.org/)'}


class mockResponse(object):
    """Duck-types the only part of a requests response we use (.text),
    so error messages flow through the same code path as real responses."""
    text = ""

    def __init__(self, text):
        super(mockResponse, self).__init__()
        self.text = text


def cookie_workaround_ad(url):
    # ad.nl cookiewall: request the article via the accept URL.
    hostname = urlparse(url).hostname
    if hostname == "ad.nl" or hostname == "www.ad.nl":
        url = "https://www.ad.nl/accept?url=" + url
    return url


def custom_workaround_twitter(url):
    # Twitter: fetch the page for its authenticity token, then get the
    # article from mobile.twitter.com and skip readability parsing.
    hostname = urlparse(url).hostname
    if hostname == "twitter.com" or hostname == "www.twitter.com":
        session_requests = requests.session()
        path = urlparse(url).path
        jar = requests.cookies.RequestsCookieJar()
        content = session_requests.get(url, headers=headers,
                                       timeout=args['timeout'], cookies=jar)
        content.encoding = content.apparent_encoding
        tree = lxml.html.fromstring(content.text)
        auth_token = list(set(
            tree.xpath("//input[@name='authenticity_token']/@value")))[0]
        url = "https://mobile.twitter.com" + path
        payload = {"authenticity_token": auth_token}
        content2 = session_requests.get(url, headers=headers,
                                        timeout=args['timeout'], cookies=jar,
                                        allow_redirects=True)
        content2.encoding = content2.apparent_encoding
        content2.raise_for_status()
        args['original'] = True
        return content2


def custom_workaround_noparse(url):
    # Sites where readability extraction makes no sense; convert them as-is.
    hostname = urlparse(url).hostname
    if hostname == "news.ycombinator.com" or \
            hostname == "youtube.com" or \
            hostname == "www.youtube.com":
        args['original'] = True


def cookie_workaround_tweakers(url):
    hostname = urlparse(url).hostname
    if hostname == "tweakers.net" or hostname == "www.tweakers.net":
        headers['X-Cookies-Accepted'] = '1'


def custom_workaround_verisimilitudes(url):
    # verisimilitudes.net: fetch the page for its title and write a gophermap
    # that points to the article on the site's own Gopher server (port 70).
    hostname = urlparse(url).hostname
    if hostname == "verisimilitudes.net" or hostname == "www.verisimilitudes.net":
        path = urlparse(url).path.replace("/", "", 1)
        if path:
            response = get_url(args['url'], workarounds=False)
            doc = convert_doc(response.text)
            text = convert_doc_to_text(doc.content())
            if not text:
                title = "Parsing failed"
            else:
                title = doc.short_title().encode('utf-8').strip()
            save_gophermap("", title, hostname, 1, path, 70)


def cookie_workaround_rd(url):
    hostname = urlparse(url).hostname
    if hostname == "rd.nl" or hostname == "www.rd.nl":
        headers['cookieInfoV4'] = "1"


def cookie_workaround_geenstijl(url):
    hostname = urlparse(url).hostname
    if hostname == "geenstijl.nl" or hostname == "www.geenstijl.nl":
        headers['Cookie'] = "cpc=10"


def cookie_workarounds_url(url):
    # Workarounds that rewrite the URL itself.
    url = cookie_workaround_ad(url)
    return url


def cookie_workarounds_header(url):
    # Workarounds that only need an extra header or cookie.
    cookie_workaround_tweakers(url)
    cookie_workaround_rd(url)
    cookie_workaround_geenstijl(url)


def custom_content_workaround(url):
    # Workarounds that fetch the content themselves.
    custom_content = custom_workaround_twitter(url)
    custom_workaround_noparse(url)
    custom_workaround_verisimilitudes(url)
    return custom_content


def get_url(url, workarounds=True):
    """Fetch an URL and return the response. Applies the cookiewall
    workarounds and retries once on SSL errors, HTTP errors and timeouts."""
    if workarounds:
        url = cookie_workarounds_url(url)
        cookie_workarounds_header(url)
        custom_content = custom_content_workaround(url)
        if custom_content:
            return custom_content
    try:
        r = requests.get(url, headers=headers,
                         timeout=args['timeout'])
        r.raise_for_status()
    except requests.exceptions.SSLError:
        print("SSL Error. Retrying once")
        time.sleep(5)
        r = requests.get(url, headers=headers,
                         timeout=args['timeout'], verify=False)
        r.raise_for_status()
    except requests.exceptions.HTTPError:
        print("HTTP Error. Retrying once")
        time.sleep(5)
        r = requests.get(url, headers=headers,
                         timeout=args['timeout'])
    except requests.exceptions.ReadTimeout:
        print("ReadTimeout, retrying once")
        time.sleep(5)
        r = requests.get(url, headers=headers,
                         timeout=args['timeout'])
        r.raise_for_status()

    r.encoding = r.apparent_encoding
    try:
        # Refuse to convert content that is clearly not a web page,
        # unless --forcedownload is given.
        if "message" in r.headers['content-type'] or \
                "image" in r.headers['content-type'] or \
                "pdf" in r.headers['content-type'] or \
                "model" in r.headers['content-type'] or \
                "multipart" in r.headers['content-type'] or \
                "audio" in r.headers['content-type'] or \
                "font" in r.headers['content-type'] or \
                "video" in r.headers['content-type']:
            if args['forcedownload']:
                return r
            else:
                return mockResponse("This might not be a html file but something "
                                    "else, like a PDF or an audio file. Use the "
                                    "--forcedownload flag to download and parse "
                                    "this anyway. The content type reported for "
                                    "this file is: %s\n\n" % (r.headers['content-type']))
    except KeyError:
        pass
    if len(r.text) > 5:
        return r
    else:
        return mockResponse("Empty Response")


def convert_doc(html_text):
    # Readability document with hints about what is (not) article content.
    return Document(input=html_text,
                    positive_keywords=["articleColumn", "article", "content",
                                       "category_news"],
                    negative_keywords=["commentColumn", "comment", "comments",
                                       "posting_list reply", "posting reply", "reply"])


def convert_doc_to_text(doc_summary):
    # Convert (readability-cleaned) HTML to plaintext. Anything shorter than
    # 500 characters is treated as a failed conversion.
    h = html2text.HTML2Text()
    h.inline_links = False  # reference style links
    h.wrap_links = False
    h.body_width = 72
    doc = h.handle(doc_summary).encode('utf-8').strip()
    if len(doc) > 500:
        return doc
    return ""


def save_doc(text, title, url, rssDate=0):
    """Save the converted article to saved/$hostname/$date_$title.txt."""
    hostname = urlparse(url).hostname
    if not os.path.exists("saved/" + hostname):
        os.makedirs("saved/" + hostname)
    filename = re.sub(r'[^A-Za-z0-9]', '_', title)
    if rssDate:
        posttime = datetime.fromtimestamp(mktime(rssDate)).strftime("%Y%m%dT%H%M")
    else:
        posttime = datetime.now().strftime("%Y%m%dT%H%M")
    filename = "saved/" + hostname + "/" + posttime + "_" + filename + ".txt"
    if not os.path.exists(filename):
        with open(filename, "w") as textfile:
            textfile.write("# " + title)
            textfile.write("\nSource URL: \t" + url)
            textfile.write("\nDate: \t\t" + posttime)
            textfile.write("\n\n")
            textfile.write(text)
    return filename


def save_gophermap(text, title, server, gophertype, filename, gopherport):
    """Write a gophermap in saved/$server/$date/ linking to the article."""
    if not os.path.exists("saved/" + server):
        os.makedirs("saved/" + server)
    posttime = datetime.now().strftime("%Y%m%dT%H%M")
    if not os.path.exists("saved/" + server + "/" + posttime):
        os.makedirs("saved/" + server + "/" + posttime)
    gmfile = "saved/" + server + "/" + posttime + "/gophermap"
    if not os.path.exists(gmfile):
        with open(gmfile, "w") as gophermap:
            gophermap.write("i\t/\tlocalhost\t70\n")
            gophermap.write("iThis article is available via Gopher. \t/\tlocalhost\t70\n")
            gophermap.write("iPlease follow the link\t/\tlocalhost\t70\n")
            gophermap.write(("%i%s on %s\t/%s\t%s\t%i\n") % (gophertype,
                            title, server, filename, server, gopherport))
            gophermap.write("i\t/\tlocalhost\t70\n")
            gophermap.write(("iLast update: %s\t/\tlocalhost\t70\n") % (posttime))
            gophermap.write(text)
    return gmfile


response = get_url(args['url'])

if args['rss']:
    # RSS mode: fetch and convert every article in the feed.
    feed = feedparser.parse(response.text)
    if feed.bozo:
        print("Invalid XML.")
        sys.exit(1)
    else:
        for post in feed.entries:
            try:
                response = get_url(post['link'])
            except Exception:
                response = mockResponse("Failed to get RSS article.")
            doc = convert_doc(response.text)
            if args['original']:
                text = convert_doc_to_text(doc.content())
            else:
                text = convert_doc_to_text(doc.summary())
            if not text:
                text = "Parsing with Readability failed. Original content:\n\n"
                text += convert_doc_to_text(doc.content())
            title = doc.short_title().encode('utf-8').strip()
            try:
                rssDate = post['published_parsed']
            except KeyError:
                try:
                    rssDate = post['updated_parsed']
                except KeyError:
                    try:
                        rssDate = post['created_parsed']
                    except KeyError:
                        rssDate = 0
            filename = save_doc(text, title[:150], post['link'], rssDate)
            if not args['noprint']:
                print("\n\n========================\n\n")
                print("# " + title)
                print("Source URL: " + post['link'])
                print("\n")
                print(text)
                print("file saved as " + filename)
            if args['sleep']:
                time.sleep(args['sleep'])
else:
    doc = convert_doc(response.text)
    if args['original']:
        text = convert_doc_to_text(doc.content())
    else:
        text = convert_doc_to_text(doc.summary())
    if not text:
        text = "Parsing with Readability failed. Original content:\n\n"
        text += convert_doc_to_text(doc.content())
    title = doc.short_title().encode('utf-8').strip()
    filename = save_doc(text, title[:150], args['url'])
    if not args['noprint']:
        print("\n\n========================\n\n")
        print("# " + title)
        print("Source URL: " + args['url'])
        print("\n")
        print(text)
        print("file saved as " + filename)
--------------------------------------------------------------------------------