├── r2e
├── r2e.bat
├── .gitignore
├── README.md
├── config.py.example
├── test_rss2email.py
├── CHANGELOG
├── readme.html
├── html2text.py
├── rss2email.py
└── BeautifulSoup.py
/r2e:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | python rss2email.py feeds.dat $*
3 |
--------------------------------------------------------------------------------
/r2e.bat:
--------------------------------------------------------------------------------
1 | @python rss2email.py feeds.dat %1 %2 %3 %4 %5 %6 %7 %8 %9
2 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | config.py
2 | temp/*
3 | Old/*
4 | feeds.dat
5 | *.pyc
6 | build/*
7 | *.sublime-project
8 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | This project is largely on hold. There is another, active rss2email project for Python 3 that is located at https://github.com/wking/rss2email
2 |
--------------------------------------------------------------------------------
/config.py.example:
--------------------------------------------------------------------------------
1 | ### Options for configuring rss2email ###
2 |
3 | # The email address messages are from by default:
4 | DEFAULT_FROM = "bozo@dev.null.invalid"
5 |
6 | # 1: Send text/html messages when possible.
7 | # 0: Convert HTML to plain text.
8 | HTML_MAIL = 1
9 |
10 | # 1: Only use the DEFAULT_FROM address.
11 | # 0: Use the email address specified by the feed, when possible.
12 | FORCE_FROM = 0
13 |
14 | # 1: Receive one email per post.
15 | # 0: Receive an email every time a post changes.
16 | TRUST_GUID = 1
17 |
18 | # 1: Generate Date header based on item's date, when possible.
19 | # 0: Generate Date header based on time sent.
20 | DATE_HEADER = 1
21 |
22 | # A tuple consisting of some combination of
23 | # ('issued', 'created', 'modified', 'expired')
24 | # expressing ordered list of preference in dates
25 | # to use for the Date header of the email.
26 | DATE_HEADER_ORDER = ('modified', 'issued', 'created')
27 |
28 | # 1: Apply Q-P conversion (required for some MUAs).
29 | # 0: Send message in 8-bits.
30 | # http://cr.yp.to/smtp/8bitmime.html
31 | #DEPRECATED
32 | QP_REQUIRED = 0
33 | #DEPRECATED
34 |
35 | # 1: Name feeds as they're being processed.
36 | # 0: Keep quiet.
37 | VERBOSE = 0
38 |
39 | # 1: Use the publisher's email if you can't find the author's.
40 | # 0: Just use the DEFAULT_FROM email instead.
41 | USE_PUBLISHER_EMAIL = 0
42 |
43 | # 1: Use SMTP_SERVER to send mail.
44 | # 0: Call /usr/sbin/sendmail to send mail.
45 | SMTP_SEND = 1
46 |
47 | SMTP_SERVER = "smtp.yourisp.net:25"
48 | AUTHREQUIRED = 0 # if you need to use SMTP AUTH set to 1
49 | SMTP_USER = 'username' # for SMTP AUTH, set SMTP username here
50 | SMTP_PASS = 'password' # for SMTP AUTH, set SMTP password here
51 |
52 | # Connect to the SMTP server using SSL
53 |
54 | SMTP_SSL = 0
55 |
56 |
57 |
58 | # Set this to add a bonus header to all emails (start with '\n').
59 | BONUS_HEADER = ''
60 | # Example: BONUS_HEADER = '\nApproved: joe@bob.org'
61 |
62 | # Set this to override From addresses. Keys are feed URLs, values are new titles.
63 | OVERRIDE_FROM = {}
64 |
65 | # Set this to override From email addresses. Keys are feed URLs, values are new emails.
66 |
67 | OVERRIDE_EMAIL = {}
68 |
69 |
70 |
71 | # Set this to default From email addresses. Keys are feed URLs, values are new email addresses.
72 |
73 | DEFAULT_EMAIL = {}
74 |
75 |
76 | # Only use the email from address rather than friendly name plus email address
77 |
78 | NO_FRIENDLY_NAME = 0
79 |
80 |
81 |
82 | # Set this to override the timeout (in seconds) for feed server response
83 | FEED_TIMEOUT = 60
84 |
85 | # Optional CSS styling
86 | USE_CSS_STYLING = 1
87 | STYLE_SHEET='h1 {font: 18pt Georgia, "Times New Roman";} body {font: 12pt Arial;} a:link {font: 12pt Arial; font-weight: bold; color: #0000cc} blockquote {font-family: monospace; } .header { background: #e0ecff; border-bottom: solid 4px #c3d9ff; padding: 5px; margin-top: 0px; color: red;} .header a { font-size: 20px; text-decoration: none; } .footer { background: #c3d9ff; border-top: solid 4px #c3d9ff; padding: 5px; margin-bottom: 0px; } #entry {border: solid 4px #c3d9ff; } #body { margin-left: 5px; margin-right: 5px; }'
88 |
89 | # If you have an HTTP Proxy set this in the format 'http://your.proxy.here:8080/'
90 | PROXY=""
91 |
92 | # To most correctly encode emails with international characters, we iterate through the list below and use the first character set that works
93 | # Eventually (and theoretically) ISO-8859-1 and UTF-8 are our catch-all failsafes
94 | CHARSET_LIST='US-ASCII', 'BIG5', 'ISO-2022-JP', 'ISO-8859-1', 'UTF-8'
95 |
--------------------------------------------------------------------------------
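For the three per-feed dictionaries above (OVERRIDE_FROM, OVERRIDE_EMAIL, DEFAULT_EMAIL), keys are feed URLs and values are the replacement title or address, as the comments describe. A minimal sketch of how such entries look in config.py — the feed URL and addresses here are placeholders, not real feeds:

    # hypothetical config.py fragment; replace the URL/addresses with your own
    OVERRIDE_FROM  = {'http://example.com/feed.xml': 'Example Weblog'}
    OVERRIDE_EMAIL = {'http://example.com/feed.xml': 'posts@example.com'}
    DEFAULT_EMAIL  = {'http://example.com/feed.xml': 'fallback@example.com'}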
/test_rss2email.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """Unit tests for rss2email.
3 |
4 | These tests make sure that rss2email works as it should. If you
5 | find a bug, the best way to express it is as a test
6 | case like this that fails."""
7 |
8 | import unittest
9 | from rss2email import *
10 | import rss2email
11 | import feedparser
12 |
13 | class test_validateEmail(unittest.TestCase):
14 |     """Tests for the validateEmail() helper."""
15 | def test_valid_email(self):
16 | email = validateEmail("valid@example.com", "planb@example.com")
17 | self.assertEqual(email, "valid@example.com")
18 |
19 | def test_no_mail_server(self):
20 |         email = validateEmail("invalid@", "planb@example.com")
21 | self.assertEqual(email, "planb@example.com")
22 |
23 | def test_no_email_name(self):
24 | email = validateEmail("@invalid", "planb@example.com")
25 | self.assertEqual(email, "planb@example.com")
26 |
27 | def test_no_at(self):
28 | email = validateEmail("invalid", "planb@example.com")
29 | self.assertEqual(email, "planb@example.com")
30 |
31 | class test_getName(unittest.TestCase):
32 |     """Tests for the getName() helper."""
33 | def setUp(self):
34 | self.feed = feedparser.parse("""
35 |
--------------------------------------------------------------------------------
/readme.html:
--------------------------------------------------------------------------------
We highly recommend that you subscribe to the rss2email project feed so you can keep up to date with the latest version, bugfixes and features: http://feeds.feedburner.com/allthingsrss/hJBr

Contents:
  * Instructions for Windows Users
  * Instructions for UNIX Users
  * Customizing rss2email

Instructions for Windows Users

Before you install rss2email, you'll need to make sure that a few things are in place. First, that a version of Python 2.x is installed. Second, determine your outgoing email server's address. That should be all you need.

Edit the config.py file and fill in your outgoing email server's details. If your server requires you to log in, change "AUTHREQUIRED = 0" to "AUTHREQUIRED = 1" and enter your email username and password.

From the command line, change to the folder you created. Now create a new feed database to send updates to your email address:

    r2e new you@yourdomain.com

Subscribe to some feeds:

    r2e add http://feeds.feedburner.com/allthingsrss/hJBr

That's the feed to be notified when there's a new version of rss2email. Repeat this for each feed you want to subscribe to.

When you run rss2email, it emails you about every story it hasn't seen before. But the first time you run it, that will be every story. To avoid this, you can ask rss2email not to send you any stories the first time you run it:

    r2e run --no-send

Then later, you can ask it to email you new stories:

    r2e run

If you get an error message "Sender domain must exist", add a line to config.py like this:

    DEFAULT_FROM = "rss2email@yoursite.com"

You can make the email address whatever you want, but your mail server requires that the yoursite.com part actually exists.

More than likely you will want rss2email to run automatically at a regular interval. Under Windows this can be easily accomplished using the Windows Task Scheduler. Just select r2e.bat as the program to run. Once you've created the task, double-click on it in the task list and change the Run entry so that "run" comes after r2e.bat. For example, if you installed rss2email in the C:\rss2email folder, then you would change the Run entry from "C:\rss2email\r2e.bat" to "C:\rss2email\r2e.bat run".

Now jump down to the section on customizing rss2email to your needs.

Upgrading to a new version: simply copy all of the files from the .ZIP package into your install directory EXCEPT config.py.

Instructions for UNIX Users

Before you install rss2email, you'll need to make sure that a few things are in place. First, that a version of Python 2.x is installed. Second, whether you have sendmail (or a compatible replacement like postfix) installed. If sendmail isn't installed, determine your outgoing email server's address. That should be all you need.

A quick way to get rss2email going is using pre-made packages; releases exist for Debian Linux, Ubuntu Linux and NetBSD. If you are unable to use these packages or you want the latest and greatest version, here's what you do:

    1. Unarchive (probably 'tar -xzf') the rss2email .tar.gz package to the folder where you want the rss2email files to live
    2. cd [yourfolder]
    3. chmod +x r2e

Create a new feed database with your target email address:

    ./r2e new you@yourdomain.com

Subscribe to some feeds:

    ./r2e add http://feeds.feedburner.com/allthingsrss/hJBr

That's the feed to be notified when there's a new version of rss2email. Repeat this for each feed you want to subscribe to.

When you run rss2email, it emails you about every story it hasn't seen before. But the first time you run it, that will be every story. To avoid this, you can ask rss2email not to send you any stories the first time you run it:

    ./r2e run --no-send

Then later, you can ask it to email you new stories:

    ./r2e run

You probably want to set things up so that this command is run repeatedly. (One good way is via a cron job.)

If you get an error message "Sender domain must exist", add a line to config.py like this:

    DEFAULT_FROM = "rss2email@yoursite.com"

You can make the email address whatever you want, but your mail server requires that the yoursite.com part actually exists.

Upgrading to a new version: simply copy all of the files from the .tar.gz package into your install directory EXCEPT config.py.

Customizing rss2email

There are a number of options, described in full at the top of the rss2email.py file, to customize the way rss2email behaves. If you want to change something, edit the config.py file. If you're not using rss2email under Windows, you'll have to create this file if it doesn't already exist.

For example, if you want to receive HTML mail, instead of having entries converted to plain text:

    HTML_MAIL = 1

To be notified every time a post changes, instead of just when it's first posted:

    TRUST_GUID = 0

And to make the emails look as if they were sent when the item was posted:

    DATE_HEADER = 1
--------------------------------------------------------------------------------
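Taken together, a config.py that applies all three of the customizations above would look like the sketch below; the values are just the readme's own examples, and any other option from config.py.example can be added the same way:

    # config.py -- sketch combining the readme's three examples
    HTML_MAIL = 1    # send text/html mail instead of converting to plain text
    TRUST_GUID = 0   # re-send a post whenever its content changes
    DATE_HEADER = 1  # date the mail from the item's own timestamp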
/html2text.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | """html2text: Turn HTML into equivalent Markdown-structured text."""
3 | __version__ = "3.01"
4 | __author__ = "Aaron Swartz (me@aaronsw.com)"
5 | __copyright__ = "(C) 2004-2008 Aaron Swartz. GNU GPL 3."
6 | __contributors__ = ["Martin 'Joey' Schulze", "Ricardo Reyes", "Kevin Jay North"]
7 |
8 | # TODO:
9 | # Support decoded entities with unifiable.
10 |
11 | try:
12 |     True
13 | except NameError:
14 |     setattr(__builtins__, 'True', 1)
15 |     setattr(__builtins__, 'False', 0)
16 |
17 | def has_key(x, y):
18 |     if hasattr(x, 'has_key'): return x.has_key(y)
19 |     else: return y in x
20 |
21 | try:
22 |     import htmlentitydefs
23 |     import urlparse
24 |     import HTMLParser
25 | except ImportError: #Python3
26 |     import html.entities as htmlentitydefs
27 |     import urllib.parse as urlparse
28 |     import html.parser as HTMLParser
29 | try: #Python3
30 |     import urllib.request as urllib
31 | except:
32 |     import urllib
33 | import re, sys, codecs, types
34 |
35 | try: from textwrap import wrap
36 | except: pass
37 |
38 | # Use Unicode characters instead of their ascii psuedo-replacements
39 | UNICODE_SNOB = 0
40 |
41 | # Put the links after each paragraph instead of at the end.
42 | LINKS_EACH_PARAGRAPH = 0
43 |
44 | # Wrap long lines at position. 0 for no wrapping. (Requires Python 2.3.)
45 | BODY_WIDTH = 78
46 |
47 | # Don't show internal links (href="#local-anchor") -- corresponding link targets
48 | # won't be visible in the plain text file anyway.
49 | SKIP_INTERNAL_LINKS = False
50 |
51 | ### Entity Nonsense ###
52 |
53 | def name2cp(k):
54 |     if k == 'apos': return ord("'")
55 |     if hasattr(htmlentitydefs, "name2codepoint"): # requires Python 2.3
56 |         return htmlentitydefs.name2codepoint[k]
57 |     else:
58 |         k = htmlentitydefs.entitydefs[k]
59 |         if k.startswith("&#") and k.endswith(";"): return int(k[2:-1]) # not in latin-1
60 |         return ord(codecs.latin_1_decode(k)[0])
61 |
62 | unifiable = {'rsquo':"'", 'lsquo':"'", 'rdquo':'"', 'ldquo':'"',
63 | 'copy':'(C)', 'mdash':'--', 'nbsp':' ', 'rarr':'->', 'larr':'<-', 'middot':'*',
64 | 'ndash':'-', 'oelig':'oe', 'aelig':'ae',
65 | 'agrave':'a', 'aacute':'a', 'acirc':'a', 'atilde':'a', 'auml':'a', 'aring':'a',
66 | 'egrave':'e', 'eacute':'e', 'ecirc':'e', 'euml':'e',
67 | 'igrave':'i', 'iacute':'i', 'icirc':'i', 'iuml':'i',
68 | 'ograve':'o', 'oacute':'o', 'ocirc':'o', 'otilde':'o', 'ouml':'o',
69 | 'ugrave':'u', 'uacute':'u', 'ucirc':'u', 'uuml':'u'}
70 |
71 | unifiable_n = {}
72 |
73 | for k in unifiable.keys():
74 |     unifiable_n[name2cp(k)] = unifiable[k]
75 |
76 | def charref(name):
77 |     if name[0] in ['x','X']:
78 |         c = int(name[1:], 16)
79 |     else:
80 |         c = int(name)
81 |
82 |     if not UNICODE_SNOB and c in unifiable_n.keys():
83 |         return unifiable_n[c]
84 |     else:
85 |         try:
86 |             return unichr(c)
87 |         except NameError: #Python3
88 |             return chr(c)
89 |
90 | def entityref(c):
91 |     if not UNICODE_SNOB and c in unifiable.keys():
92 |         return unifiable[c]
93 |     else:
94 |         try: name2cp(c)
95 |         except KeyError: return "&" + c + ';'
96 |         else:
97 |             try:
98 |                 return unichr(name2cp(c))
99 |             except NameError: #Python3
100 |                 return chr(name2cp(c))
101 |
102 | def replaceEntities(s):
103 |     s = s.group(1)
104 |     if s[0] == "#":
105 |         return charref(s[1:])
106 |     else: return entityref(s)
107 |
108 | r_unescape = re.compile(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));")
109 | def unescape(s):
110 |     return r_unescape.sub(replaceEntities, s)
111 |
112 | ### End Entity Nonsense ###
113 |
114 | def onlywhite(line):
115 |     """Return true if the line does only consist of whitespace characters."""
116 |     for c in line:
117 |         if c is not ' ' and c is not '  ':
118 |             return c is ' '
119 |     return line
120 |
121 | def optwrap(text):
122 |     """Wrap all paragraphs in the provided text."""
123 |     if not BODY_WIDTH:
124 |         return text
125 |
126 |     assert wrap, "Requires Python 2.3."
127 |     result = ''
128 |     newlines = 0
129 |     for para in text.split("\n"):
130 |         if len(para) > 0:
131 |             if para[0] is not ' ' and para[0] is not '-' and para[0] is not '*':
132 |                 for line in wrap(para, BODY_WIDTH):
133 |                     result += line + "\n"
134 |                 result += "\n"
135 |                 newlines = 2
136 |             else:
137 |                 if not onlywhite(para):
138 |                     result += para + "\n"
139 |                     newlines = 1
140 |         else:
141 |             if newlines < 2:
142 |                 result += "\n"
143 |                 newlines += 1
144 |     return result
145 |
146 | def hn(tag):
147 |     if tag[0] == 'h' and len(tag) == 2:
148 |         try:
149 |             n = int(tag[1])
150 |             if n in range(1, 10): return n
151 |         except ValueError: return 0
152 |
153 | class _html2text(HTMLParser.HTMLParser):
154 |     def __init__(self, out=None, baseurl=''):
155 |         HTMLParser.HTMLParser.__init__(self)
156 |
157 |         if out is None: self.out = self.outtextf
158 |         else: self.out = out
159 |         try:
160 |             self.outtext = unicode()
161 |         except NameError: # Python3
162 |             self.outtext = str()
163 |         self.quiet = 0
164 |         self.p_p = 0
165 |         self.outcount = 0
166 |         self.start = 1
167 |         self.space = 0
168 |         self.a = []
169 |         self.astack = []
170 |         self.acount = 0
171 |         self.list = []
172 |         self.blockquote = 0
173 |         self.pre = 0
174 |         self.startpre = 0
175 |         self.lastWasNL = 0
176 |         self.abbr_title = None # current abbreviation definition
177 |         self.abbr_data = None # last inner HTML (for abbr being defined)
178 |         self.abbr_list = {} # stack of abbreviations to write later
179 |         self.baseurl = baseurl
180 |
181 |     def outtextf(self, s):
182 |         self.outtext += s
183 |
184 |     def close(self):
185 |         HTMLParser.HTMLParser.close(self)
186 |
187 |         self.pbr()
188 |         self.o('', 0, 'end')
189 |
190 |         return self.outtext
191 |
192 |     def handle_charref(self, c):
193 |         self.o(charref(c))
194 |
195 |     def handle_entityref(self, c):
196 |         self.o(entityref(c))
197 |
198 |     def handle_starttag(self, tag, attrs):
199 |         self.handle_tag(tag, attrs, 1)
200 |
201 |     def handle_endtag(self, tag):
202 |         self.handle_tag(tag, None, 0)
203 |
204 |     def previousIndex(self, attrs):
205 |         """ returns the index of certain set of attributes (of a link) in the
206 |         self.a list
207 |
208 |         If the set of attributes is not found, returns None
209 |         """
210 |         if not has_key(attrs, 'href'): return None
211 |
212 |         i = -1
213 |         for a in self.a:
214 |             i += 1
215 |             match = 0
216 |
217 |             if has_key(a, 'href') and a['href'] == attrs['href']:
218 |                 if has_key(a, 'title') or has_key(attrs, 'title'):
219 |                     if (has_key(a, 'title') and has_key(attrs, 'title') and
220 |                         a['title'] == attrs['title']):
221 |                         match = True
222 |                 else:
223 |                     match = True
224 |
225 |             if match: return i
226 |
227 |     def handle_tag(self, tag, attrs, start):
228 |         #attrs = fixattrs(attrs)
229 |
230 |         if hn(tag):
231 |             self.p()
232 |             if start: self.o(hn(tag)*"#" + ' ')
233 |
234 |         if tag in ['p', 'div']: self.p()
235 |
236 |         if tag == "br" and start: self.o("  \n")
237 |
238 |         if tag == "hr" and start:
239 |             self.p()
240 |             self.o("* * *")
241 |             self.p()
242 |
243 |         if tag in ["head", "style", 'script']:
244 |             if start: self.quiet += 1
245 |             else: self.quiet -= 1
246 |
247 |         if tag in ["body"]:
248 |             self.quiet = 0 # sites like 9rules.com never close <head>
249 |
250 |         if tag == "blockquote":
251 |             if start:
252 |                 self.p(); self.o('> ', 0, 1); self.start = 1
253 |                 self.blockquote += 1
254 |             else:
255 |                 self.blockquote -= 1
256 |                 self.p()
257 |
258 |         if tag in ['em', 'i', 'u']: self.o("_")
259 |         if tag in ['strong', 'b']: self.o("**")
260 |         if tag == "code" and not self.pre: self.o('`') #TODO: `` `this` ``
261 |         if tag == "abbr":
262 |             if start:
263 |                 attrsD = {}
264 |                 for (x, y) in attrs: attrsD[x] = y
265 |                 attrs = attrsD
266 |
267 |                 self.abbr_title = None
268 |                 self.abbr_data = ''
269 |                 if has_key(attrs, 'title'):
270 |                     self.abbr_title = attrs['title']
271 |             else:
272 |                 if self.abbr_title != None:
273 |                     self.abbr_list[self.abbr_data] = self.abbr_title
274 |                     self.abbr_title = None
275 |                 self.abbr_data = ''
276 |
277 |         if tag == "a":
278 |             if start:
279 |                 attrsD = {}
280 |                 for (x, y) in attrs: attrsD[x] = y
281 |                 attrs = attrsD
282 |                 if has_key(attrs, 'href') and not (SKIP_INTERNAL_LINKS and attrs['href'].startswith('#')):
283 |                     self.astack.append(attrs)
284 |                     self.o("[")
285 |                 else:
286 |                     self.astack.append(None)
287 |             else:
288 |                 if self.astack:
289 |                     a = self.astack.pop()
290 |                     if a:
291 |                         i = self.previousIndex(a)
292 |                         if i is not None:
293 |                             a = self.a[i]
294 |                         else:
295 |                             self.acount += 1
296 |                             a['count'] = self.acount
297 |                             a['outcount'] = self.outcount
298 |                             self.a.append(a)
299 |                         self.o("][" + str(a['count']) + "]")
300 |
301 |         if tag == "img" and start:
302 |             attrsD = {}
303 |             for (x, y) in attrs: attrsD[x] = y
304 |             attrs = attrsD
305 |             if has_key(attrs, 'src'):
306 |                 attrs['href'] = attrs['src']
307 |                 alt = attrs.get('alt', '')
308 |                 i = self.previousIndex(attrs)
309 |                 if i is not None:
310 |                     attrs = self.a[i]
311 |                 else:
312 |                     self.acount += 1
313 |                     attrs['count'] = self.acount
314 |                     attrs['outcount'] = self.outcount
315 |                     self.a.append(attrs)
316 |                 self.o("![")
317 |                 self.o(alt)
318 |                 self.o("]["+ str(attrs['count']) +"]")
319 |
320 |         if tag == 'dl' and start: self.p()
321 |         if tag == 'dt' and not start: self.pbr()
322 |         if tag == 'dd' and start: self.o('    ')
323 |         if tag == 'dd' and not start: self.pbr()
324 |
325 |         if tag in ["ol", "ul"]:
326 |             if start:
327 |                 self.list.append({'name':tag, 'num':0})
328 |             else:
329 |                 if self.list: self.list.pop()
330 |
331 |             self.p()
332 |
333 |         if tag == 'li':
334 |             if start:
335 |                 self.pbr()
336 |                 if self.list: li = self.list[-1]
337 |                 else: li = {'name':'ul', 'num':0}
338 |                 self.o("  "*len(self.list)) #TODO: line up <ol><li>s > 9 correctly.
339 |                 if li['name'] == "ul": self.o("* ")
340 |                 elif li['name'] == "ol":
341 |                     li['num'] += 1
342 |                     self.o(str(li['num'])+". ")
343 |                 self.start = 1
344 |             else:
345 |                 self.pbr()
346 |
347 |         if tag in ["table", "tr"] and start: self.p()
348 |         if tag == 'td': self.pbr()
349 |
350 |         if tag == "pre":
351 |             if start:
352 |                 self.startpre = 1
353 |                 self.pre = 1
354 |             else:
355 |                 self.pre = 0
356 |             self.p()
357 |
358 |     def pbr(self):
359 |         if self.p_p == 0: self.p_p = 1
360 |
361 |     def p(self): self.p_p = 2
362 |
363 |     def o(self, data, puredata=0, force=0):
364 |         if self.abbr_data is not None: self.abbr_data += data
365 |
366 |         if not self.quiet:
367 |             if puredata and not self.pre:
368 |                 data = re.sub('\s+', ' ', data)
369 |                 if data and data[0] == ' ':
370 |                     self.space = 1
371 |                     data = data[1:]
372 |             if not data and not force: return
373 |
374 |             if self.startpre:
375 |                 #self.out(" :") #TODO: not output when already one there
376 |                 self.startpre = 0
377 |
378 |             bq = (">" * self.blockquote)
379 |             if not (force and data and data[0] == ">") and self.blockquote: bq += " "
380 |
381 |             if self.pre:
382 |                 bq += "    "
383 |                 data = data.replace("\n", "\n"+bq)
384 |
385 |             if self.start:
386 |                 self.space = 0
387 |                 self.p_p = 0
388 |                 self.start = 0
389 |
390 |             if force == 'end':
391 |                 # It's the end.
392 |                 self.p_p = 0
393 |                 self.out("\n")
394 |                 self.space = 0
395 |
396 |
397 |             if self.p_p:
398 |                 self.out(('\n'+bq)*self.p_p)
399 |                 self.space = 0
400 |
401 |             if self.space:
402 |                 if not self.lastWasNL: self.out(' ')
403 |                 self.space = 0
404 |
405 |             if self.a and ((self.p_p == 2 and LINKS_EACH_PARAGRAPH) or force == "end"):
406 |                 if force == "end": self.out("\n")
407 |
408 |                 newa = []
409 |                 for link in self.a:
410 |                     if self.outcount > link['outcount']:
411 |                         self.out("   ["+ str(link['count']) +"]: " + urlparse.urljoin(self.baseurl, link['href']))
412 |                         if has_key(link, 'title'): self.out(" ("+link['title']+")")
413 |                         self.out("\n")
414 |                     else:
415 |                         newa.append(link)
416 |
417 |                 if self.a != newa: self.out("\n") # Don't need an extra line when nothing was done.
418 |
419 |                 self.a = newa
420 |
421 |             if self.abbr_list and force == "end":
422 |                 for abbr, definition in self.abbr_list.items():
423 |                     self.out("  *[" + abbr + "]: " + definition + "\n")
424 |
425 |             self.p_p = 0
426 |             self.out(data)
427 |             self.lastWasNL = data and data[-1] == '\n'
428 |             self.outcount += 1
429 |
430 |     def handle_data(self, data):
431 |         if r'\/script>' in data: self.quiet -= 1
432 |         self.o(data, 1)
433 |
434 |     def unknown_decl(self, data): pass
435 |
436 | def wrapwrite(text): sys.stdout.write(text)
437 |
438 | def html2text_file(html, out=wrapwrite, baseurl=''):
439 |     h = _html2text(out, baseurl)
440 |     h.feed(html)
441 |     h.feed("")
442 |     return h.close()
443 |
444 | def html2text(html, baseurl=''):
445 |     return optwrap(html2text_file(html, None, baseurl))
446 |
447 | if __name__ == "__main__":
448 |     baseurl = ''
449 |     if sys.argv[1:]:
450 |         arg = sys.argv[1]
451 |         if arg.startswith('http://') or arg.startswith('https://'):
452 |             baseurl = arg
453 |             j = urllib.urlopen(baseurl)
454 |             try:
455 |                 from feedparser import _getCharacterEncoding as enc
456 |             except ImportError:
457 |                 enc = lambda x, y: ('utf-8', 1)
458 |             text = j.read()
459 |             encoding = enc(j.headers, text)[0]
460 |             if encoding == 'us-ascii': encoding = 'utf-8'
461 |             data = text.decode(encoding)
462 |
463 |         else:
464 |             encoding = 'utf8'
465 |             if len(sys.argv) > 2:
466 |                 encoding = sys.argv[2]
467 |             try: #Python3
468 |                 data = open(arg, 'r', encoding=encoding).read()
469 |             except TypeError:
470 |                 data = open(arg, 'r').read().decode(encoding)
471 |     else:
472 |         data = sys.stdin.read()
473 |     wrapwrite(html2text(data, baseurl))
474 |
--------------------------------------------------------------------------------
/rss2email.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | """rss2email: get RSS feeds emailed to you
3 | http://rss2email.infogami.com
4 |
5 | Usage:
6 |   new [emailaddress] (create new feedfile)
7 |   email newemailaddress (update default email)
8 |   run [--no-send] [num]
9 |   add feedurl [emailaddress]
10 |   list
11 |   reset
12 |   delete n
13 |   pause n
14 |   unpause n
15 |   opmlexport
16 |   opmlimport filename
17 | """
18 | __version__ = "2.72"
19 | __author__ = "Lindsey Smith (lindsey@allthingsrss.com)"
20 | __copyright__ = "(C) 2004 Aaron Swartz. GNU GPL 2 or 3."
21 | ___contributors__ = ["Dean Jackson", "Brian Lalor", "Joey Hess",
22 |     "Matej Cepl", "Martin 'Joey' Schulze",
23 |     "Marcel Ackermann (http://www.DreamFlasher.de)",
24 |     "Lindsey Smith (maintainer)", "Erik Hetzner", "Aaron Swartz (original author)" ]
25 |
26 | import urllib2
27 | import BeautifulSoup
28 | urllib2.install_opener(urllib2.build_opener())
29 |
30 | ### Vaguely Customizable Options ###
31 |
32 | # The email address messages are from by default:
33 | DEFAULT_FROM = "bozo@dev.null.invalid"
34 |
35 | # 1: Send text/html messages when possible.
36 | # 0: Convert HTML to plain text.
37 | HTML_MAIL = 0
38 |
39 | # 1: Only use the DEFAULT_FROM address.
40 | # 0: Use the email address specified by the feed, when possible.
41 | FORCE_FROM = 0
42 |
43 | # 1: Receive one email per post.
44 | # 0: Receive an email every time a post changes.
45 | TRUST_GUID = 1
46 |
47 | # 1: Generate Date header based on item's date, when possible.
48 | # 0: Generate Date header based on time sent.
49 | DATE_HEADER = 0
50 |
51 | # A tuple consisting of some combination of
52 | # ('issued', 'created', 'modified', 'expired')
53 | # expressing ordered list of preference in dates
54 | # to use for the Date header of the email.
55 | DATE_HEADER_ORDER = ('modified', 'issued', 'created')
56 |
57 | # 1: Apply Q-P conversion (required for some MUAs).
58 | # 0: Send message in 8-bits.
59 | # http://cr.yp.to/smtp/8bitmime.html
60 | #DEPRECATED
61 | QP_REQUIRED = 0
62 | #DEPRECATED
63 |
64 | # 1: Name feeds as they're being processed.
65 | # 0: Keep quiet.
66 | VERBOSE = 0
67 |
68 | # 1: Use the publisher's email if you can't find the author's.
69 | # 0: Just use the DEFAULT_FROM email instead.
70 | USE_PUBLISHER_EMAIL = 0
71 |
72 | # 1: Use SMTP_SERVER to send mail.
73 | # 0: Call /usr/sbin/sendmail to send mail.
74 | SMTP_SEND = 0
75 |
76 | SMTP_SERVER = "smtp.yourisp.net:25"
77 | AUTHREQUIRED = 0 # if you need to use SMTP AUTH set to 1
78 | SMTP_USER = 'username' # for SMTP AUTH, set SMTP username here
79 | SMTP_PASS = 'password' # for SMTP AUTH, set SMTP password here
80 |
81 | # Connect to the SMTP server using SSL
82 | SMTP_SSL = 0
83 |
84 | # Set this to add a bonus header to all emails (start with '\n').
85 | BONUS_HEADER = ''
86 | # Example: BONUS_HEADER = '\nApproved: joe@bob.org'
87 |
88 | # Set this to override From addresses. Keys are feed URLs, values are new titles.
89 | OVERRIDE_FROM = {}
90 |
91 | # Set this to override From email addresses. Keys are feed URLs, values are new emails.
92 | OVERRIDE_EMAIL = {}
93 |
94 | # Set this to default From email addresses. Keys are feed URLs, values are new email addresses.
95 | DEFAULT_EMAIL = {}
96 |
97 | # Only use the email from address rather than friendly name plus email address
98 | NO_FRIENDLY_NAME = 0
99 |
100 | # Set this to override the timeout (in seconds) for feed server response
101 | FEED_TIMEOUT = 60
102 |
103 | # Optional CSS styling
104 | USE_CSS_STYLING = 0
105 | STYLE_SHEET='h1 {font: 18pt Georgia, "Times New Roman";} body {font: 12pt Arial;} a:link {font: 12pt Arial; font-weight: bold; color: #0000cc} blockquote {font-family: monospace; } .header { background: #e0ecff; border-bottom: solid 4px #c3d9ff; padding: 5px; margin-top: 0px; color: red;} .header a { font-size: 20px; text-decoration: none; } .footer { background: #c3d9ff; border-top: solid 4px #c3d9ff; padding: 5px; margin-bottom: 0px; } #entry {border: solid 4px #c3d9ff; } #body { margin-left: 5px; margin-right: 5px; }'
106 |
107 | # If you have an HTTP Proxy set this in the format 'http://your.proxy.here:8080/'
108 | PROXY=""
109 |
110 | # To most correctly encode emails with international characters, we iterate through the list below and use the first character set that works
111 | # Eventually (and theoretically) ISO-8859-1 and UTF-8 are our catch-all failsafes
112 | CHARSET_LIST='US-ASCII', 'BIG5', 'ISO-2022-JP', 'ISO-8859-1', 'UTF-8'
113 |
114 | from email.MIMEText import MIMEText
115 | from email.Header import Header
116 | from email.Utils import parseaddr, formataddr
117 |
118 | # Note: You can also override the send function.
119 |
120 | def send(sender, recipient, subject, body, contenttype, extraheaders=None, smtpserver=None):
121 |     """Send an email.
122 |
123 |     All arguments should be Unicode strings (plain ASCII works as well).
124 |
125 |     Only the real name part of sender and recipient addresses may contain
126 |     non-ASCII characters.
127 |
128 |     The email will be properly MIME encoded and delivered through SMTP to
129 |     localhost port 25. This is easy to change if you want something different.
130 |
131 |     The charset of the email will be the first one out of the list
132 |     that can represent all the characters occurring in the email.
133 |     """
134 |
135 |     # Header class is smart enough to try US-ASCII, then the charset we
136 |     # provide, then fall back to UTF-8.
137 |     header_charset = 'ISO-8859-1'
138 |
139 |     # We must choose the body charset manually
140 |     for body_charset in CHARSET_LIST:
141 |         try:
142 |             body.encode(body_charset)
143 |         except (UnicodeError, LookupError):
144 |             pass
145 |         else:
146 |             break
147 |
148 |     # Split real name (which is optional) and email address parts
149 |     sender_name, sender_addr = parseaddr(sender)
150 |     recipient_name, recipient_addr = parseaddr(recipient)
151 |
152 |     # We must always pass Unicode strings to Header, otherwise it will
153 |     # use RFC 2047 encoding even on plain ASCII strings.
154 |     sender_name = str(Header(unicode(sender_name), header_charset))
155 |     recipient_name = str(Header(unicode(recipient_name), header_charset))
156 |
157 |     # Make sure email addresses do not contain non-ASCII characters
158 |     sender_addr = sender_addr.encode('ascii')
159 |     recipient_addr = recipient_addr.encode('ascii')
160 |
161 |     # Create the message ('plain' stands for Content-Type: text/plain)
162 |     msg = MIMEText(body.encode(body_charset), contenttype, body_charset)
163 |     msg['To'] = formataddr((recipient_name, recipient_addr))
164 |     msg['Subject'] = Header(unicode(subject), header_charset)
165 |     for hdr in extraheaders.keys():
166 |         try:
167 |             msg[hdr] = Header(unicode(extraheaders[hdr], header_charset))
168 |         except:
169 |             msg[hdr] = Header(extraheaders[hdr])
170 |
171 |     fromhdr = formataddr((sender_name, sender_addr))
172 |     msg['From'] = fromhdr
173 |
174 |     msg_as_string = msg.as_string()
175 | #DEPRECATED     if QP_REQUIRED:
176 | #DEPRECATED         ins, outs = SIO(msg_as_string), SIO()
177 | #DEPRECATED         mimify.mimify(ins, outs)
178 | #DEPRECATED         msg_as_string = outs.getvalue()
179 |
180 |     if SMTP_SEND:
181 |         if not smtpserver:
182 |             import smtplib
183 |
184 |             try:
185 |                 if SMTP_SSL:
186 |                     smtpserver = smtplib.SMTP_SSL()
187 |                 else:
188 |                     smtpserver = smtplib.SMTP()
189 |                 smtpserver.connect(SMTP_SERVER)
190 |             except KeyboardInterrupt:
191 |                 raise
192 |             except Exception, e:
193 |                 print >>warn, ""
194 |                 print >>warn, ('Fatal error: could not connect to mail server "%s"' % SMTP_SERVER)
195 |                 print >>warn, ('Check your config.py file to confirm that SMTP_SERVER and other mail server settings are configured properly')
196 |                 if hasattr(e, 'reason'):
197 |                     print >>warn, "Reason:", e.reason
198 |                 sys.exit(1)
199 |
200 |             if AUTHREQUIRED:
201 |                 try:
202 |                     smtpserver.ehlo()
203 |                     if not SMTP_SSL: smtpserver.starttls()
204 |                     smtpserver.ehlo()
205 |                     smtpserver.login(SMTP_USER, SMTP_PASS)
206 |                 except KeyboardInterrupt:
207 |                     raise
208 |                 except Exception, e:
209 |                     print >>warn, ""
210 |                     print >>warn, ('Fatal error: could not authenticate with mail server "%s" as user "%s"' % (SMTP_SERVER, SMTP_USER))
211 |                     print >>warn, ('Check your config.py file to confirm that SMTP_SERVER and other mail server settings are configured properly')
212 |                     if hasattr(e, 'reason'):
213 |                         print >>warn, "Reason:", e.reason
214 |                     sys.exit(1)
215 |
216 |         smtpserver.sendmail(sender, recipient, msg_as_string)
217 |         return smtpserver
218 |
219 |     else:
220 |         try:
221 |             p = subprocess.Popen(["/usr/sbin/sendmail", recipient], stdin=subprocess.PIPE, stdout=subprocess.PIPE)
222 |             p.communicate(msg_as_string)
223 |             status = p.returncode
224 |             assert status != None, "just a sanity check"
225 |             if status != 0:
226 |                 print >>warn, ""
227 |                 print >>warn, ('Fatal error: sendmail exited with code %s' % status)
228 |                 sys.exit(1)
229 |         except:
230 |             print '''Error attempting to send email via sendmail. Possibly you need to configure your config.py to use a SMTP server? Please refer to the rss2email documentation or website (http://rss2email.infogami.com) for complete documentation of config.py. The options below may suffice for configuring email:
231 | # 1: Use SMTP_SERVER to send mail.
232 | # 0: Call /usr/sbin/sendmail to send mail.
233 | SMTP_SEND = 0
234 |
235 | SMTP_SERVER = "smtp.yourisp.net:25"
236 | AUTHREQUIRED = 0 # if you need to use SMTP AUTH set to 1
237 | SMTP_USER = 'username' # for SMTP AUTH, set SMTP username here
238 | SMTP_PASS = 'password' # for SMTP AUTH, set SMTP password here
239 | '''
240 |             sys.exit(1)
241 |         return None
242 |
243 | ## html2text options ##
244 |
245 | # Use Unicode characters instead of their ascii psuedo-replacements
246 | UNICODE_SNOB = 0
247 |
248 | # Put the links after each paragraph instead of at the end.
249 | LINKS_EACH_PARAGRAPH = 0
250 |
251 | # Wrap long lines at position. 0 for no wrapping. (Requires Python 2.3.)
252 | BODY_WIDTH = 0
253 |
254 | ### Load the Options ###
255 |
256 | # Read options from config file if present.
257 | import sys
258 | sys.path.insert(0,".")
259 | try:
260 |     from config import *
261 | except:
262 |     pass
263 |
264 | warn = sys.stderr
265 |
266 | if QP_REQUIRED:
267 |     print >>warn, "QP_REQUIRED has been deprecated in rss2email."
268 |
269 | ### Import Modules ###
270 |
271 | import cPickle as pickle, time, os, traceback, sys, types, subprocess
272 | hash = ()
273 | try:
274 |     import hashlib
275 |     hash = hashlib.md5
276 | except ImportError:
277 |     import md5
278 |     hash = md5.new
279 |
280 | unix = 0
281 | try:
282 |     import fcntl
283 |     # A pox on SunOS file locking methods
284 |     if (sys.platform.find('sunos') == -1):
285 |         unix = 1
286 | except:
287 |     pass
288 |
289 | import socket; socket_errors = []
290 | for e in ['error', 'gaierror']:
291 |     if hasattr(socket, e): socket_errors.append(getattr(socket, e))
292 |
293 | #DEPRECATED import mimify
294 | #DEPRECATED from StringIO import StringIO as SIO
295 | #DEPRECATED mimify.CHARSET = 'utf-8'
296 |
297 | import feedparser
298 | feedparser.USER_AGENT = "rss2email/"+__version__+ " +http://www.allthingsrss.com/rss2email/"
299 | feedparser.SANITIZE_HTML = 0
300 |
301 | import html2text as h2t
302 |
303 | h2t.UNICODE_SNOB = UNICODE_SNOB
304 | h2t.LINKS_EACH_PARAGRAPH = LINKS_EACH_PARAGRAPH
305 | h2t.BODY_WIDTH = BODY_WIDTH
306 | html2text = h2t.html2text
307 |
308 | from types import *
309 |
310 | ### Utility Functions ###
311 |
312 | import threading
313 | class TimeoutError(Exception): pass
314 |
315 | class InputError(Exception): pass
316 |
317 | def timelimit(timeout, function):
318 | #    def internal(function):
319 |     def internal2(*args, **kw):
320 |         """
321 |         from http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/473878
322 |         """
323 |         class Calculator(threading.Thread):
324 |             def __init__(self):
325 |                 threading.Thread.__init__(self)
326 |                 self.result = None
327 |                 self.error = None
328 |
329 |             def run(self):
330 |                 try:
331 |                     self.result = function(*args, **kw)
332 |                 except:
333 |                     self.error = sys.exc_info()
334 |
335 |         c = Calculator()
336 |         c.setDaemon(True) # don't hold up exiting
337 |         c.start()
338 |         c.join(timeout)
339 |         if c.isAlive():
340 |             raise TimeoutError
341 |         if c.error:
342 |             raise c.error[0], c.error[1]
343 |         return c.result
344 |     return internal2
345 | #    return internal
346 |
347 |
348 | def isstr(f): return isinstance(f, type('')) or isinstance(f, type(u''))
349 | def ishtml(t): return type(t) is type(())
350 | def contains(a,b): return a.find(b) != -1
351 | def unu(s): # I / freakin' hate / that unicode
352 |     if type(s) is types.UnicodeType: return s.encode('utf-8')
353 |     else: return s
354 |
355 | ### Parsing Utilities ###
356 |
357 | def getContent(entry, HTMLOK=0):
358 |     """Select the best content from an entry, deHTMLizing if necessary.
359 |     If raw HTML is best, an ('HTML', best) tuple is returned. """
360 |
361 |     # How this works:
362 |     #  * We have a bunch of potential contents.
363 |     #  * We go thru looking for our first choice.
364 |     #    (HTML or text, depending on HTMLOK)
365 |     #  * If that doesn't work, we go thru looking for our second choice.
366 |     #  * If that still doesn't work, we just take the first one.
367 |     #
368 |     # Possible future improvement:
369 |     #  * Instead of just taking the first one
370 |     #    pick the one in the "best" language.
371 |     #  * HACK: hardcoded HTMLOK, should take a tuple of media types
372 |
373 |     conts = entry.get('content', [])
374 |
375 |     if entry.get('summary_detail', {}):
376 |         conts += [entry.summary_detail]
377 |
378 |     if conts:
379 |         if HTMLOK:
380 |             for c in conts:
381 |                 if contains(c.type, 'html'): return ('HTML', c.value)
382 |
383 |         if not HTMLOK: # Only need to convert to text if HTML isn't OK
384 |             for c in conts:
385 |                 if contains(c.type, 'html'):
386 |                     cleanerhtml = BeautifulSoup.BeautifulSoup(c.value)
387 |                     return html2text(unicode(cleanerhtml))
388 |
389 |         for c in conts:
390 |             if c.type == 'text/plain': return c.value
391 |
392 |         return conts[0].value
393 |
394 |     return ""
395 |
396 | def getID(entry):
397 |     """Get best ID from an entry.
398 |     NEEDS UNIT TESTS"""
399 |     if TRUST_GUID:
400 |         if 'id' in entry and entry.id:
401 |             # Newer versions of feedparser could return a dictionary
402 |             if type(entry.id) is DictType:
403 |                 return entry.id.values()[0]
404 |
405 |             return entry.id
406 |
407 |     content = getContent(entry)
408 |     if content and content != "\n": return hash(unu(content)).hexdigest()
409 |     if 'link' in entry: return entry.link
410 |     if 'title' in entry: return hash(unu(entry.title)).hexdigest()
411 |
412 | def getName(fullfeed, entry):
413 |     """Get the best name.
414 |     NEEDS UNIT TESTS"""
415 |
416 |     if NO_FRIENDLY_NAME: return ''
417 |
418 |     feedinfo = fullfeed.feed
419 |     if hasattr(fullfeed, "url") and fullfeed.url in OVERRIDE_FROM.keys():
420 |         return OVERRIDE_FROM[fullfeed.url]
421 |
422 |     name = feedinfo.get('title', '')
423 |
424 |     if 'name' in entry.get('author_detail', []): # normally {} but py2.1
425 |         if entry.author_detail.name:
426 |             if name: name += ": "
427 |             det=entry.author_detail.name
428 |             try:
429 |                 name += entry.author_detail.name
430 |             except UnicodeDecodeError:
431 |                 name += unicode(entry.author_detail.name, 'utf-8')
432 |
433 |     elif 'name' in feedinfo.get('author_detail', []):
434 |         if feedinfo.author_detail.name:
435 |             if name: name += ", "
436 |             name += feedinfo.author_detail.name
437 |
438 |     return name
439 |
440 | def validateEmail(email, planb):
441 |     """Do a basic quality check on email address, but return planb if email doesn't appear to be well-formed"""
442 |     email_parts = email.split('@')
443 |     if (len(email_parts) != 2) or not email_parts[0] or not email_parts[1]:
444 |         return planb
445 |     return email
446 |
447 | def getEmail(r, entry):
448 |     """Get the best email_address. If the best guess isn't well-formed (something@something.com), use DEFAULT_FROM instead.
449 |     NEEDS UNIT TESTS"""
450 |
451 |     feed = r.feed
452 |
453 |     if FORCE_FROM: return DEFAULT_FROM
454 |
455 |     if hasattr(r, "url") and r.url in OVERRIDE_EMAIL.keys():
456 |         return validateEmail(OVERRIDE_EMAIL[r.url], DEFAULT_FROM)
457 |
458 |     if 'email' in entry.get('author_detail', []):
459 |         return validateEmail(entry.author_detail.email, DEFAULT_FROM)
460 |
461 |     if 'email' in feed.get('author_detail', []):
462 |         return validateEmail(feed.author_detail.email, DEFAULT_FROM)
463 |
464 |     if USE_PUBLISHER_EMAIL:
465 |         if 'email' in feed.get('publisher_detail', []):
466 |             return validateEmail(feed.publisher_detail.email, DEFAULT_FROM)
467 |
468 |         if feed.get("errorreportsto", ''):
469 |             return validateEmail(feed.errorreportsto, DEFAULT_FROM)
470 |
471 |     if hasattr(r, "url") and r.url in DEFAULT_EMAIL.keys():
472 |         return DEFAULT_EMAIL[r.url]
473 |     return DEFAULT_FROM
474 |
475 | def getTags(entry):
476 |     """If the entry has any tags, build a tagline and return as a string. Otherwise returns empty string"""
477 |     tagline = ""
478 |     if 'tags' in entry:
479 |         tags = entry.get('tags')
480 |         taglist = []
481 |         if tags:
482 |             for tag in tags:
483 |                 if tag.has_key('term'): taglist.append(tag['term'])
484 |         if taglist:
485 |             tagline = ",".join(taglist)
486 |
487 |     return tagline
488 |
489 |
490 | ### Simple Database of Feeds ###
491 |
492 | class Feed:
493 |     def __init__(self, url, to):
494 |         self.url, self.etag, self.modified, self.seen = url, None, None, {}
495 |         self.active = True
496 |         self.to = to
497 |
498 | def load(lock=1):
499 |     if not os.path.exists(feedfile):
500 |         print 'Feedfile "%s" does not exist.  If you\'re using r2e for the first time, you' % feedfile
501 |         print "have to run 'r2e new' first."
502 |         sys.exit(1)
503 |     try:
504 |         feedfileObject = open(feedfile, 'r')
505 |     except IOError, e:
506 |         print "Feedfile could not be opened: %s" % e
507 |         sys.exit(1)
508 |     feeds = pickle.load(feedfileObject)
509 |
510 |     if lock:
511 |         locktype = 0
512 |         if unix:
513 |             locktype = fcntl.LOCK_EX
514 |             fcntl.flock(feedfileObject.fileno(), locktype)
515 |         #HACK: to deal with lock caching
516 |         feedfileObject = open(feedfile, 'r')
517 |         feeds = pickle.load(feedfileObject)
518 |         if unix:
519 |             fcntl.flock(feedfileObject.fileno(), locktype)
520 |     if feeds:
521 |         for feed in feeds[1:]:
522 |             if not hasattr(feed, 'active'):
523 |                 feed.active = True
524 |
525 |     return feeds, feedfileObject
526 |
527 | def unlock(feeds, feedfileObject):
528 |     if not unix:
529 |         pickle.dump(feeds, open(feedfile, 'w'))
530 |     else:
531 |         fd = open(feedfile+'.tmp', 'w')
532 |         pickle.dump(feeds, fd)
533 |         fd.flush()
534 |         os.fsync(fd.fileno())
535 |         fd.close()
536 |         os.rename(feedfile+'.tmp', feedfile)
537 |         fcntl.flock(feedfileObject.fileno(), fcntl.LOCK_UN)
538 |
539 | #@timelimit(FEED_TIMEOUT)
540 | def parse(url, etag, modified):
541 |     if PROXY == '':
542 |         return feedparser.parse(url, etag, modified)
543 |     else:
544 |         proxy = urllib2.ProxyHandler( {"http":PROXY} )
545 |         return feedparser.parse(url, etag, modified, handlers = [proxy])
546 |
547 |
548 | ### Program Functions ###
549 |
550 | def add(*args):
551 |     if len(args) == 2 and contains(args[1], '@') and not contains(args[1], '://'):
552 |         urls, to = [args[0]], args[1]
553 |     else:
554 |         urls, to = args, None
555 |
556 |     feeds, feedfileObject = load()
557 |     if (feeds and not isstr(feeds[0]) and to is None) or (not len(feeds) and to is None):
558 |         print "No email address has been defined. Please run 'r2e email emailaddress' or"
559 |         print "'r2e add url emailaddress'."
560 |         sys.exit(1)
561 |     for url in urls: feeds.append(Feed(url, to))
562 |     unlock(feeds, feedfileObject)
563 |
564 | def run(num=None):
565 |     feeds, feedfileObject = load()
566 |     smtpserver = None
567 |     try:
568 |         # We store the default to address as the first item in the feeds list.
569 |         # Here we take it out and save it for later.
570 |         default_to = ""
571 |         if feeds and isstr(feeds[0]): default_to = feeds[0]; ifeeds = feeds[1:]
572 |         else: ifeeds = feeds
573 |
574 |         if num: ifeeds = [feeds[num]]
575 |         feednum = 0
576 |
577 |         for f in ifeeds:
578 |             try:
579 |                 feednum += 1
580 |                 if not f.active: continue
581 |
582 |                 if VERBOSE: print >>warn, 'I: Processing [%d] "%s"' % (feednum, f.url)
583 |                 r = {}
584 |                 try:
585 |                     r = timelimit(FEED_TIMEOUT, parse)(f.url, f.etag, f.modified)
586 |                 except TimeoutError:
587 |                     print >>warn, 'W: feed [%d] "%s" timed out' % (feednum, f.url)
588 |                     continue
589 |
590 |                 # Handle various status conditions, as required
591 |                 if 'status' in r:
592 |                     if r.status == 301:
593 |                         print >>warn, "W: feed moved; updating", f.url, "to", r['url']
594 |                         f.url = r['url']
595 |                     elif r.status == 410:
596 |                         print >>warn, "W: feed gone; deleting", f.url
597 |                         feeds.remove(f)
598 |                         continue
599 |
600 |                 http_status = r.get('status', 200)
601 |                 if VERBOSE > 1: print >>warn, "I: http status", http_status
602 |                 http_headers = r.get('headers', {
603 |                     'content-type': 'application/rss+xml',
604 |                     'content-length':'1'})
605 |                 exc_type = r.get("bozo_exception", Exception()).__class__
606 |                 if http_status != 304 and not r.entries and not r.get('version', ''):
607 |                     if http_status not in [200, 302]:
608 |                         print >>warn, "W: error %d [%d] %s" % (http_status, feednum, f.url)
609 |
610 |                     elif contains(http_headers.get('content-type', 'rss'), 'html'):
611 |                         print >>warn, "W: looks like HTML [%d] %s" % (feednum, f.url)
612 |
613 |                     elif http_headers.get('content-length', '1') == '0':
614 |                         print >>warn, "W: empty page [%d] %s" % (feednum, f.url)
615 |
616 |                     elif hasattr(socket, 'timeout') and exc_type == socket.timeout:
617 |                         print >>warn, "W: timed out on [%d] %s" % (feednum, f.url)
618 |
619 |                     elif exc_type == IOError:
620 |                         print >>warn, 'W: "%s" [%d] %s' % (r.bozo_exception, feednum, f.url)
621 |
622 |                     elif hasattr(feedparser, 'zlib') and exc_type == feedparser.zlib.error:
623 |                         print >>warn, "W: broken compression [%d] %s" % (feednum, f.url)
624 |
625 |                     elif exc_type in socket_errors:
626 |                         exc_reason = r.bozo_exception.args[1]
627 |                         print >>warn, "W: %s [%d] %s" % (exc_reason, feednum, f.url)
628 |
629 |                     elif exc_type == urllib2.URLError:
630 |                         if r.bozo_exception.reason.__class__ in socket_errors:
631 |                             exc_reason = r.bozo_exception.reason.args[1]
632 |                         else:
633 |                             exc_reason = r.bozo_exception.reason
634 |                         print >>warn, "W: %s [%d] %s" % (exc_reason, feednum, f.url)
635 |
636 |                     elif exc_type == AttributeError:
637 |                         print >>warn, "W: %s [%d] %s" % (r.bozo_exception, feednum, f.url)
638 |
639 |                     elif exc_type == KeyboardInterrupt:
640 |                         raise r.bozo_exception
641 |
642 |                     elif r.bozo:
643 |                         print >>warn, 'E: error in [%d] "%s" feed (%s)' % (feednum, f.url, r.get("bozo_exception", "can't process"))
644 |
645 |                     else:
646 |                         print >>warn, "=== rss2email encountered a problem with this feed ==="
647 |                         print >>warn, "=== See the rss2email FAQ at http://www.allthingsrss.com/rss2email/ for assistance ==="
648 |                         print >>warn, "=== If this occurs repeatedly, send this to lindsey@allthingsrss.com ==="
649 |                         print >>warn, "E:", r.get("bozo_exception", "can't process"), f.url
650 |                         print >>warn, r
651 |                         print >>warn, "rss2email", __version__
652 |                         print >>warn, "feedparser", feedparser.__version__
653 |                         print >>warn, "html2text", h2t.__version__
654 |                         print >>warn, "Python", sys.version
655 |                         print >>warn, "=== END HERE ==="
656 |                     continue
657 |
658 |                 r.entries.reverse()
659 |
660 |                 for entry in r.entries:
661 |                     id = getID(entry)
662 |
663 |                     # If TRUST_GUID isn't set, we get back hashes of the content.
664 |                     # Instead of letting these run wild, we put them in context
665 |                     # by associating them with the actual ID (if it exists).
666 |
667 |                     frameid = entry.get('id')
668 |                     if not(frameid): frameid = id
669 |                     if type(frameid) is DictType:
670 |                         frameid = frameid.values()[0]
671 |
672 |                     # If this item's ID is in our database
673 |                     # then it's already been sent
674 |                     # and we don't need to do anything more.
675 |
676 |                     if frameid in f.seen:
677 |                         if f.seen[frameid] == id: continue
678 |
679 |                     if not (f.to or default_to):
680 |                         print "No default email address defined. Please run 'r2e email emailaddress'"
681 |                         print "Ignoring feed %s" % f.url
682 |                         break
683 |
684 |                     if 'title_detail' in entry and entry.title_detail:
685 |                         title = entry.title_detail.value
686 |                         if contains(entry.title_detail.type, 'html'):
687 |                             title = html2text(title)
688 |                     else:
689 |                         title = getContent(entry)[:70]
690 |
691 |                     title = title.replace("\n", " ").strip()
692 |
693 |                     datetime = time.gmtime()
694 |
695 |                     if DATE_HEADER:
696 |                         for datetype in DATE_HEADER_ORDER:
697 |                             kind = datetype+"_parsed"
698 |                             if kind in entry and entry[kind]: datetime = entry[kind]
699 |
700 |                     link = entry.get('link', "")
701 |
702 |                     from_addr = getEmail(r, entry)
703 |
704 |                     name = h2t.unescape(getName(r, entry))
705 |                     fromhdr = formataddr((name, from_addr,))
706 |                     tohdr = (f.to or default_to)
707 |                     subjecthdr = title
708 |                     datehdr = time.strftime("%a, %d %b %Y %H:%M:%S -0000", datetime)
709 |                     useragenthdr = "rss2email"
710 |
711 |                     # Add post tags, if available
712 |                     tagline = getTags(entry)
713 |
714 |                     extraheaders = {'Date': datehdr, 'User-Agent': useragenthdr, 'X-RSS-Feed': f.url, 'X-RSS-ID': id, 'X-RSS-URL': link, 'X-RSS-TAGS' : tagline}
715 |                     if BONUS_HEADER != '':
716 |                         for hdr in BONUS_HEADER.strip().splitlines():
717 |                             pos = hdr.strip().find(':')
718 |                             if pos > 0:
719 |                                 extraheaders[hdr[:pos]] = hdr[pos+1:].strip()
720 |                             else:
721 |                                 print >>warn, "W: malformed BONUS HEADER", BONUS_HEADER
722 |
723 |                     entrycontent = getContent(entry, HTMLOK=HTML_MAIL)
724 |                     contenttype = 'plain'
725 |                     content = ''
726 |                     if USE_CSS_STYLING and HTML_MAIL:
727 |                         contenttype = 'html'
728 |                         content = "\n"
729 |                         content += '\n'
730 |                         content += '\n'
731 |                         content += '\n'
732 |                         content += '\n'
735 |                         if ishtml(entrycontent):
736 |                             body = entrycontent[1].strip()
737 |                         else:
738 |                             body = entrycontent.strip()
739 |                         if body != '':
740 |                             content += '\n'
741 |                             content += '\n' + body + ' '+subjecthdr+'\n'
758 |                         content += "\n\n"
759 |                     else:
760 |                         if ishtml(entrycontent):
761 |                             contenttype = 'html'
762 |                             content = "\n"
763 |                             content = ("\n\n" +
764 |                                 ''+subjecthdr+'\n\n' +
765 |                                 entrycontent[1].strip() + # drop type tag (HACK: bad abstraction)
766 |                                 'URL: '+link+'' )
767 |
768 |                             if hasattr(entry,'enclosures'):
769 |                                 for enclosure in entry.enclosures:
770 |                                     if enclosure.url != "":
771 |                                         content += ('Enclosure: '+enclosure.url+"\n")
772 |                             if 'links' in entry:
773 |                                 for extralink in entry.links:
774 |                                     if ('rel' in extralink) and extralink['rel'] == u'via':
775 |                                         content += 'Via: '+extralink['title']+'\n'
776 |
777 |                             content += ("\n")
778 |                         else:
779 |                             content = entrycontent.strip() + "\n\nURL: "+link
780 |                             if hasattr(entry,'enclosures'):
781 |                                 for enclosure in entry.enclosures:
782 |                                     if enclosure.url != "":
783 |                                         content += ('\nEnclosure: ' + enclosure.url + "\n")
784 |                             if 'links' in entry:
785 |                                 for extralink in entry.links:
786 |                                     if ('rel' in extralink) and extralink['rel'] == u'via':
787 |                                         content += 'Via: '+extralink['title']+'\n'
788 |
789 |                     smtpserver = send(fromhdr, tohdr, subjecthdr, content, contenttype, extraheaders, smtpserver)
790 |
791 |                     f.seen[frameid] = id
792 |
793 |                 f.etag, f.modified = r.get('etag', None), r.get('modified', None)
794 |             except (KeyboardInterrupt, SystemExit):
795 |                 raise
796 |             except:
797 |                 print >>warn, "=== rss2email encountered a problem with this feed ==="
798 |                 print >>warn, "=== See the rss2email FAQ at http://www.allthingsrss.com/rss2email/ for assistance ==="
799 |                 print >>warn, "=== If this occurs repeatedly, send this to lindsey@allthingsrss.com ==="
800 |                 print >>warn, "E: could not parse", f.url
801 |                 traceback.print_exc(file=warn)
802 |                 print >>warn, "rss2email", __version__
803 |                 print >>warn, "feedparser", feedparser.__version__
804 |                 print >>warn, "html2text", h2t.__version__
805 |                 print >>warn, "Python", sys.version
806 |                 print >>warn, "=== END HERE ==="
807 |                 continue
808 |
809 |     finally:
810 |         unlock(feeds, feedfileObject)
811 |         if smtpserver:
812 |             smtpserver.quit()
813 |
814 | def list():
815 |     feeds, feedfileObject = load(lock=0)
816 |     default_to = ""
817 |
818 |     if feeds and isstr(feeds[0]):
819 |         default_to = feeds[0]; ifeeds = feeds[1:]; i=1
820 |         print "default email:", default_to
821 |     else: ifeeds = feeds; i = 0
822 |     for f in ifeeds:
823 |         active = ('[ ]', '[*]')[f.active]
824 |         print `i`+':',active, f.url, '('+(f.to or ('default: '+default_to))+')'
825 |         if not (f.to or default_to):
826 |             print "    W: Please define a default address with 'r2e email emailaddress'"
827 |         i+= 1
828 |
829 | def opmlexport():
830 |     import xml.sax.saxutils
831 |     feeds, feedfileObject = load(lock=0)
832 |
833 |     if feeds:
834 |         print '\n\n\n '
839 |
840 | def opmlimport(importfile):
841 |     importfileObject = None
842 |     print 'Importing feeds from', importfile
843 |     if not os.path.exists(importfile):
844 |         print 'OPML import file "%s" does not exist.' % importfile
845 |     try:
846 |         importfileObject = open(importfile, 'r')
847 |     except IOError, e:
848 |         print "OPML import file could not be opened: %s" % e
849 |         sys.exit(1)
850 |     try:
851 |         import xml.dom.minidom
852 |         dom = xml.dom.minidom.parse(importfileObject)
853 |         newfeeds = dom.getElementsByTagName('outline')
854 |     except:
855 |         print 'E: Unable to parse OPML file'
856 |         sys.exit(1)
857 |
858 |     feeds, feedfileObject = load(lock=1)
859 |
860 |     import xml.sax.saxutils
861 |
862 |     for f in newfeeds:
863 |         if f.hasAttribute('xmlUrl'):
864 |             feedurl = f.getAttribute('xmlUrl')
865 |             print 'Adding %s' % xml.sax.saxutils.unescape(feedurl)
866 |             feeds.append(Feed(feedurl, None))
867 |
868 |     unlock(feeds, feedfileObject)
869 |
870 | def delete(n):
871 |     feeds, feedfileObject = load()
872 |     if (n == 0) and (feeds and isstr(feeds[0])):
873 |         print >>warn, "W: ID has to be equal to or higher than 1"
874 |     elif n >= len(feeds):
875 |         print >>warn, "W: no such feed"
876 |     else:
877 |         print >>warn, "W: deleting feed %s" % feeds[n].url
878 |         feeds = feeds[:n] + feeds[n+1:]
879 |         if n != len(feeds):
880 |             print >>warn, "W: feed IDs have changed, list before deleting again"
881 |     unlock(feeds, feedfileObject)
882 |
883 | def toggleactive(n, active):
884 |     feeds, feedfileObject = load()
885 |     if (n == 0) and (feeds and isstr(feeds[0])):
886 |         print >>warn, "W: ID has to be equal to or higher than 1"
887 |     elif n >= len(feeds):
888 |         print >>warn, "W: no such feed"
889 |     else:
890 |         action = ('Pausing', 'Unpausing')[active]
891 |         print >>warn, "%s feed %s" % (action, feeds[n].url)
892 |         feeds[n].active = active
893 |     unlock(feeds, feedfileObject)
894 |
895 | def reset():
896 |     feeds, feedfileObject = load()
897 |     if feeds and isstr(feeds[0]):
898 |         ifeeds = feeds[1:]
899 |     else: ifeeds = feeds
900 |     for f in ifeeds:
901 |         if VERBOSE: print "Resetting %d already seen items" % len(f.seen)
902 |         f.seen = {}
903 |         f.etag = None
904 |         f.modified = None
905 |
906 |     unlock(feeds, feedfileObject)
907 |
908 | def email(addr):
909 |     feeds, feedfileObject = load()
910 |     if feeds and isstr(feeds[0]): feeds[0] = addr
911 |     else: feeds = [addr] + feeds
912 |     unlock(feeds, feedfileObject)
913 |
914 | if __name__ == '__main__':
915 |     args = sys.argv
916 |     try:
917 |         if len(args) < 3: raise InputError, "insufficient args"
918 |         feedfile, action, args = args[1], args[2], args[3:]
919 |
920 |         if action == "run":
921 |             if args and args[0] == "--no-send":
922 |                 def send(sender, recipient, subject, body, contenttype, extraheaders=None, smtpserver=None):
923 |                     if VERBOSE: print 'Not sending:', unu(subject)
924 |
925 |             if args and args[-1].isdigit(): run(int(args[-1]))
926 |             else: run()
927 |
928 |         elif action == "email":
929 |             if not args:
930 |                 raise InputError, "Action '%s' requires an argument" % action
931 |             else:
932 |                 email(args[0])
933 |
934 |         elif action == "add": add(*args)
935 |
936 |         elif action == "new":
937 |             if len(args) == 1: d = [args[0]]
938 |             else: d = []
939 |             pickle.dump(d, open(feedfile, 'w'))
940 |
941 |         elif action == "list": list()
942 |
943 |         elif action in ("help", "--help", "-h"): print __doc__
944 |
945 |         elif action == "delete":
946 |             if not args:
947 |                 raise InputError, "Action '%s' requires an argument" % action
948 |             elif args[0].isdigit():
949 |                 delete(int(args[0]))
950 |             else:
951 |                 raise InputError, "Action '%s' requires a number as its argument" % action
952 |
953 |         elif action in ("pause", "unpause"):
954 |             if not args:
955 |                 raise InputError, "Action '%s' requires an argument" % action
956 |             elif args[0].isdigit():
957 |                 active = (action == "unpause")
958 |                 toggleactive(int(args[0]), active)
959 |             else:
960 |                 raise InputError, "Action '%s' requires a number as its argument" % action
961 |
962 |         elif action == "reset": reset()
963 |
964 |         elif action == "opmlexport": opmlexport()
965 |
966 |         elif action == "opmlimport":
967 |             if not args:
968 |                 raise InputError, "OPML import '%s' requires a filename argument" % action
969 |             opmlimport(args[0])
970 |
971 |         else:
972 |             raise InputError, "Invalid action"
973 |
974 |     except InputError, e:
975 |         print "E:", e
976 |         print
977 |         print __doc__
978 |
979 |
--------------------------------------------------------------------------------
/BeautifulSoup.py:
--------------------------------------------------------------------------------
1 | """Beautiful Soup
2 | Elixir and Tonic
3 | "The Screen-Scraper's Friend"
4 | http://www.crummy.com/software/BeautifulSoup/
5 |
6 | Beautiful Soup parses a (possibly invalid) XML or HTML document into a
7 | tree representation. It provides methods and Pythonic idioms that make
8 | it easy to navigate, search, and modify the tree.
9 |
10 | A well-formed XML/HTML document yields a well-formed data
11 | structure. An ill-formed XML/HTML document yields a correspondingly
12 | ill-formed data structure. If your document is only locally
13 | well-formed, you can use this library to find and process the
14 | well-formed part of it.
15 |
16 | Beautiful Soup works with Python 2.2 and up. It has no external
17 | dependencies, but you'll have more success at converting data to UTF-8
18 | if you also install these three packages:
19 |
20 | * chardet, for auto-detecting character encodings
21 |   http://chardet.feedparser.org/
22 | * cjkcodecs and iconv_codec, which add more encodings to the ones supported
23 |   by stock Python.
24 |   http://cjkpython.i18n.org/
25 |
26 | Beautiful Soup defines classes for two main parsing strategies:
27 |
28 |  * BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific
29 |    language that kind of looks like XML.
30 |
31 |  * BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid
32 |    or invalid. This class has web browser-like heuristics for
33 |    obtaining a sensible parse tree in the face of common HTML errors.
34 |
35 | Beautiful Soup also defines a class (UnicodeDammit) for autodetecting
36 | the encoding of an HTML or XML document, and converting it to
37 | Unicode. Much of this code is taken from Mark Pilgrim's Universal Feed Parser.
38 |
39 | For more than you ever wanted to know about Beautiful Soup, see the
40 | documentation:
41 | http://www.crummy.com/software/BeautifulSoup/documentation.html
42 |
43 | Here, have some legalese:
44 |
45 | Copyright (c) 2004-2010, Leonard Richardson
46 |
47 | All rights reserved.
48 |
49 | Redistribution and use in source and binary forms, with or without
50 | modification, are permitted provided that the following conditions are
51 | met:
52 |
53 |   * Redistributions of source code must retain the above copyright
54 |     notice, this list of conditions and the following disclaimer.
55 |
56 |   * Redistributions in binary form must reproduce the above
57 |     copyright notice, this list of conditions and the following
58 |     disclaimer in the documentation and/or other materials provided
59 |     with the distribution.
60 |
61 |   * Neither the name of the the Beautiful Soup Consortium and All
62 |     Night Kosher Bakery nor the names of its contributors may be
63 |     used to endorse or promote products derived from this software
64 |     without specific prior written permission.
65 |
66 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
67 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
68 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
69 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
70 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
71 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
72 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
73 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
74 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
75 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
76 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE, DAMMIT.
77 |
78 | """
79 | from __future__ import generators
80 |
81 | __author__ = "Leonard Richardson (leonardr@segfault.org)"
82 | __version__ = "3.2.0"
83 | __copyright__ = "Copyright (c) 2004-2010 Leonard Richardson"
84 | __license__ = "New-style BSD"
85 |
86 | from sgmllib import SGMLParser, SGMLParseError
87 | import codecs
88 | import markupbase
89 | import types
90 | import re
91 | import sgmllib
92 | try:
93 |     from htmlentitydefs import name2codepoint
94 | except ImportError:
95 |     name2codepoint = {}
96 | try:
97 |     set
98 | except NameError:
99 |     from sets import Set as set
100 |
101 | #These hacks make Beautiful Soup able to parse XML with namespaces
102 | sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
103 | markupbase._declname_match = re.compile(r'[a-zA-Z][-_.:a-zA-Z0-9]*\s*').match
104 |
105 | DEFAULT_OUTPUT_ENCODING = "utf-8"
106 |
107 | def _match_css_class(str):
108 |     """Build a RE to match the given CSS class."""
109 |     return re.compile(r"(^|.*\s)%s($|\s)" % str)
110 |
111 | # First, the classes that represent markup elements.
112 |
113 | class PageElement(object):
114 |     """Contains the navigational information for some part of the page
115 |     (either a tag or a piece of text)"""
116 |
117 |     def setup(self, parent=None, previous=None):
118 |         """Sets up the initial relations between this element and
119 |         other elements."""
120 |         self.parent = parent
121 |         self.previous = previous
122 |         self.next = None
123 |         self.previousSibling = None
124 |         self.nextSibling = None
125 |         if self.parent and self.parent.contents:
126 |             self.previousSibling = self.parent.contents[-1]
127 |             self.previousSibling.nextSibling = self
128 |
129 |     def replaceWith(self, replaceWith):
130 |         oldParent = self.parent
131 |         myIndex = self.parent.index(self)
132 |         if hasattr(replaceWith, "parent")\
133 |            and replaceWith.parent is self.parent:
134 |             # We're replacing this element with one of its siblings.
135 |             index = replaceWith.parent.index(replaceWith)
136 |             if index and index < myIndex:
137 |                 # Furthermore, it comes before this element. That
138 |                 # means that when we extract it, the index of this
139 |                 # element will change.
140 | myIndex = myIndex - 1 141 | self.extract() 142 | oldParent.insert(myIndex, replaceWith) 143 | 144 | def replaceWithChildren(self): 145 | myParent = self.parent 146 | myIndex = self.parent.index(self) 147 | self.extract() 148 | reversedChildren = list(self.contents) 149 | reversedChildren.reverse() 150 | for child in reversedChildren: 151 | myParent.insert(myIndex, child) 152 | 153 | def extract(self): 154 | """Destructively rips this element out of the tree.""" 155 | if self.parent: 156 | try: 157 | del self.parent.contents[self.parent.index(self)] 158 | except ValueError: 159 | pass 160 | 161 | #Find the two elements that would be next to each other if 162 | #this element (and any children) hadn't been parsed. Connect 163 | #the two. 164 | lastChild = self._lastRecursiveChild() 165 | nextElement = lastChild.next 166 | 167 | if self.previous: 168 | self.previous.next = nextElement 169 | if nextElement: 170 | nextElement.previous = self.previous 171 | self.previous = None 172 | lastChild.next = None 173 | 174 | self.parent = None 175 | if self.previousSibling: 176 | self.previousSibling.nextSibling = self.nextSibling 177 | if self.nextSibling: 178 | self.nextSibling.previousSibling = self.previousSibling 179 | self.previousSibling = self.nextSibling = None 180 | return self 181 | 182 | def _lastRecursiveChild(self): 183 | "Finds the last element beneath this object to be parsed." 184 | lastChild = self 185 | while hasattr(lastChild, 'contents') and lastChild.contents: 186 | lastChild = lastChild.contents[-1] 187 | return lastChild 188 | 189 | def insert(self, position, newChild): 190 | if isinstance(newChild, basestring) \ 191 | and not isinstance(newChild, NavigableString): 192 | newChild = NavigableString(newChild) 193 | 194 | position = min(position, len(self.contents)) 195 | if hasattr(newChild, 'parent') and newChild.parent is not None: 196 | # We're 'inserting' an element that's already one 197 | # of this object's children. 198 | if newChild.parent is self: 199 | index = self.index(newChild) 200 | if index > position: 201 | # Furthermore we're moving it further down the 202 | # list of this object's children. That means that 203 | # when we extract this element, our target index 204 | # will jump down one. 205 | position = position - 1 206 | newChild.extract() 207 | 208 | newChild.parent = self 209 | previousChild = None 210 | if position == 0: 211 | newChild.previousSibling = None 212 | newChild.previous = self 213 | else: 214 | previousChild = self.contents[position-1] 215 | newChild.previousSibling = previousChild 216 | newChild.previousSibling.nextSibling = newChild 217 | newChild.previous = previousChild._lastRecursiveChild() 218 | if newChild.previous: 219 | newChild.previous.next = newChild 220 | 221 | newChildsLastElement = newChild._lastRecursiveChild() 222 | 223 | if position >= len(self.contents): 224 | newChild.nextSibling = None 225 | 226 | parent = self 227 | parentsNextSibling = None 228 | while not parentsNextSibling: 229 | parentsNextSibling = parent.nextSibling 230 | parent = parent.parent 231 | if not parent: # This is the last element in the document. 
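# (No ancestor has a next sibling, so nothing follows the newly
# inserted child in document order; its next pointer stays None.)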
232 | break 233 | if parentsNextSibling: 234 | newChildsLastElement.next = parentsNextSibling 235 | else: 236 | newChildsLastElement.next = None 237 | else: 238 | nextChild = self.contents[position] 239 | newChild.nextSibling = nextChild 240 | if newChild.nextSibling: 241 | newChild.nextSibling.previousSibling = newChild 242 | newChildsLastElement.next = nextChild 243 | 244 | if newChildsLastElement.next: 245 | newChildsLastElement.next.previous = newChildsLastElement 246 | self.contents.insert(position, newChild) 247 | 248 | def append(self, tag): 249 | """Appends the given tag to the contents of this tag.""" 250 | self.insert(len(self.contents), tag) 251 | 252 | def findNext(self, name=None, attrs={}, text=None, **kwargs): 253 | """Returns the first item that matches the given criteria and 254 | appears after this Tag in the document.""" 255 | return self._findOne(self.findAllNext, name, attrs, text, **kwargs) 256 | 257 | def findAllNext(self, name=None, attrs={}, text=None, limit=None, 258 | **kwargs): 259 | """Returns all items that match the given criteria and appear 260 | after this Tag in the document.""" 261 | return self._findAll(name, attrs, text, limit, self.nextGenerator, 262 | **kwargs) 263 | 264 | def findNextSibling(self, name=None, attrs={}, text=None, **kwargs): 265 | """Returns the closest sibling to this Tag that matches the 266 | given criteria and appears after this Tag in the document.""" 267 | return self._findOne(self.findNextSiblings, name, attrs, text, 268 | **kwargs) 269 | 270 | def findNextSiblings(self, name=None, attrs={}, text=None, limit=None, 271 | **kwargs): 272 | """Returns the siblings of this Tag that match the given 273 | criteria and appear after this Tag in the document.""" 274 | return self._findAll(name, attrs, text, limit, 275 | self.nextSiblingGenerator, **kwargs) 276 | fetchNextSiblings = findNextSiblings # Compatibility with pre-3.x 277 | 278 | def findPrevious(self, name=None, attrs={}, text=None, **kwargs): 279 | """Returns the first item that matches the given criteria and 280 | appears before this Tag in the document.""" 281 | return self._findOne(self.findAllPrevious, name, attrs, text, **kwargs) 282 | 283 | def findAllPrevious(self, name=None, attrs={}, text=None, limit=None, 284 | **kwargs): 285 | """Returns all items that match the given criteria and appear 286 | before this Tag in the document.""" 287 | return self._findAll(name, attrs, text, limit, self.previousGenerator, 288 | **kwargs) 289 | fetchPrevious = findAllPrevious # Compatibility with pre-3.x 290 | 291 | def findPreviousSibling(self, name=None, attrs={}, text=None, **kwargs): 292 | """Returns the closest sibling to this Tag that matches the 293 | given criteria and appears before this Tag in the document.""" 294 | return self._findOne(self.findPreviousSiblings, name, attrs, text, 295 | **kwargs) 296 | 297 | def findPreviousSiblings(self, name=None, attrs={}, text=None, 298 | limit=None, **kwargs): 299 | """Returns the siblings of this Tag that match the given 300 | criteria and appear before this Tag in the document.""" 301 | return self._findAll(name, attrs, text, limit, 302 | self.previousSiblingGenerator, **kwargs) 303 | fetchPreviousSiblings = findPreviousSiblings # Compatibility with pre-3.x 304 | 305 | def findParent(self, name=None, attrs={}, **kwargs): 306 | """Returns the closest parent of this Tag that matches the given 307 | criteria.""" 308 | # NOTE: We can't use _findOne because findParents takes a different 309 | # set of arguments. 
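# Illustrative use of findParent/findParents (markup invented for the
# example):
#
#   >>> soup = BeautifulSoup('<div id="a"><div id="b"><b>x</b></div></div>')
#   >>> soup.b.findParent('div')['id']
#   u'b'
#   >>> [d['id'] for d in soup.b.findParents('div')]
#   [u'b', u'a']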
310 | r = None 311 | l = self.findParents(name, attrs, 1) 312 | if l: 313 | r = l[0] 314 | return r 315 | 316 | def findParents(self, name=None, attrs={}, limit=None, **kwargs): 317 | """Returns the parents of this Tag that match the given 318 | criteria.""" 319 | 320 | return self._findAll(name, attrs, None, limit, self.parentGenerator, 321 | **kwargs) 322 | fetchParents = findParents # Compatibility with pre-3.x 323 | 324 | #These methods do the real heavy lifting. 325 | 326 | def _findOne(self, method, name, attrs, text, **kwargs): 327 | r = None 328 | l = method(name, attrs, text, 1, **kwargs) 329 | if l: 330 | r = l[0] 331 | return r 332 | 333 | def _findAll(self, name, attrs, text, limit, generator, **kwargs): 334 | "Iterates over a generator looking for things that match." 335 | 336 | if isinstance(name, SoupStrainer): 337 | strainer = name 338 | # (Possibly) special case some findAll*(...) searches 339 | elif text is None and not limit and not attrs and not kwargs: 340 | # findAll*(True) 341 | if name is True: 342 | return [element for element in generator() 343 | if isinstance(element, Tag)] 344 | # findAll*('tag-name') 345 | elif isinstance(name, basestring): 346 | return [element for element in generator() 347 | if isinstance(element, Tag) and 348 | element.name == name] 349 | else: 350 | strainer = SoupStrainer(name, attrs, text, **kwargs) 351 | # Build a SoupStrainer 352 | else: 353 | strainer = SoupStrainer(name, attrs, text, **kwargs) 354 | results = ResultSet(strainer) 355 | g = generator() 356 | while True: 357 | try: 358 | i = g.next() 359 | except StopIteration: 360 | break 361 | if i: 362 | found = strainer.search(i) 363 | if found: 364 | results.append(found) 365 | if limit and len(results) >= limit: 366 | break 367 | return results 368 | 369 | #These Generators can be used to navigate starting from both 370 | #NavigableStrings and Tags. 371 | def nextGenerator(self): 372 | i = self 373 | while i is not None: 374 | i = i.next 375 | yield i 376 | 377 | def nextSiblingGenerator(self): 378 | i = self 379 | while i is not None: 380 | i = i.nextSibling 381 | yield i 382 | 383 | def previousGenerator(self): 384 | i = self 385 | while i is not None: 386 | i = i.previous 387 | yield i 388 | 389 | def previousSiblingGenerator(self): 390 | i = self 391 | while i is not None: 392 | i = i.previousSibling 393 | yield i 394 | 395 | def parentGenerator(self): 396 | i = self 397 | while i is not None: 398 | i = i.parent 399 | yield i 400 | 401 | # Utility methods 402 | def substituteEncoding(self, str, encoding=None): 403 | encoding = encoding or "utf-8" 404 | return str.replace("%SOUP-ENCODING%", encoding) 405 | 406 | def toEncoding(self, s, encoding=None): 407 | """Encodes an object to a string in some encoding, or to Unicode. 408 | .""" 409 | if isinstance(s, unicode): 410 | if encoding: 411 | s = s.encode(encoding) 412 | elif isinstance(s, str): 413 | if encoding: 414 | s = s.encode(encoding) 415 | else: 416 | s = unicode(s) 417 | else: 418 | if encoding: 419 | s = self.toEncoding(str(s), encoding) 420 | else: 421 | s = unicode(s) 422 | return s 423 | 424 | class NavigableString(unicode, PageElement): 425 | 426 | def __new__(cls, value): 427 | """Create a new NavigableString. 428 | 429 | When unpickling a NavigableString, this method is called with 430 | the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be 431 | passed in to the superclass's __new__ or the superclass won't know 432 | how to handle non-ASCII characters. 
433 | """ 434 | if isinstance(value, unicode): 435 | return unicode.__new__(cls, value) 436 | return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING) 437 | 438 | def __getnewargs__(self): 439 | return (NavigableString.__str__(self),) 440 | 441 | def __getattr__(self, attr): 442 | """text.string gives you text. This is for backwards 443 | compatibility for Navigable*String, but for CData* it lets you 444 | get the string without the CData wrapper.""" 445 | if attr == 'string': 446 | return self 447 | else: 448 | raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr) 449 | 450 | def __unicode__(self): 451 | return str(self).decode(DEFAULT_OUTPUT_ENCODING) 452 | 453 | def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING): 454 | if encoding: 455 | return self.encode(encoding) 456 | else: 457 | return self 458 | 459 | class CData(NavigableString): 460 | 461 | def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING): 462 | return "" % NavigableString.__str__(self, encoding) 463 | 464 | class ProcessingInstruction(NavigableString): 465 | def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING): 466 | output = self 467 | if "%SOUP-ENCODING%" in output: 468 | output = self.substituteEncoding(output, encoding) 469 | return "%s?>" % self.toEncoding(output, encoding) 470 | 471 | class Comment(NavigableString): 472 | def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING): 473 | return "" % NavigableString.__str__(self, encoding) 474 | 475 | class Declaration(NavigableString): 476 | def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING): 477 | return "" % NavigableString.__str__(self, encoding) 478 | 479 | class Tag(PageElement): 480 | 481 | """Represents a found HTML tag with its attributes and contents.""" 482 | 483 | def _invert(h): 484 | "Cheap function to invert a hash." 485 | i = {} 486 | for k,v in h.items(): 487 | i[v] = k 488 | return i 489 | 490 | XML_ENTITIES_TO_SPECIAL_CHARS = { "apos" : "'", 491 | "quot" : '"', 492 | "amp" : "&", 493 | "lt" : "<", 494 | "gt" : ">" } 495 | 496 | XML_SPECIAL_CHARS_TO_ENTITIES = _invert(XML_ENTITIES_TO_SPECIAL_CHARS) 497 | 498 | def _convertEntities(self, match): 499 | """Used in a call to re.sub to replace HTML, XML, and numeric 500 | entities with the appropriate Unicode characters. If HTML 501 | entities are being converted, any unrecognized entities are 502 | escaped.""" 503 | x = match.group(1) 504 | if self.convertHTMLEntities and x in name2codepoint: 505 | return unichr(name2codepoint[x]) 506 | elif x in self.XML_ENTITIES_TO_SPECIAL_CHARS: 507 | if self.convertXMLEntities: 508 | return self.XML_ENTITIES_TO_SPECIAL_CHARS[x] 509 | else: 510 | return u'&%s;' % x 511 | elif len(x) > 0 and x[0] == '#': 512 | # Handle numeric entities 513 | if len(x) > 1 and x[1] == 'x': 514 | return unichr(int(x[2:], 16)) 515 | else: 516 | return unichr(int(x[1:])) 517 | 518 | elif self.escapeUnrecognizedEntities: 519 | return u'&%s;' % x 520 | else: 521 | return u'&%s;' % x 522 | 523 | def __init__(self, parser, name, attrs=None, parent=None, 524 | previous=None): 525 | "Basic constructor." 
526 | 527 | # We don't actually store the parser object: that lets extracted 528 | # chunks be garbage-collected 529 | self.parserClass = parser.__class__ 530 | self.isSelfClosing = parser.isSelfClosingTag(name) 531 | self.name = name 532 | if attrs is None: 533 | attrs = [] 534 | elif isinstance(attrs, dict): 535 | attrs = attrs.items() 536 | self.attrs = attrs 537 | self.contents = [] 538 | self.setup(parent, previous) 539 | self.hidden = False 540 | self.containsSubstitutions = False 541 | self.convertHTMLEntities = parser.convertHTMLEntities 542 | self.convertXMLEntities = parser.convertXMLEntities 543 | self.escapeUnrecognizedEntities = parser.escapeUnrecognizedEntities 544 | 545 | # Convert any HTML, XML, or numeric entities in the attribute values. 546 | convert = lambda(k, val): (k, 547 | re.sub("&(#\d+|#x[0-9a-fA-F]+|\w+);", 548 | self._convertEntities, 549 | val)) 550 | self.attrs = map(convert, self.attrs) 551 | 552 | def getString(self): 553 | if (len(self.contents) == 1 554 | and isinstance(self.contents[0], NavigableString)): 555 | return self.contents[0] 556 | 557 | def setString(self, string): 558 | """Replace the contents of the tag with a string""" 559 | self.clear() 560 | self.append(string) 561 | 562 | string = property(getString, setString) 563 | 564 | def getText(self, separator=u""): 565 | if not len(self.contents): 566 | return u"" 567 | stopNode = self._lastRecursiveChild().next 568 | strings = [] 569 | current = self.contents[0] 570 | while current is not stopNode: 571 | if isinstance(current, NavigableString): 572 | strings.append(current.strip()) 573 | current = current.next 574 | return separator.join(strings) 575 | 576 | text = property(getText) 577 | 578 | def get(self, key, default=None): 579 | """Returns the value of the 'key' attribute for the tag, or 580 | the value given for 'default' if it doesn't have that 581 | attribute.""" 582 | return self._getAttrMap().get(key, default) 583 | 584 | def clear(self): 585 | """Extract all children.""" 586 | for child in self.contents[:]: 587 | child.extract() 588 | 589 | def index(self, element): 590 | for i, child in enumerate(self.contents): 591 | if child is element: 592 | return i 593 | raise ValueError("Tag.index: element not in tag") 594 | 595 | def has_key(self, key): 596 | return self._getAttrMap().has_key(key) 597 | 598 | def __getitem__(self, key): 599 | """tag[key] returns the value of the 'key' attribute for the tag, 600 | and throws an exception if it's not there.""" 601 | return self._getAttrMap()[key] 602 | 603 | def __iter__(self): 604 | "Iterating over a tag iterates over its contents." 605 | return iter(self.contents) 606 | 607 | def __len__(self): 608 | "The length of a tag is the length of its list of contents." 609 | return len(self.contents) 610 | 611 | def __contains__(self, x): 612 | return x in self.contents 613 | 614 | def __nonzero__(self): 615 | "A tag is non-None even if it has no contents." 616 | return True 617 | 618 | def __setitem__(self, key, value): 619 | """Setting tag[key] sets the value of the 'key' attribute for the 620 | tag.""" 621 | self._getAttrMap() 622 | self.attrMap[key] = value 623 | found = False 624 | for i in range(0, len(self.attrs)): 625 | if self.attrs[i][0] == key: 626 | self.attrs[i] = (key, value) 627 | found = True 628 | if not found: 629 | self.attrs.append((key, value)) 630 | self._getAttrMap()[key] = value 631 | 632 | def __delitem__(self, key): 633 | "Deleting tag[key] deletes all 'key' attributes for the tag." 
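# Attribute access in practice (markup invented for the example):
#
#   >>> soup = BeautifulSoup('<a href="http://example.invalid/" id="x">hi</a>')
#   >>> soup.a['href']
#   u'http://example.invalid/'
#   >>> soup.a.get('title', 'n/a')     # missing attributes fall back to default
#   'n/a'
#   >>> del soup.a['id']               # drops every 'id' pair from attrs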
634 | for item in self.attrs: 635 | if item[0] == key: 636 | self.attrs.remove(item) 637 | #We don't break because bad HTML can define the same 638 | #attribute multiple times. 639 | self._getAttrMap() 640 | if self.attrMap.has_key(key): 641 | del self.attrMap[key] 642 | 643 | def __call__(self, *args, **kwargs): 644 | """Calling a tag like a function is the same as calling its 645 | findAll() method. Eg. tag('a') returns a list of all the A tags 646 | found within this tag.""" 647 | return apply(self.findAll, args, kwargs) 648 | 649 | def __getattr__(self, tag): 650 | #print "Getattr %s.%s" % (self.__class__, tag) 651 | if len(tag) > 3 and tag.rfind('Tag') == len(tag)-3: 652 | return self.find(tag[:-3]) 653 | elif tag.find('__') != 0: 654 | return self.find(tag) 655 | raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__, tag) 656 | 657 | def __eq__(self, other): 658 | """Returns true iff this tag has the same name, the same attributes, 659 | and the same contents (recursively) as the given tag. 660 | 661 | NOTE: right now this will return false if two tags have the 662 | same attributes in a different order. Should this be fixed?""" 663 | if other is self: 664 | return True 665 | if not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or self.name != other.name or self.attrs != other.attrs or len(self) != len(other): 666 | return False 667 | for i in range(0, len(self.contents)): 668 | if self.contents[i] != other.contents[i]: 669 | return False 670 | return True 671 | 672 | def __ne__(self, other): 673 | """Returns true iff this tag is not identical to the other tag, 674 | as defined in __eq__.""" 675 | return not self == other 676 | 677 | def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING): 678 | """Renders this tag as a string.""" 679 | return self.__str__(encoding) 680 | 681 | def __unicode__(self): 682 | return self.__str__(None) 683 | 684 | BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|" 685 | + "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)" 686 | + ")") 687 | 688 | def _sub_entity(self, x): 689 | """Used with a regular expression to substitute the 690 | appropriate XML entity for an XML special character.""" 691 | return "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";" 692 | 693 | def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING, 694 | prettyPrint=False, indentLevel=0): 695 | """Returns a string or Unicode representation of this tag and 696 | its contents. To get Unicode, pass None for encoding. 697 | 698 | NOTE: since Python's HTML parser consumes whitespace, this 699 | method is not certain to reproduce the whitespace present in 700 | the original string.""" 701 | 702 | encodedName = self.toEncoding(self.name, encoding) 703 | 704 | attrs = [] 705 | if self.attrs: 706 | for key, val in self.attrs: 707 | fmt = '%s="%s"' 708 | if isinstance(val, basestring): 709 | if self.containsSubstitutions and '%SOUP-ENCODING%' in val: 710 | val = self.substituteEncoding(val, encoding) 711 | 712 | # The attribute value either: 713 | # 714 | # * Contains no embedded double quotes or single quotes. 715 | # No problem: we enclose it in double quotes. 716 | # * Contains embedded single quotes. No problem: 717 | # double quotes work here too. 718 | # * Contains embedded double quotes. No problem: 719 | # we enclose it in single quotes. 720 | # * Embeds both single _and_ double quotes. This 721 | # can't happen naturally, but it can happen if 722 | # you modify an attribute value after parsing 723 | # the document. 
Now we have a bit of a 724 | # problem. We solve it by enclosing the 725 | # attribute in single quotes, and escaping any 726 | # embedded single quotes to XML entities. 727 | if '"' in val: 728 | fmt = "%s='%s'" 729 | if "'" in val: 730 | # TODO: replace with apos when 731 | # appropriate. 732 | val = val.replace("'", "&squot;") 733 | 734 | # Now we're okay w/r/t quotes. But the attribute 735 | # value might also contain angle brackets, or 736 | # ampersands that aren't part of entities. We need 737 | # to escape those to XML entities too. 738 | val = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, val) 739 | 740 | attrs.append(fmt % (self.toEncoding(key, encoding), 741 | self.toEncoding(val, encoding))) 742 | close = '' 743 | closeTag = '' 744 | if self.isSelfClosing: 745 | close = ' /' 746 | else: 747 | closeTag = '%s>' % encodedName 748 | 749 | indentTag, indentContents = 0, 0 750 | if prettyPrint: 751 | indentTag = indentLevel 752 | space = (' ' * (indentTag-1)) 753 | indentContents = indentTag + 1 754 | contents = self.renderContents(encoding, prettyPrint, indentContents) 755 | if self.hidden: 756 | s = contents 757 | else: 758 | s = [] 759 | attributeString = '' 760 | if attrs: 761 | attributeString = ' ' + ' '.join(attrs) 762 | if prettyPrint: 763 | s.append(space) 764 | s.append('<%s%s%s>' % (encodedName, attributeString, close)) 765 | if prettyPrint: 766 | s.append("\n") 767 | s.append(contents) 768 | if prettyPrint and contents and contents[-1] != "\n": 769 | s.append("\n") 770 | if prettyPrint and closeTag: 771 | s.append(space) 772 | s.append(closeTag) 773 | if prettyPrint and closeTag and self.nextSibling: 774 | s.append("\n") 775 | s = ''.join(s) 776 | return s 777 | 778 | def decompose(self): 779 | """Recursively destroys the contents of this tree.""" 780 | self.extract() 781 | if len(self.contents) == 0: 782 | return 783 | current = self.contents[0] 784 | while current is not None: 785 | next = current.next 786 | if isinstance(current, Tag): 787 | del current.contents[:] 788 | current.parent = None 789 | current.previous = None 790 | current.previousSibling = None 791 | current.next = None 792 | current.nextSibling = None 793 | current = next 794 | 795 | def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING): 796 | return self.__str__(encoding, True) 797 | 798 | def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING, 799 | prettyPrint=False, indentLevel=0): 800 | """Renders the contents of this tag as a string in the given 801 | encoding. If encoding is None, returns a Unicode string..""" 802 | s=[] 803 | for c in self: 804 | text = None 805 | if isinstance(c, NavigableString): 806 | text = c.__str__(encoding) 807 | elif isinstance(c, Tag): 808 | s.append(c.__str__(encoding, prettyPrint, indentLevel)) 809 | if text and prettyPrint: 810 | text = text.strip() 811 | if text: 812 | if prettyPrint: 813 | s.append(" " * (indentLevel-1)) 814 | s.append(text) 815 | if prettyPrint: 816 | s.append("\n") 817 | return ''.join(s) 818 | 819 | #Soup methods 820 | 821 | def find(self, name=None, attrs={}, recursive=True, text=None, 822 | **kwargs): 823 | """Return only the first child of this Tag matching the given 824 | criteria.""" 825 | r = None 826 | l = self.findAll(name, attrs, recursive, text, 1, **kwargs) 827 | if l: 828 | r = l[0] 829 | return r 830 | findChild = find 831 | 832 | def findAll(self, name=None, attrs={}, recursive=True, text=None, 833 | limit=None, **kwargs): 834 | """Extracts a list of Tag objects that match the given 835 | criteria. 
You can specify the name of the Tag and any 836 | attributes you want the Tag to have. 837 | 838 | The value of a key-value pair in the 'attrs' map can be a 839 | string, a list of strings, a regular expression object, or a 840 | callable that takes a string and returns whether or not the 841 | string matches for some custom definition of 'matches'. The 842 | same is true of the tag name.""" 843 | generator = self.recursiveChildGenerator 844 | if not recursive: 845 | generator = self.childGenerator 846 | return self._findAll(name, attrs, text, limit, generator, **kwargs) 847 | findChildren = findAll 848 | 849 | # Pre-3.x compatibility methods 850 | first = find 851 | fetch = findAll 852 | 853 | def fetchText(self, text=None, recursive=True, limit=None): 854 | return self.findAll(text=text, recursive=recursive, limit=limit) 855 | 856 | def firstText(self, text=None, recursive=True): 857 | return self.find(text=text, recursive=recursive) 858 | 859 | #Private methods 860 | 861 | def _getAttrMap(self): 862 | """Initializes a map representation of this tag's attributes, 863 | if not already initialized.""" 864 | if not getattr(self, 'attrMap'): 865 | self.attrMap = {} 866 | for (key, value) in self.attrs: 867 | self.attrMap[key] = value 868 | return self.attrMap 869 | 870 | #Generator methods 871 | def childGenerator(self): 872 | # Just use the iterator from the contents 873 | return iter(self.contents) 874 | 875 | def recursiveChildGenerator(self): 876 | if not len(self.contents): 877 | raise StopIteration 878 | stopNode = self._lastRecursiveChild().next 879 | current = self.contents[0] 880 | while current is not stopNode: 881 | yield current 882 | current = current.next 883 | 884 | 885 | # Next, a couple classes to represent queries and their results. 
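# A usage sketch of the search API defined above, ahead of the query
# classes that implement it (document and URL are invented for the
# example; run it with this module on the import path):
#
#   import re
#   from BeautifulSoup import BeautifulSoup, SoupStrainer
#
#   doc = '<div><a href="http://a.invalid/1">one</a><a name="x">two</a></div>'
#   soup = BeautifulSoup(doc)
#   # Keyword arguments become attribute matchers; a value may be a string,
#   # a regexp, a callable, a list, or True (meaning "has this attribute").
#   print soup.findAll('a', href=re.compile('^http:'))
#   print soup.find('a', attrs={'name': 'x'})  # 'name' must go via attrs
#   # A SoupStrainer passed as parseOnlyThese keeps only matching tags in
#   # the tree, which saves memory on large documents.
#   links = BeautifulSoup(doc, parseOnlyThese=SoupStrainer('a'))
#   print [a['href'] for a in links.findAll('a', href=True)]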
886 | class SoupStrainer: 887 | """Encapsulates a number of ways of matching a markup element (tag or 888 | text).""" 889 | 890 | def __init__(self, name=None, attrs={}, text=None, **kwargs): 891 | self.name = name 892 | if isinstance(attrs, basestring): 893 | kwargs['class'] = _match_css_class(attrs) 894 | attrs = None 895 | if kwargs: 896 | if attrs: 897 | attrs = attrs.copy() 898 | attrs.update(kwargs) 899 | else: 900 | attrs = kwargs 901 | self.attrs = attrs 902 | self.text = text 903 | 904 | def __str__(self): 905 | if self.text: 906 | return self.text 907 | else: 908 | return "%s|%s" % (self.name, self.attrs) 909 | 910 | def searchTag(self, markupName=None, markupAttrs={}): 911 | found = None 912 | markup = None 913 | if isinstance(markupName, Tag): 914 | markup = markupName 915 | markupAttrs = markup 916 | callFunctionWithTagData = callable(self.name) \ 917 | and not isinstance(markupName, Tag) 918 | 919 | if (not self.name) \ 920 | or callFunctionWithTagData \ 921 | or (markup and self._matches(markup, self.name)) \ 922 | or (not markup and self._matches(markupName, self.name)): 923 | if callFunctionWithTagData: 924 | match = self.name(markupName, markupAttrs) 925 | else: 926 | match = True 927 | markupAttrMap = None 928 | for attr, matchAgainst in self.attrs.items(): 929 | if not markupAttrMap: 930 | if hasattr(markupAttrs, 'get'): 931 | markupAttrMap = markupAttrs 932 | else: 933 | markupAttrMap = {} 934 | for k,v in markupAttrs: 935 | markupAttrMap[k] = v 936 | attrValue = markupAttrMap.get(attr) 937 | if not self._matches(attrValue, matchAgainst): 938 | match = False 939 | break 940 | if match: 941 | if markup: 942 | found = markup 943 | else: 944 | found = markupName 945 | return found 946 | 947 | def search(self, markup): 948 | #print 'looking for %s in %s' % (self, markup) 949 | found = None 950 | # If given a list of items, scan it for a text element that 951 | # matches. 952 | if hasattr(markup, "__iter__") \ 953 | and not isinstance(markup, Tag): 954 | for element in markup: 955 | if isinstance(element, NavigableString) \ 956 | and self.search(element): 957 | found = element 958 | break 959 | # If it's a Tag, make sure its name or attributes match. 960 | # Don't bother with Tags if we're searching for text. 961 | elif isinstance(markup, Tag): 962 | if not self.text: 963 | found = self.searchTag(markup) 964 | # If it's text, make sure the text matches. 965 | elif isinstance(markup, NavigableString) or \ 966 | isinstance(markup, basestring): 967 | if self._matches(markup, self.text): 968 | found = markup 969 | else: 970 | raise Exception, "I don't know how to match against a %s" \ 971 | % markup.__class__ 972 | return found 973 | 974 | def _matches(self, markup, matchAgainst): 975 | #print "Matching %s against %s" % (markup, matchAgainst) 976 | result = False 977 | if matchAgainst is True: 978 | result = markup is not None 979 | elif callable(matchAgainst): 980 | result = matchAgainst(markup) 981 | else: 982 | #Custom match methods take the tag as an argument, but all 983 | #other ways of matching match the tag name as a string. 984 | if isinstance(markup, Tag): 985 | markup = markup.name 986 | if markup and not isinstance(markup, basestring): 987 | markup = unicode(markup) 988 | #Now we know that chunk is either a string, or None. 989 | if hasattr(matchAgainst, 'match'): 990 | # It's a regexp object. 
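# (search(), not match(): the regexp may hit anywhere in the tag
# name or attribute value being tested.)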
991 |                 result = markup and matchAgainst.search(markup)
992 |             elif hasattr(matchAgainst, '__iter__'): # list-like
993 |                 result = markup in matchAgainst
994 |             elif hasattr(matchAgainst, 'items'):
995 |                 result = markup.has_key(matchAgainst)
996 |             elif matchAgainst and isinstance(markup, basestring):
997 |                 if isinstance(markup, unicode):
998 |                     matchAgainst = unicode(matchAgainst)
999 |                 else:
1000 |                     matchAgainst = str(matchAgainst)
1001 | 
1002 |             if not result:
1003 |                 result = matchAgainst == markup
1004 |         return result
1005 | 
1006 | class ResultSet(list):
1007 |     """A ResultSet is just a list that keeps track of the SoupStrainer
1008 |     that created it."""
1009 |     def __init__(self, source):
1010 |         list.__init__([])
1011 |         self.source = source
1012 | 
1013 | # Now, some helper functions.
1014 | 
1015 | def buildTagMap(default, *args):
1016 |     """Turns a list of maps, lists, or scalars into a single map.
1017 |     Used to build the SELF_CLOSING_TAGS, NESTABLE_TAGS, and
1018 |     NESTING_RESET_TAGS maps out of lists and partial maps."""
1019 |     built = {}
1020 |     for portion in args:
1021 |         if hasattr(portion, 'items'):
1022 |             #It's a map. Merge it.
1023 |             for k,v in portion.items():
1024 |                 built[k] = v
1025 |         elif hasattr(portion, '__iter__'): # is a list
1026 |             #It's a list. Map each item to the default.
1027 |             for k in portion:
1028 |                 built[k] = default
1029 |         else:
1030 |             #It's a scalar. Map it to the default.
1031 |             built[portion] = default
1032 |     return built
1033 | 
1034 | # Now, the parser classes.
1035 | 
1036 | class BeautifulStoneSoup(Tag, SGMLParser):
1037 | 
1038 |     """This class contains the basic parser and search code. It defines
1039 |     a parser that knows nothing about tag behavior except for the
1040 |     following:
1041 | 
1042 |     You can't close a tag without closing all the tags it encloses.
1043 |     That is, "<foo><bar></foo>" actually means
1044 |     "<foo><bar></bar></foo>".
1045 | 
1046 |     [Another possible explanation is "<foo><bar /></foo>", but since
1047 |     this class defines no SELF_CLOSING_TAGS, it will never use that
1048 |     explanation.]
1049 | 
1050 |     This class is useful for parsing XML or made-up markup languages,
1051 |     or when BeautifulSoup makes an assumption counter to what you were
1052 |     expecting."""
1053 | 
1054 |     SELF_CLOSING_TAGS = {}
1055 |     NESTABLE_TAGS = {}
1056 |     RESET_NESTING_TAGS = {}
1057 |     QUOTE_TAGS = {}
1058 |     PRESERVE_WHITESPACE_TAGS = []
1059 | 
1060 |     MARKUP_MASSAGE = [(re.compile('(<[^<>]*)/>'),
1061 |                        lambda x: x.group(1) + ' />'),
1062 |                       (re.compile('<!\s+([^<>]*)>'),
1063 |                        lambda x: '<!' + x.group(1) + '>')
1064 |                       ]
1065 | 
1066 |     ROOT_TAG_NAME = u'[document]'
1067 | 
1068 |     HTML_ENTITIES = "html"
1069 |     XML_ENTITIES = "xml"
1070 |     XHTML_ENTITIES = "xhtml"
1071 |     # TODO: This only exists for backwards-compatibility
1072 |     ALL_ENTITIES = XHTML_ENTITIES
1073 | 
1074 |     # Used when determining whether a text node is all whitespace and
1075 |     # can be replaced with a single space. A text node that contains
1076 |     # fancy Unicode spaces (usually non-breaking) should be left
1077 |     # alone.
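# Parsing XML with the class above (markup invented for the example):
#
#   >>> from BeautifulSoup import BeautifulStoneSoup
#   >>> soup = BeautifulStoneSoup('<book><title lang="en">Soup</title></book>')
#   >>> soup.book.title.string, soup.book.title['lang']
#   (u'Soup', u'en')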
1078 | STRIP_ASCII_SPACES = { 9: None, 10: None, 12: None, 13: None, 32: None, } 1079 | 1080 | def __init__(self, markup="", parseOnlyThese=None, fromEncoding=None, 1081 | markupMassage=True, smartQuotesTo=XML_ENTITIES, 1082 | convertEntities=None, selfClosingTags=None, isHTML=False): 1083 | """The Soup object is initialized as the 'root tag', and the 1084 | provided markup (which can be a string or a file-like object) 1085 | is fed into the underlying parser. 1086 | 1087 | sgmllib will process most bad HTML, and the BeautifulSoup 1088 | class has some tricks for dealing with some HTML that kills 1089 | sgmllib, but Beautiful Soup can nonetheless choke or lose data 1090 | if your data uses self-closing tags or declarations 1091 | incorrectly. 1092 | 1093 | By default, Beautiful Soup uses regexes to sanitize input, 1094 | avoiding the vast majority of these problems. If the problems 1095 | don't apply to you, pass in False for markupMassage, and 1096 | you'll get better performance. 1097 | 1098 | The default parser massage techniques fix the two most common 1099 | instances of invalid HTML that choke sgmllib: 1100 | 1101 |
(No space between name of closing tag and tag close) 1102 | (Extraneous whitespace in declaration) 1103 | 1104 | You can pass in a custom list of (RE object, replace method) 1105 | tuples to get Beautiful Soup to scrub your input the way you 1106 | want.""" 1107 | 1108 | self.parseOnlyThese = parseOnlyThese 1109 | self.fromEncoding = fromEncoding 1110 | self.smartQuotesTo = smartQuotesTo 1111 | self.convertEntities = convertEntities 1112 | # Set the rules for how we'll deal with the entities we 1113 | # encounter 1114 | if self.convertEntities: 1115 | # It doesn't make sense to convert encoded characters to 1116 | # entities even while you're converting entities to Unicode. 1117 | # Just convert it all to Unicode. 1118 | self.smartQuotesTo = None 1119 | if convertEntities == self.HTML_ENTITIES: 1120 | self.convertXMLEntities = False 1121 | self.convertHTMLEntities = True 1122 | self.escapeUnrecognizedEntities = True 1123 | elif convertEntities == self.XHTML_ENTITIES: 1124 | self.convertXMLEntities = True 1125 | self.convertHTMLEntities = True 1126 | self.escapeUnrecognizedEntities = False 1127 | elif convertEntities == self.XML_ENTITIES: 1128 | self.convertXMLEntities = True 1129 | self.convertHTMLEntities = False 1130 | self.escapeUnrecognizedEntities = False 1131 | else: 1132 | self.convertXMLEntities = False 1133 | self.convertHTMLEntities = False 1134 | self.escapeUnrecognizedEntities = False 1135 | 1136 | self.instanceSelfClosingTags = buildTagMap(None, selfClosingTags) 1137 | SGMLParser.__init__(self) 1138 | 1139 | if hasattr(markup, 'read'): # It's a file-type object. 1140 | markup = markup.read() 1141 | self.markup = markup 1142 | self.markupMassage = markupMassage 1143 | try: 1144 | self._feed(isHTML=isHTML) 1145 | except StopParsing: 1146 | pass 1147 | self.markup = None # The markup can now be GCed 1148 | 1149 | def convert_charref(self, name): 1150 | """This method fixes a bug in Python's SGMLParser.""" 1151 | try: 1152 | n = int(name) 1153 | except ValueError: 1154 | return 1155 | if not 0 <= n <= 127 : # ASCII ends at 127, not 255 1156 | return 1157 | return self.convert_codepoint(n) 1158 | 1159 | def _feed(self, inDocumentEncoding=None, isHTML=False): 1160 | # Convert the document to Unicode. 1161 | markup = self.markup 1162 | if isinstance(markup, unicode): 1163 | if not hasattr(self, 'originalEncoding'): 1164 | self.originalEncoding = None 1165 | else: 1166 | dammit = UnicodeDammit\ 1167 | (markup, [self.fromEncoding, inDocumentEncoding], 1168 | smartQuotesTo=self.smartQuotesTo, isHTML=isHTML) 1169 | markup = dammit.unicode 1170 | self.originalEncoding = dammit.originalEncoding 1171 | self.declaredHTMLEncoding = dammit.declaredHTMLEncoding 1172 | if markup: 1173 | if self.markupMassage: 1174 | if not hasattr(self.markupMassage, "__iter__"): 1175 | self.markupMassage = self.MARKUP_MASSAGE 1176 | for fix, m in self.markupMassage: 1177 | markup = fix.sub(m, markup) 1178 | # TODO: We get rid of markupMassage so that the 1179 | # soup object can be deepcopied later on. Some 1180 | # Python installations can't copy regexes. If anyone 1181 | # was relying on the existence of markupMassage, this 1182 | # might cause problems. 1183 | del(self.markupMassage) 1184 | self.reset() 1185 | 1186 | SGMLParser.feed(self, markup) 1187 | # Close out any unfinished strings and close all the open tags. 
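# (A fragment like '<b>text' leaves 'text' buffered in currentData and
# 'b' on the tagStack; the two lines below flush and pop both.)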
1188 | self.endData() 1189 | while self.currentTag.name != self.ROOT_TAG_NAME: 1190 | self.popTag() 1191 | 1192 | def __getattr__(self, methodName): 1193 | """This method routes method call requests to either the SGMLParser 1194 | superclass or the Tag superclass, depending on the method name.""" 1195 | #print "__getattr__ called on %s.%s" % (self.__class__, methodName) 1196 | 1197 | if methodName.startswith('start_') or methodName.startswith('end_') \ 1198 | or methodName.startswith('do_'): 1199 | return SGMLParser.__getattr__(self, methodName) 1200 | elif not methodName.startswith('__'): 1201 | return Tag.__getattr__(self, methodName) 1202 | else: 1203 | raise AttributeError 1204 | 1205 | def isSelfClosingTag(self, name): 1206 | """Returns true iff the given string is the name of a 1207 | self-closing tag according to this parser.""" 1208 | return self.SELF_CLOSING_TAGS.has_key(name) \ 1209 | or self.instanceSelfClosingTags.has_key(name) 1210 | 1211 | def reset(self): 1212 | Tag.__init__(self, self, self.ROOT_TAG_NAME) 1213 | self.hidden = 1 1214 | SGMLParser.reset(self) 1215 | self.currentData = [] 1216 | self.currentTag = None 1217 | self.tagStack = [] 1218 | self.quoteStack = [] 1219 | self.pushTag(self) 1220 | 1221 | def popTag(self): 1222 | tag = self.tagStack.pop() 1223 | 1224 | #print "Pop", tag.name 1225 | if self.tagStack: 1226 | self.currentTag = self.tagStack[-1] 1227 | return self.currentTag 1228 | 1229 | def pushTag(self, tag): 1230 | #print "Push", tag.name 1231 | if self.currentTag: 1232 | self.currentTag.contents.append(tag) 1233 | self.tagStack.append(tag) 1234 | self.currentTag = self.tagStack[-1] 1235 | 1236 | def endData(self, containerClass=NavigableString): 1237 | if self.currentData: 1238 | currentData = u''.join(self.currentData) 1239 | if (currentData.translate(self.STRIP_ASCII_SPACES) == '' and 1240 | not set([tag.name for tag in self.tagStack]).intersection( 1241 | self.PRESERVE_WHITESPACE_TAGS)): 1242 | if '\n' in currentData: 1243 | currentData = '\n' 1244 | else: 1245 | currentData = ' ' 1246 | self.currentData = [] 1247 | if self.parseOnlyThese and len(self.tagStack) <= 1 and \ 1248 | (not self.parseOnlyThese.text or \ 1249 | not self.parseOnlyThese.search(currentData)): 1250 | return 1251 | o = containerClass(currentData) 1252 | o.setup(self.currentTag, self.previous) 1253 | if self.previous: 1254 | self.previous.next = o 1255 | self.previous = o 1256 | self.currentTag.contents.append(o) 1257 | 1258 | 1259 | def _popToTag(self, name, inclusivePop=True): 1260 | """Pops the tag stack up to and including the most recent 1261 | instance of the given tag. 
If inclusivePop is false, pops the tag
1262 |         stack up to but *not* including the most recent instance of
1263 |         the given tag."""
1264 |         #print "Popping to %s" % name
1265 |         if name == self.ROOT_TAG_NAME:
1266 |             return
1267 | 
1268 |         numPops = 0
1269 |         mostRecentTag = None
1270 |         for i in range(len(self.tagStack)-1, 0, -1):
1271 |             if name == self.tagStack[i].name:
1272 |                 numPops = len(self.tagStack)-i
1273 |                 break
1274 |         if not inclusivePop:
1275 |             numPops = numPops - 1
1276 | 
1277 |         for i in range(0, numPops):
1278 |             mostRecentTag = self.popTag()
1279 |         return mostRecentTag
1280 | 
1281 |     def _smartPop(self, name):
1282 | 
1283 |         """We need to pop up to the previous tag of this type, unless
1284 |         one of this tag's nesting reset triggers comes between this
1285 |         tag and the previous tag of this type, OR unless this tag is a
1286 |         generic nesting trigger and another generic nesting trigger
1287 |         comes between this tag and the previous tag of this type.
1288 | 
1289 |         Examples:
1290 |          <p>Foo<b>Bar *<p>* should pop to 'p', not 'b'.
1291 |          <p>Foo<table>Bar *<p>* should pop to 'table', not 'p'.
1292 |          <p>Foo<table><tr>Bar *<p>* should pop to 'tr', not 'p'.
1293 | 
1294 |          <li><ul><li> *<li>* should pop to 'ul', not the first 'li'.
1295 |          <tr><table><tr>
* * should pop to 'table', not the first 'tr' 1296 | * * should pop to 'tr', not the first 'td' 1297 | """ 1298 | 1299 | nestingResetTriggers = self.NESTABLE_TAGS.get(name) 1300 | isNestable = nestingResetTriggers != None 1301 | isResetNesting = self.RESET_NESTING_TAGS.has_key(name) 1302 | popTo = None 1303 | inclusive = True 1304 | for i in range(len(self.tagStack)-1, 0, -1): 1305 | p = self.tagStack[i] 1306 | if (not p or p.name == name) and not isNestable: 1307 | #Non-nestable tags get popped to the top or to their 1308 | #last occurance. 1309 | popTo = name 1310 | break 1311 | if (nestingResetTriggers is not None 1312 | and p.name in nestingResetTriggers) \ 1313 | or (nestingResetTriggers is None and isResetNesting 1314 | and self.RESET_NESTING_TAGS.has_key(p.name)): 1315 | 1316 | #If we encounter one of the nesting reset triggers 1317 | #peculiar to this tag, or we encounter another tag 1318 | #that causes nesting to reset, pop up to but not 1319 | #including that tag. 1320 | popTo = p.name 1321 | inclusive = False 1322 | break 1323 | p = p.parent 1324 | if popTo: 1325 | self._popToTag(popTo, inclusive) 1326 | 1327 | def unknown_starttag(self, name, attrs, selfClosing=0): 1328 | #print "Start tag %s: %s" % (name, attrs) 1329 | if self.quoteStack: 1330 | #This is not a real tag. 1331 | #print "<%s> is not real!" % name 1332 | attrs = ''.join([' %s="%s"' % (x, y) for x, y in attrs]) 1333 | self.handle_data('<%s%s>' % (name, attrs)) 1334 | return 1335 | self.endData() 1336 | 1337 | if not self.isSelfClosingTag(name) and not selfClosing: 1338 | self._smartPop(name) 1339 | 1340 | if self.parseOnlyThese and len(self.tagStack) <= 1 \ 1341 | and (self.parseOnlyThese.text or not self.parseOnlyThese.searchTag(name, attrs)): 1342 | return 1343 | 1344 | tag = Tag(self, name, attrs, self.currentTag, self.previous) 1345 | if self.previous: 1346 | self.previous.next = tag 1347 | self.previous = tag 1348 | self.pushTag(tag) 1349 | if selfClosing or self.isSelfClosingTag(name): 1350 | self.popTag() 1351 | if name in self.QUOTE_TAGS: 1352 | #print "Beginning quote (%s)" % name 1353 | self.quoteStack.append(name) 1354 | self.literal = 1 1355 | return tag 1356 | 1357 | def unknown_endtag(self, name): 1358 | #print "End tag %s" % name 1359 | if self.quoteStack and self.quoteStack[-1] != name: 1360 | #This is not a real end tag. 1361 | #print "%s> is not real!" % name 1362 | self.handle_data('%s>' % name) 1363 | return 1364 | self.endData() 1365 | self._popToTag(name) 1366 | if self.quoteStack and self.quoteStack[-1] == name: 1367 | self.quoteStack.pop() 1368 | self.literal = (len(self.quoteStack) > 0) 1369 | 1370 | def handle_data(self, data): 1371 | self.currentData.append(data) 1372 | 1373 | def _toStringSubclass(self, text, subclass): 1374 | """Adds a certain piece of text to the tree as a NavigableString 1375 | subclass.""" 1376 | self.endData() 1377 | self.handle_data(text) 1378 | self.endData(subclass) 1379 | 1380 | def handle_pi(self, text): 1381 | """Handle a processing instruction as a ProcessingInstruction 1382 | object, possibly one with a %SOUP-ENCODING% slot into which an 1383 | encoding will be plugged later.""" 1384 | if text[:3] == "xml": 1385 | text = u"xml version='1.0' encoding='%SOUP-ENCODING%'" 1386 | self._toStringSubclass(text, ProcessingInstruction) 1387 | 1388 | def handle_comment(self, text): 1389 | "Handle comments as Comment objects." 1390 | self._toStringSubclass(text, Comment) 1391 | 1392 | def handle_charref(self, ref): 1393 | "Handle character references as data." 
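# Effect of the convertEntities modes on character and entity references
# (input strings invented for the example):
#
#   >>> from BeautifulSoup import BeautifulStoneSoup as BSS
#   >>> str(BSS('&#38; &amp;', convertEntities=BSS.HTML_ENTITIES))
#   '& &'
#   >>> str(BSS('&#38; &amp;'))    # default: references pass through as data
#   '&#38; &amp;'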
1394 |         if self.convertEntities:
1395 |             data = unichr(int(ref))
1396 |         else:
1397 |             data = '&#%s;' % ref
1398 |         self.handle_data(data)
1399 | 
1400 |     def handle_entityref(self, ref):
1401 |         """Handle entity references as data, possibly converting known
1402 |         HTML and/or XML entity references to the corresponding Unicode
1403 |         characters."""
1404 |         data = None
1405 |         if self.convertHTMLEntities:
1406 |             try:
1407 |                 data = unichr(name2codepoint[ref])
1408 |             except KeyError:
1409 |                 pass
1410 | 
1411 |         if not data and self.convertXMLEntities:
1412 |             data = self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref)
1413 | 
1414 |         if not data and self.convertHTMLEntities and \
1415 |             not self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref):
1416 |                 # TODO: We've got a problem here. We're told this is
1417 |                 # an entity reference, but it's not an XML entity
1418 |                 # reference or an HTML entity reference. Nonetheless,
1419 |                 # the logical thing to do is to pass it through as an
1420 |                 # unrecognized entity reference.
1421 |                 #
1422 |                 # Except: when the input is "&carol;" this function
1423 |                 # will be called with input "carol". When the input is
1424 |                 # "AT&T", this function will be called with input
1425 |                 # "T". We have no way of knowing whether a semicolon
1426 |                 # was present originally, so we don't know whether
1427 |                 # this is an unknown entity or just a misplaced
1428 |                 # ampersand.
1429 |                 #
1430 |                 # The more common case is a misplaced ampersand, so I
1431 |                 # escape the ampersand and omit the trailing semicolon.
1432 |                 data = "&amp;%s" % ref
1433 |         if not data:
1434 |             # This case is different from the one above, because we
1435 |             # haven't already gone through a supposedly comprehensive
1436 |             # mapping of entities to Unicode characters. We might not
1437 |             # have gone through any mapping at all. So the chances are
1438 |             # very high that this is a real entity, and not a
1439 |             # misplaced ampersand.
1440 |             data = "&%s;" % ref
1441 |         self.handle_data(data)
1442 | 
1443 |     def handle_decl(self, data):
1444 |         "Handle DOCTYPEs and the like as Declaration objects."
1445 |         self._toStringSubclass(data, Declaration)
1446 | 
1447 |     def parse_declaration(self, i):
1448 |         """Treat a bogus SGML declaration as raw data. Treat a CDATA
1449 |         declaration as a CData object."""
1450 |         j = None
1451 |         if self.rawdata[i:i+9] == '<![CDATA[':
1452 |             k = self.rawdata.find(']]>', i)
1453 |             if k == -1:
1454 |                 k = len(self.rawdata)
1455 |             data = self.rawdata[i+9:k]
1456 |             j = k+3
1457 |             self._toStringSubclass(data, CData)
1458 |         else:
1459 |             try:
1460 |                 j = SGMLParser.parse_declaration(self, i)
1461 |             except SGMLParseError:
1462 |                 toHandle = self.rawdata[i:]
1463 |                 self.handle_data(toHandle)
1464 |                 j = i + len(toHandle)
1465 |         return j
1466 | 
1467 | class BeautifulSoup(BeautifulStoneSoup):
1468 | 
1469 |     """This parser knows the following facts about HTML:
1470 | 
1471 |     * Some tags have no closing tag and should be interpreted as being
1472 |       closed as soon as they are encountered.
1473 | 
1474 |     * The text inside some tags (ie. 'script') may contain tags which
1475 |       are not really part of the document and which should be parsed
1476 |       as text, not tags. If you want to parse the text as tags, you can
1477 |       always fetch it and parse it explicitly.
1478 | 
1479 |     * Tag nesting rules:
1480 | 
1481 |       Most tags can't be nested at all. For instance, the occurance of
1482 |       a <p> tag should implicitly close the previous <p> tag.
1483 | 
1484 |        <p>Para1<p>Para2
1485 |         should be transformed into:
1486 |        <p>Para1</p><p>Para2
1487 | 
1488 |       Some tags can be nested arbitrarily. For instance, the occurance
1489 |       of a <blockquote> tag should _not_ implicitly close the previous
1490 |       <blockquote> tag.
1491 | 
1492 |        Alice said: <blockquote>Bob said: <blockquote>Blah
1493 |         should NOT be transformed into:
1494 |        Alice said: <blockquote>Bob said: </blockquote><blockquote>Blah
1495 | 
1496 |       Some tags can be nested, but the nesting is reset by the
1497 |       interposition of other tags. For instance, a <tr> tag should
1498 |       implicitly close the previous <tr> tag within the same <table>,
1499 |       but not close a <tr> tag in another table.
1500 | 
1501 |        <table><tr>Blah<tr>Blah
1502 |         should be transformed into:
1503 |        <table><tr>Blah</tr><tr>Blah
1504 |        but,
1505 |        <tr>Blah<table><tr>Blah
1506 |         should NOT be transformed into
1507 |        <tr>Blah<table></tr><tr>
Blah 1508 | 1509 | Differing assumptions about tag nesting rules are a major source 1510 | of problems with the BeautifulSoup class. If BeautifulSoup is not 1511 | treating as nestable a tag your page author treats as nestable, 1512 | try ICantBelieveItsBeautifulSoup, MinimalSoup, or 1513 | BeautifulStoneSoup before writing your own subclass.""" 1514 | 1515 | def __init__(self, *args, **kwargs): 1516 | if not kwargs.has_key('smartQuotesTo'): 1517 | kwargs['smartQuotesTo'] = self.HTML_ENTITIES 1518 | kwargs['isHTML'] = True 1519 | BeautifulStoneSoup.__init__(self, *args, **kwargs) 1520 | 1521 | SELF_CLOSING_TAGS = buildTagMap(None, 1522 | ('br' , 'hr', 'input', 'img', 'meta', 1523 | 'spacer', 'link', 'frame', 'base', 'col')) 1524 | 1525 | PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea']) 1526 | 1527 | QUOTE_TAGS = {'script' : None, 'textarea' : None} 1528 | 1529 | #According to the HTML standard, each of these inline tags can 1530 | #contain another tag of the same type. Furthermore, it's common 1531 | #to actually use these tags this way. 1532 | NESTABLE_INLINE_TAGS = ('span', 'font', 'q', 'object', 'bdo', 'sub', 'sup', 1533 | 'center') 1534 | 1535 | #According to the HTML standard, these block tags can contain 1536 | #another tag of the same type. Furthermore, it's common 1537 | #to actually use these tags this way. 1538 | NESTABLE_BLOCK_TAGS = ('blockquote', 'div', 'fieldset', 'ins', 'del') 1539 | 1540 | #Lists can contain other lists, but there are restrictions. 1541 | NESTABLE_LIST_TAGS = { 'ol' : [], 1542 | 'ul' : [], 1543 | 'li' : ['ul', 'ol'], 1544 | 'dl' : [], 1545 | 'dd' : ['dl'], 1546 | 'dt' : ['dl'] } 1547 | 1548 | #Tables can contain other tables, but there are restrictions. 1549 | NESTABLE_TABLE_TAGS = {'table' : [], 1550 | 'tr' : ['table', 'tbody', 'tfoot', 'thead'], 1551 | 'td' : ['tr'], 1552 | 'th' : ['tr'], 1553 | 'thead' : ['table'], 1554 | 'tbody' : ['table'], 1555 | 'tfoot' : ['table'], 1556 | } 1557 | 1558 | NON_NESTABLE_BLOCK_TAGS = ('address', 'form', 'p', 'pre') 1559 | 1560 | #If one of these tags is encountered, all tags up to the next tag of 1561 | #this type are popped. 1562 | RESET_NESTING_TAGS = buildTagMap(None, NESTABLE_BLOCK_TAGS, 'noscript', 1563 | NON_NESTABLE_BLOCK_TAGS, 1564 | NESTABLE_LIST_TAGS, 1565 | NESTABLE_TABLE_TAGS) 1566 | 1567 | NESTABLE_TAGS = buildTagMap([], NESTABLE_INLINE_TAGS, NESTABLE_BLOCK_TAGS, 1568 | NESTABLE_LIST_TAGS, NESTABLE_TABLE_TAGS) 1569 | 1570 | # Used to detect the charset in a META tag; see start_meta 1571 | CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M) 1572 | 1573 | def start_meta(self, attrs): 1574 | """Beautiful Soup can detect a charset included in a META tag, 1575 | try to convert the document to that charset, and re-parse the 1576 | document from the beginning.""" 1577 | httpEquiv = None 1578 | contentType = None 1579 | contentTypeIndex = None 1580 | tagNeedsEncodingSubstitution = False 1581 | 1582 | for i in range(0, len(attrs)): 1583 | key, value = attrs[i] 1584 | key = key.lower() 1585 | if key == 'http-equiv': 1586 | httpEquiv = value 1587 | elif key == 'content': 1588 | contentType = value 1589 | contentTypeIndex = i 1590 | 1591 | if httpEquiv and contentType: # It's an interesting meta tag. 
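# (e.g. <meta http-equiv="Content-Type" content="text/html; charset=utf-8">;
# on a first pass the declared charset triggers a re-parse of the whole
# document via _feed, which is then cut short by StopParsing.)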
1592 | match = self.CHARSET_RE.search(contentType) 1593 | if match: 1594 | if (self.declaredHTMLEncoding is not None or 1595 | self.originalEncoding == self.fromEncoding): 1596 | # An HTML encoding was sniffed while converting 1597 | # the document to Unicode, or an HTML encoding was 1598 | # sniffed during a previous pass through the 1599 | # document, or an encoding was specified 1600 | # explicitly and it worked. Rewrite the meta tag. 1601 | def rewrite(match): 1602 | return match.group(1) + "%SOUP-ENCODING%" 1603 | newAttr = self.CHARSET_RE.sub(rewrite, contentType) 1604 | attrs[contentTypeIndex] = (attrs[contentTypeIndex][0], 1605 | newAttr) 1606 | tagNeedsEncodingSubstitution = True 1607 | else: 1608 | # This is our first pass through the document. 1609 | # Go through it again with the encoding information. 1610 | newCharset = match.group(3) 1611 | if newCharset and newCharset != self.originalEncoding: 1612 | self.declaredHTMLEncoding = newCharset 1613 | self._feed(self.declaredHTMLEncoding) 1614 | raise StopParsing 1615 | pass 1616 | tag = self.unknown_starttag("meta", attrs) 1617 | if tag and tagNeedsEncodingSubstitution: 1618 | tag.containsSubstitutions = True 1619 | 1620 | class StopParsing(Exception): 1621 | pass 1622 | 1623 | class ICantBelieveItsBeautifulSoup(BeautifulSoup): 1624 | 1625 | """The BeautifulSoup class is oriented towards skipping over 1626 | common HTML errors like unclosed tags. However, sometimes it makes 1627 | errors of its own. For instance, consider this fragment: 1628 | 1629 | FooBar 1630 | 1631 | This is perfectly valid (if bizarre) HTML. However, the 1632 | BeautifulSoup class will implicitly close the first b tag when it 1633 | encounters the second 'b'. It will think the author wrote 1634 | "FooBar", and didn't close the first 'b' tag, because 1635 | there's no real-world reason to bold something that's already 1636 | bold. When it encounters '' it will close two more 'b' 1637 | tags, for a grand total of three tags closed instead of two. This 1638 | can throw off the rest of your document structure. The same is 1639 | true of a number of other tags, listed below. 1640 | 1641 | It's much more common for someone to forget to close a 'b' tag 1642 | than to actually use nested 'b' tags, and the BeautifulSoup class 1643 | handles the common case. This class handles the not-co-common 1644 | case: where you can't believe someone wrote what they did, but 1645 | it's valid HTML and BeautifulSoup screwed up by assuming it 1646 | wouldn't be.""" 1647 | 1648 | I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = \ 1649 | ('em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong', 1650 | 'cite', 'code', 'dfn', 'kbd', 'samp', 'strong', 'var', 'b', 1651 | 'big') 1652 | 1653 | I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = ('noscript',) 1654 | 1655 | NESTABLE_TAGS = buildTagMap([], BeautifulSoup.NESTABLE_TAGS, 1656 | I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS, 1657 | I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS) 1658 | 1659 | class MinimalSoup(BeautifulSoup): 1660 | """The MinimalSoup class is for parsing HTML that contains 1661 | pathologically bad markup. It makes no assumptions about tag 1662 | nesting, but it does know which tags are self-closing, that 1663 |