├── r2e
├── r2e.bat
├── .gitignore
├── README.md
├── config.py.example
├── test_rss2email.py
├── CHANGELOG
├── readme.html
├── html2text.py
├── rss2email.py
└── BeautifulSoup.py
/r2e:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | python rss2email.py feeds.dat $*
3 |
--------------------------------------------------------------------------------
/r2e.bat:
--------------------------------------------------------------------------------
1 | @python rss2email.py feeds.dat %1 %2 %3 %4 %5 %6 %7 %8 %9
2 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | config.py
2 | temp/*
3 | Old/*
4 | feeds.dat
5 | *.pyc
6 | build/*
7 | *.sublime-project
8 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | This project is largely on hold. There is another, active rss2email project for Python 3 that is located at https://github.com/wking/rss2email
2 |
--------------------------------------------------------------------------------
/config.py.example:
--------------------------------------------------------------------------------
1 | ### Options for configuring rss2email ###
2 |
3 | # The email address messages are from by default:
4 | DEFAULT_FROM = "bozo@dev.null.invalid"
5 |
6 | # 1: Send text/html messages when possible.
7 | # 0: Convert HTML to plain text.
8 | HTML_MAIL = 1
9 |
10 | # 1: Only use the DEFAULT_FROM address.
11 | # 0: Use the email address specified by the feed, when possible.
12 | FORCE_FROM = 0
13 |
14 | # 1: Receive one email per post.
15 | # 0: Receive an email every time a post changes.
16 | TRUST_GUID = 1
17 |
18 | # 1: Generate Date header based on item's date, when possible.
19 | # 0: Generate Date header based on time sent.
20 | DATE_HEADER = 1
21 |
22 | # A tuple consisting of some combination of
23 | # ('issued', 'created', 'modified', 'expired')
24 | # expressing ordered list of preference in dates
25 | # to use for the Date header of the email.
26 | DATE_HEADER_ORDER = ('modified', 'issued', 'created')
27 |
28 | # 1: Apply Q-P conversion (required for some MUAs).
29 | # 0: Send message in 8-bits.
30 | # http://cr.yp.to/smtp/8bitmime.html
31 | #DEPRECATED
32 | QP_REQUIRED = 0
33 | #DEPRECATED
34 |
35 | # 1: Name feeds as they're being processed.
36 | # 0: Keep quiet.
37 | VERBOSE = 0
38 |
39 | # 1: Use the publisher's email if you can't find the author's.
40 | # 0: Just use the DEFAULT_FROM email instead.
41 | USE_PUBLISHER_EMAIL = 0
42 |
43 | # 1: Use SMTP_SERVER to send mail.
44 | # 0: Call /usr/sbin/sendmail to send mail.
45 | SMTP_SEND = 1
46 |
47 | SMTP_SERVER = "smtp.yourisp.net:25"
48 | AUTHREQUIRED = 0 # if you need to use SMTP AUTH set to 1
49 | SMTP_USER = 'username' # for SMTP AUTH, set SMTP username here
50 | SMTP_PASS = 'password' # for SMTP AUTH, set SMTP password here
51 |
52 | # Connect to the SMTP server using SSL
53 |
54 | SMTP_SSL = 0
55 |
56 |
57 |
58 | # Set this to add a bonus header to all emails (start with '\n').
59 | BONUS_HEADER = ''
60 | # Example: BONUS_HEADER = '\nApproved: joe@bob.org'
61 |
62 | # Set this to override From addresses. Keys are feed URLs, values are new titles.
63 | OVERRIDE_FROM = {}
64 |
65 | # Set this to override From email addresses. Keys are feed URLs, values are new emails.
66 |
67 | OVERRIDE_EMAIL = {}
68 |
69 |
70 |
71 | # Set this to default From email addresses. Keys are feed URLs, values are new email addresses.
72 |
73 | DEFAULT_EMAIL = {}
74 |
75 |
76 | # Only use the email from address rather than friendly name plus email address
77 |
78 | NO_FRIENDLY_NAME = 0
79 |
80 |
81 |
82 | # Set this to override the timeout (in seconds) for feed server response
83 | FEED_TIMEOUT = 60
84 |
85 | # Optional CSS styling
86 | USE_CSS_STYLING = 1
87 | STYLE_SHEET='h1 {font: 18pt Georgia, "Times New Roman";} body {font: 12pt Arial;} a:link {font: 12pt Arial; font-weight: bold; color: #0000cc} blockquote {font-family: monospace; } .header { background: #e0ecff; border-bottom: solid 4px #c3d9ff; padding: 5px; margin-top: 0px; color: red;} .header a { font-size: 20px; text-decoration: none; } .footer { background: #c3d9ff; border-top: solid 4px #c3d9ff; padding: 5px; margin-bottom: 0px; } #entry {border: solid 4px #c3d9ff; } #body { margin-left: 5px; margin-right: 5px; }'
88 |
89 | # If you have an HTTP Proxy set this in the format 'http://your.proxy.here:8080/'
90 | PROXY=""
91 |
92 | # To most correctly encode emails with international characters, we iterate through the list below and use the first character set that works
93 | # Eventually (and theoretically) ISO-8859-1 and UTF-8 are our catch-all failsafes
94 | CHARSET_LIST='US-ASCII', 'BIG5', 'ISO-2022-JP', 'ISO-8859-1', 'UTF-8'
95 |
--------------------------------------------------------------------------------
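For the three per-feed dictionaries above (OVERRIDE_FROM, OVERRIDE_EMAIL, DEFAULT_EMAIL), keys are feed URLs and values are the replacement title or address, as the comments describe. A minimal sketch of how such entries look in config.py — the feed URL and addresses here are placeholders, not real feeds:

    # hypothetical config.py fragment; replace the URL/addresses with your own
    OVERRIDE_FROM  = {'http://example.com/feed.xml': 'Example Weblog'}
    OVERRIDE_EMAIL = {'http://example.com/feed.xml': 'posts@example.com'}
    DEFAULT_EMAIL  = {'http://example.com/feed.xml': 'fallback@example.com'}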
/test_rss2email.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """Unit tests for rss2email.
3 |
4 | These tests make sure that rss2email works as it should. If you
5 | find a bug, the best way to express it is as a test
6 | case like this that fails."""
7 |
8 | import unittest
9 | from rss2email import *
10 | import rss2email
11 | import feedparser
12 |
13 | class test_validateEmail(unittest.TestCase):
14 |     """Tests for the validateEmail() helper."""
15 | def test_valid_email(self):
16 | email = validateEmail("valid@example.com", "planb@example.com")
17 | self.assertEqual(email, "valid@example.com")
18 |
19 | def test_no_mail_server(self):
20 |         email = validateEmail("invalid@", "planb@example.com")
21 | self.assertEqual(email, "planb@example.com")
22 |
23 | def test_no_email_name(self):
24 | email = validateEmail("@invalid", "planb@example.com")
25 | self.assertEqual(email, "planb@example.com")
26 |
27 | def test_no_at(self):
28 | email = validateEmail("invalid", "planb@example.com")
29 | self.assertEqual(email, "planb@example.com")
30 |
31 | class test_getName(unittest.TestCase):
32 |     """Tests for the getName() helper."""
33 | def setUp(self):
34 | self.feed = feedparser.parse("""
35 |
--------------------------------------------------------------------------------
/readme.html:
--------------------------------------------------------------------------------
We highly recommend that you subscribe to the rss2email project feed so you can keep up to date with the latest version, bugfixes and features: http://feeds.feedburner.com/allthingsrss/hJBr

Contents:
  * Instructions for Windows Users
  * Instructions for UNIX Users
  * Customizing rss2email

Instructions for Windows Users

Before you install rss2email, you'll need to make sure that a few things are in place. First, that a version of Python 2.x is installed. Second, determine your outgoing email server's address. That should be all you need.

Edit the config.py file and fill in your outgoing email server's details. If your server requires you to log in, change "AUTHREQUIRED = 0" to "AUTHREQUIRED = 1" and enter your email username and password.

From the command line, change to the folder you created. Now create a new feed database to send updates to your email address:

    r2e new you@yourdomain.com

Subscribe to some feeds:

    r2e add http://feeds.feedburner.com/allthingsrss/hJBr

That's the feed to be notified when there's a new version of rss2email. Repeat this for each feed you want to subscribe to.

When you run rss2email, it emails you about every story it hasn't seen before. But the first time you run it, that will be every story. To avoid this, you can ask rss2email not to send you any stories the first time you run it:

    r2e run --no-send

Then later, you can ask it to email you new stories:

    r2e run

If you get an error message "Sender domain must exist", add a line to config.py like this:

    DEFAULT_FROM = "rss2email@yoursite.com"

You can make the email address whatever you want, but your mail server requires that the yoursite.com part actually exists.

More than likely you will want rss2email to run automatically at a regular interval. Under Windows this can be easily accomplished using the Windows Task Scheduler. Just select r2e.bat as the program to run. Once you've created the task, double-click on it in the task list and change the Run entry so that "run" comes after r2e.bat. For example, if you installed rss2email in the C:\rss2email folder, then you would change the Run entry from "C:\rss2email\r2e.bat" to "C:\rss2email\r2e.bat run".

Now jump down to the section on customizing rss2email to your needs.

Upgrading to a new version: simply copy all of the files from the .ZIP package into your install directory EXCEPT config.py.

Instructions for UNIX Users

Before you install rss2email, you'll need to make sure that a few things are in place. First, that a version of Python 2.x is installed. Second, whether you have sendmail (or a compatible replacement like postfix) installed. If sendmail isn't installed, determine your outgoing email server's address. That should be all you need.

A quick way to get rss2email going is using pre-made packages; releases exist for Debian Linux, Ubuntu Linux and NetBSD. If you are unable to use these packages or you want the latest and greatest version, here's what you do:

    1. Unarchive (probably 'tar -xzf') the rss2email .tar.gz package to the folder where you want the rss2email files to live
    2. cd [yourfolder]
    3. chmod +x r2e

Create a new feed database with your target email address:

    ./r2e new you@yourdomain.com

Subscribe to some feeds:

    ./r2e add http://feeds.feedburner.com/allthingsrss/hJBr

That's the feed to be notified when there's a new version of rss2email. Repeat this for each feed you want to subscribe to.

When you run rss2email, it emails you about every story it hasn't seen before. But the first time you run it, that will be every story. To avoid this, you can ask rss2email not to send you any stories the first time you run it:

    ./r2e run --no-send

Then later, you can ask it to email you new stories:

    ./r2e run

You probably want to set things up so that this command is run repeatedly. (One good way is via a cron job.)

If you get an error message "Sender domain must exist", add a line to config.py like this:

    DEFAULT_FROM = "rss2email@yoursite.com"

You can make the email address whatever you want, but your mail server requires that the yoursite.com part actually exists.

Upgrading to a new version: simply copy all of the files from the .tar.gz package into your install directory EXCEPT config.py.

Customizing rss2email

There are a number of options, described in full at the top of the rss2email.py file, to customize the way rss2email behaves. If you want to change something, edit the config.py file. If you're not using rss2email under Windows, you'll have to create this file if it doesn't already exist.

For example, if you want to receive HTML mail, instead of having entries converted to plain text:

    HTML_MAIL = 1

To be notified every time a post changes, instead of just when it's first posted:

    TRUST_GUID = 0

And to make the emails look as if they were sent when the item was posted:

    DATE_HEADER = 1
--------------------------------------------------------------------------------
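Taken together, a config.py that applies all three of the customizations above would look like the sketch below; the values are just the readme's own examples, and any other option from config.py.example can be added the same way:

    # config.py -- sketch combining the readme's three examples
    HTML_MAIL = 1    # send text/html mail instead of converting to plain text
    TRUST_GUID = 0   # re-send a post whenever its content changes
    DATE_HEADER = 1  # date the mail from the item's own timestamp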
/html2text.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | """html2text: Turn HTML into equivalent Markdown-structured text."""
3 | __version__ = "3.01"
4 | __author__ = "Aaron Swartz (me@aaronsw.com)"
5 | __copyright__ = "(C) 2004-2008 Aaron Swartz. GNU GPL 3."
6 | __contributors__ = ["Martin 'Joey' Schulze", "Ricardo Reyes", "Kevin Jay North"]
7 |
8 | # TODO:
9 | # Support decoded entities with unifiable.
10 |
11 | try:
12 |     True
13 | except NameError:
14 |     setattr(__builtins__, 'True', 1)
15 |     setattr(__builtins__, 'False', 0)
16 |
17 | def has_key(x, y):
18 |     if hasattr(x, 'has_key'): return x.has_key(y)
19 |     else: return y in x
20 |
21 | try:
22 |     import htmlentitydefs
23 |     import urlparse
24 |     import HTMLParser
25 | except ImportError: #Python3
26 |     import html.entities as htmlentitydefs
27 |     import urllib.parse as urlparse
28 |     import html.parser as HTMLParser
29 | try: #Python3
30 |     import urllib.request as urllib
31 | except:
32 |     import urllib
33 | import re, sys, codecs, types
34 |
35 | try: from textwrap import wrap
36 | except: pass
37 |
38 | # Use Unicode characters instead of their ascii psuedo-replacements
39 | UNICODE_SNOB = 0
40 |
41 | # Put the links after each paragraph instead of at the end.
42 | LINKS_EACH_PARAGRAPH = 0
43 |
44 | # Wrap long lines at position. 0 for no wrapping. (Requires Python 2.3.)
45 | BODY_WIDTH = 78
46 |
47 | # Don't show internal links (href="#local-anchor") -- corresponding link targets
48 | # won't be visible in the plain text file anyway.
49 | SKIP_INTERNAL_LINKS = False
50 |
51 | ### Entity Nonsense ###
52 |
53 | def name2cp(k):
54 |     if k == 'apos': return ord("'")
55 |     if hasattr(htmlentitydefs, "name2codepoint"): # requires Python 2.3
56 |         return htmlentitydefs.name2codepoint[k]
57 |     else:
58 |         k = htmlentitydefs.entitydefs[k]
59 |         if k.startswith("&#") and k.endswith(";"): return int(k[2:-1]) # not in latin-1
60 |         return ord(codecs.latin_1_decode(k)[0])
61 |
62 | unifiable = {'rsquo':"'", 'lsquo':"'", 'rdquo':'"', 'ldquo':'"',
63 | 'copy':'(C)', 'mdash':'--', 'nbsp':' ', 'rarr':'->', 'larr':'<-', 'middot':'*',
64 | 'ndash':'-', 'oelig':'oe', 'aelig':'ae',
65 | 'agrave':'a', 'aacute':'a', 'acirc':'a', 'atilde':'a', 'auml':'a', 'aring':'a',
66 | 'egrave':'e', 'eacute':'e', 'ecirc':'e', 'euml':'e',
67 | 'igrave':'i', 'iacute':'i', 'icirc':'i', 'iuml':'i',
68 | 'ograve':'o', 'oacute':'o', 'ocirc':'o', 'otilde':'o', 'ouml':'o',
69 | 'ugrave':'u', 'uacute':'u', 'ucirc':'u', 'uuml':'u'}
70 |
71 | unifiable_n = {}
72 |
73 | for k in unifiable.keys():
74 |     unifiable_n[name2cp(k)] = unifiable[k]
75 |
76 | def charref(name):
77 |     if name[0] in ['x','X']:
78 |         c = int(name[1:], 16)
79 |     else:
80 |         c = int(name)
81 |
82 |     if not UNICODE_SNOB and c in unifiable_n.keys():
83 |         return unifiable_n[c]
84 |     else:
85 |         try:
86 |             return unichr(c)
87 |         except NameError: #Python3
88 |             return chr(c)
89 |
90 | def entityref(c):
91 |     if not UNICODE_SNOB and c in unifiable.keys():
92 |         return unifiable[c]
93 |     else:
94 |         try: name2cp(c)
95 |         except KeyError: return "&" + c + ';'
96 |         else:
97 |             try:
98 |                 return unichr(name2cp(c))
99 |             except NameError: #Python3
100 |                 return chr(name2cp(c))
101 |
102 | def replaceEntities(s):
103 |     s = s.group(1)
104 |     if s[0] == "#":
105 |         return charref(s[1:])
106 |     else: return entityref(s)
107 |
108 | r_unescape = re.compile(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));")
109 | def unescape(s):
110 |     return r_unescape.sub(replaceEntities, s)
111 |
112 | ### End Entity Nonsense ###
113 |
114 | def onlywhite(line):
115 |     """Return true if the line does only consist of whitespace characters."""
116 |     for c in line:
117 |         if c is not ' ' and c is not '  ':
118 |             return c is ' '
119 |     return line
120 |
121 | def optwrap(text):
122 |     """Wrap all paragraphs in the provided text."""
123 |     if not BODY_WIDTH:
124 |         return text
125 |
126 |     assert wrap, "Requires Python 2.3."
127 |     result = ''
128 |     newlines = 0
129 |     for para in text.split("\n"):
130 |         if len(para) > 0:
131 |             if para[0] is not ' ' and para[0] is not '-' and para[0] is not '*':
132 |                 for line in wrap(para, BODY_WIDTH):
133 |                     result += line + "\n"
134 |                 result += "\n"
135 |                 newlines = 2
136 |             else:
137 |                 if not onlywhite(para):
138 |                     result += para + "\n"
139 |                     newlines = 1
140 |         else:
141 |             if newlines < 2:
142 |                 result += "\n"
143 |                 newlines += 1
144 |     return result
145 |
146 | def hn(tag):
147 |     if tag[0] == 'h' and len(tag) == 2:
148 |         try:
149 |             n = int(tag[1])
150 |             if n in range(1, 10): return n
151 |         except ValueError: return 0
152 |
153 | class _html2text(HTMLParser.HTMLParser):
154 |     def __init__(self, out=None, baseurl=''):
155 |         HTMLParser.HTMLParser.__init__(self)
156 |
157 |         if out is None: self.out = self.outtextf
158 |         else: self.out = out
159 |         try:
160 |             self.outtext = unicode()
161 |         except NameError: # Python3
162 |             self.outtext = str()
163 |         self.quiet = 0
164 |         self.p_p = 0
165 |         self.outcount = 0
166 |         self.start = 1
167 |         self.space = 0
168 |         self.a = []
169 |         self.astack = []
170 |         self.acount = 0
171 |         self.list = []
172 |         self.blockquote = 0
173 |         self.pre = 0
174 |         self.startpre = 0
175 |         self.lastWasNL = 0
176 |         self.abbr_title = None # current abbreviation definition
177 |         self.abbr_data = None # last inner HTML (for abbr being defined)
178 |         self.abbr_list = {} # stack of abbreviations to write later
179 |         self.baseurl = baseurl
180 |
181 |     def outtextf(self, s):
182 |         self.outtext += s
183 |
184 |     def close(self):
185 |         HTMLParser.HTMLParser.close(self)
186 |
187 |         self.pbr()
188 |         self.o('', 0, 'end')
189 |
190 |         return self.outtext
191 |
192 |     def handle_charref(self, c):
193 |         self.o(charref(c))
194 |
195 |     def handle_entityref(self, c):
196 |         self.o(entityref(c))
197 |
198 |     def handle_starttag(self, tag, attrs):
199 |         self.handle_tag(tag, attrs, 1)
200 |
201 |     def handle_endtag(self, tag):
202 |         self.handle_tag(tag, None, 0)
203 |
204 |     def previousIndex(self, attrs):
205 |         """ returns the index of certain set of attributes (of a link) in the
206 |         self.a list
207 |
208 |         If the set of attributes is not found, returns None
209 |         """
210 |         if not has_key(attrs, 'href'): return None
211 |
212 |         i = -1
213 |         for a in self.a:
214 |             i += 1
215 |             match = 0
216 |
217 |             if has_key(a, 'href') and a['href'] == attrs['href']:
218 |                 if has_key(a, 'title') or has_key(attrs, 'title'):
219 |                     if (has_key(a, 'title') and has_key(attrs, 'title') and
220 |                         a['title'] == attrs['title']):
221 |                         match = True
222 |                 else:
223 |                     match = True
224 |
225 |             if match: return i
226 |
227 |     def handle_tag(self, tag, attrs, start):
228 |         #attrs = fixattrs(attrs)
229 |
230 |         if hn(tag):
231 |             self.p()
232 |             if start: self.o(hn(tag)*"#" + ' ')
233 |
234 |         if tag in ['p', 'div']: self.p()
235 |
236 |         if tag == "br" and start: self.o("  \n")
237 |
238 |         if tag == "hr" and start:
239 |             self.p()
240 |             self.o("* * *")
241 |             self.p()
242 |
243 |         if tag in ["head", "style", 'script']:
244 |             if start: self.quiet += 1
245 |             else: self.quiet -= 1
246 |
247 |         if tag in ["body"]:
248 |             self.quiet = 0 # sites like 9rules.com never close <head>
249 |
250 |         if tag == "blockquote":
251 |             if start:
252 |                 self.p(); self.o('> ', 0, 1); self.start = 1
253 |                 self.blockquote += 1
254 |             else:
255 |                 self.blockquote -= 1
256 |                 self.p()
257 |
258 |         if tag in ['em', 'i', 'u']: self.o("_")
259 |         if tag in ['strong', 'b']: self.o("**")
260 |         if tag == "code" and not self.pre: self.o('`') #TODO: `` `this` ``
261 |         if tag == "abbr":
262 |             if start:
263 |                 attrsD = {}
264 |                 for (x, y) in attrs: attrsD[x] = y
265 |                 attrs = attrsD
266 |
267 |                 self.abbr_title = None
268 |                 self.abbr_data = ''
269 |                 if has_key(attrs, 'title'):
270 |                     self.abbr_title = attrs['title']
271 |             else:
272 |                 if self.abbr_title != None:
273 |                     self.abbr_list[self.abbr_data] = self.abbr_title
274 |                     self.abbr_title = None
275 |                 self.abbr_data = ''
276 |
277 |         if tag == "a":
278 |             if start:
279 |                 attrsD = {}
280 |                 for (x, y) in attrs: attrsD[x] = y
281 |                 attrs = attrsD
282 |                 if has_key(attrs, 'href') and not (SKIP_INTERNAL_LINKS and attrs['href'].startswith('#')):
283 |                     self.astack.append(attrs)
284 |                     self.o("[")
285 |                 else:
286 |                     self.astack.append(None)
287 |             else:
288 |                 if self.astack:
289 |                     a = self.astack.pop()
290 |                     if a:
291 |                         i = self.previousIndex(a)
292 |                         if i is not None:
293 |                             a = self.a[i]
294 |                         else:
295 |                             self.acount += 1
296 |                             a['count'] = self.acount
297 |                             a['outcount'] = self.outcount
298 |                             self.a.append(a)
299 |                         self.o("][" + str(a['count']) + "]")
300 |
301 |         if tag == "img" and start:
302 |             attrsD = {}
303 |             for (x, y) in attrs: attrsD[x] = y
304 |             attrs = attrsD
305 |             if has_key(attrs, 'src'):
306 |                 attrs['href'] = attrs['src']
307 |                 alt = attrs.get('alt', '')
308 |                 i = self.previousIndex(attrs)
309 |                 if i is not None:
310 |                     attrs = self.a[i]
311 |                 else:
312 |                     self.acount += 1
313 |                     attrs['count'] = self.acount
314 |                     attrs['outcount'] = self.outcount
315 |                     self.a.append(attrs)
316 |                 self.o("![")
317 |                 self.o(alt)
318 |                 self.o("]["+ str(attrs['count']) +"]")
319 |
320 |         if tag == 'dl' and start: self.p()
321 |         if tag == 'dt' and not start: self.pbr()
322 |         if tag == 'dd' and start: self.o('    ')
323 |         if tag == 'dd' and not start: self.pbr()
324 |
325 |         if tag in ["ol", "ul"]:
326 |             if start:
327 |                 self.list.append({'name':tag, 'num':0})
328 |             else:
329 |                 if self.list: self.list.pop()
330 |
331 |             self.p()
332 |
333 |         if tag == 'li':
334 |             if start:
335 |                 self.pbr()
336 |                 if self.list: li = self.list[-1]
337 |                 else: li = {'name':'ul', 'num':0}
338 |                 self.o("  "*len(self.list)) #TODO: line up <ol><li>s > 9 correctly.
339 |                 if li['name'] == "ul": self.o("* ")
340 |                 elif li['name'] == "ol":
341 |                     li['num'] += 1
342 |                     self.o(str(li['num'])+". ")
343 |                 self.start = 1
344 |             else:
345 |                 self.pbr()
346 |
347 |         if tag in ["table", "tr"] and start: self.p()
348 |         if tag == 'td': self.pbr()
349 |
350 |         if tag == "pre":
351 |             if start:
352 |                 self.startpre = 1
353 |                 self.pre = 1
354 |             else:
355 |                 self.pre = 0
356 |             self.p()
357 |
358 |     def pbr(self):
359 |         if self.p_p == 0: self.p_p = 1
360 |
361 |     def p(self): self.p_p = 2
362 |
363 |     def o(self, data, puredata=0, force=0):
364 |         if self.abbr_data is not None: self.abbr_data += data
365 |
366 |         if not self.quiet:
367 |             if puredata and not self.pre:
368 |                 data = re.sub('\s+', ' ', data)
369 |                 if data and data[0] == ' ':
370 |                     self.space = 1
371 |                     data = data[1:]
372 |             if not data and not force: return
373 |
374 |             if self.startpre:
375 |                 #self.out(" :") #TODO: not output when already one there
376 |                 self.startpre = 0
377 |
378 |             bq = (">" * self.blockquote)
379 |             if not (force and data and data[0] == ">") and self.blockquote: bq += " "
380 |
381 |             if self.pre:
382 |                 bq += "    "
383 |                 data = data.replace("\n", "\n"+bq)
384 |
385 |             if self.start:
386 |                 self.space = 0
387 |                 self.p_p = 0
388 |                 self.start = 0
389 |
390 |             if force == 'end':
391 |                 # It's the end.
392 |                 self.p_p = 0
393 |                 self.out("\n")
394 |                 self.space = 0
395 |
396 |
397 |             if self.p_p:
398 |                 self.out(('\n'+bq)*self.p_p)
399 |                 self.space = 0
400 |
401 |             if self.space:
402 |                 if not self.lastWasNL: self.out(' ')
403 |                 self.space = 0
404 |
405 |             if self.a and ((self.p_p == 2 and LINKS_EACH_PARAGRAPH) or force == "end"):
406 |                 if force == "end": self.out("\n")
407 |
408 |                 newa = []
409 |                 for link in self.a:
410 |                     if self.outcount > link['outcount']:
411 |                         self.out("   ["+ str(link['count']) +"]: " + urlparse.urljoin(self.baseurl, link['href']))
412 |                         if has_key(link, 'title'): self.out(" ("+link['title']+")")
413 |                         self.out("\n")
414 |                     else:
415 |                         newa.append(link)
416 |
417 |                 if self.a != newa: self.out("\n") # Don't need an extra line when nothing was done.
418 |
419 |                 self.a = newa
420 |
421 |             if self.abbr_list and force == "end":
422 |                 for abbr, definition in self.abbr_list.items():
423 |                     self.out("  *[" + abbr + "]: " + definition + "\n")
424 |
425 |             self.p_p = 0
426 |             self.out(data)
427 |             self.lastWasNL = data and data[-1] == '\n'
428 |             self.outcount += 1
429 |
430 |     def handle_data(self, data):
431 |         if r'\/script>' in data: self.quiet -= 1
432 |         self.o(data, 1)
433 |
434 |     def unknown_decl(self, data): pass
435 |
436 | def wrapwrite(text): sys.stdout.write(text)
437 |
438 | def html2text_file(html, out=wrapwrite, baseurl=''):
439 |     h = _html2text(out, baseurl)
440 |     h.feed(html)
441 |     h.feed("")
442 |     return h.close()
443 |
444 | def html2text(html, baseurl=''):
445 |     return optwrap(html2text_file(html, None, baseurl))
446 |
447 | if __name__ == "__main__":
448 |     baseurl = ''
449 |     if sys.argv[1:]:
450 |         arg = sys.argv[1]
451 |         if arg.startswith('http://') or arg.startswith('https://'):
452 |             baseurl = arg
453 |             j = urllib.urlopen(baseurl)
454 |             try:
455 |                 from feedparser import _getCharacterEncoding as enc
456 |             except ImportError:
457 |                 enc = lambda x, y: ('utf-8', 1)
458 |             text = j.read()
459 |             encoding = enc(j.headers, text)[0]
460 |             if encoding == 'us-ascii': encoding = 'utf-8'
461 |             data = text.decode(encoding)
462 |
463 |         else:
464 |             encoding = 'utf8'
465 |             if len(sys.argv) > 2:
466 |                 encoding = sys.argv[2]
467 |             try: #Python3
468 |                 data = open(arg, 'r', encoding=encoding).read()
469 |             except TypeError:
470 |                 data = open(arg, 'r').read().decode(encoding)
471 |     else:
472 |         data = sys.stdin.read()
473 |     wrapwrite(html2text(data, baseurl))
474 |
--------------------------------------------------------------------------------
/rss2email.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | """rss2email: get RSS feeds emailed to you
3 | http://rss2email.infogami.com
4 |
5 | Usage:
6 |   new [emailaddress] (create new feedfile)
7 |   email newemailaddress (update default email)
8 |   run [--no-send] [num]
9 |   add feedurl [emailaddress]
10 |   list
11 |   reset
12 |   delete n
13 |   pause n
14 |   unpause n
15 |   opmlexport
16 |   opmlimport filename
17 | """
18 | __version__ = "2.72"
19 | __author__ = "Lindsey Smith (lindsey@allthingsrss.com)"
20 | __copyright__ = "(C) 2004 Aaron Swartz. GNU GPL 2 or 3."
21 | ___contributors__ = ["Dean Jackson", "Brian Lalor", "Joey Hess",
22 |     "Matej Cepl", "Martin 'Joey' Schulze",
23 |     "Marcel Ackermann (http://www.DreamFlasher.de)",
24 |     "Lindsey Smith (maintainer)", "Erik Hetzner", "Aaron Swartz (original author)" ]
25 |
26 | import urllib2
27 | import BeautifulSoup
28 | urllib2.install_opener(urllib2.build_opener())
29 |
30 | ### Vaguely Customizable Options ###
31 |
32 | # The email address messages are from by default:
33 | DEFAULT_FROM = "bozo@dev.null.invalid"
34 |
35 | # 1: Send text/html messages when possible.
36 | # 0: Convert HTML to plain text.
37 | HTML_MAIL = 0
38 |
39 | # 1: Only use the DEFAULT_FROM address.
40 | # 0: Use the email address specified by the feed, when possible.
41 | FORCE_FROM = 0
42 |
43 | # 1: Receive one email per post.
44 | # 0: Receive an email every time a post changes.
45 | TRUST_GUID = 1
46 |
47 | # 1: Generate Date header based on item's date, when possible.
48 | # 0: Generate Date header based on time sent.
49 | DATE_HEADER = 0
50 |
51 | # A tuple consisting of some combination of
52 | # ('issued', 'created', 'modified', 'expired')
53 | # expressing ordered list of preference in dates
54 | # to use for the Date header of the email.
55 | DATE_HEADER_ORDER = ('modified', 'issued', 'created')
56 |
57 | # 1: Apply Q-P conversion (required for some MUAs).
58 | # 0: Send message in 8-bits.
59 | # http://cr.yp.to/smtp/8bitmime.html
60 | #DEPRECATED
61 | QP_REQUIRED = 0
62 | #DEPRECATED
63 |
64 | # 1: Name feeds as they're being processed.
65 | # 0: Keep quiet.
66 | VERBOSE = 0
67 |
68 | # 1: Use the publisher's email if you can't find the author's.
69 | # 0: Just use the DEFAULT_FROM email instead.
70 | USE_PUBLISHER_EMAIL = 0
71 |
72 | # 1: Use SMTP_SERVER to send mail.
73 | # 0: Call /usr/sbin/sendmail to send mail.
74 | SMTP_SEND = 0
75 |
76 | SMTP_SERVER = "smtp.yourisp.net:25"
77 | AUTHREQUIRED = 0 # if you need to use SMTP AUTH set to 1
78 | SMTP_USER = 'username' # for SMTP AUTH, set SMTP username here
79 | SMTP_PASS = 'password' # for SMTP AUTH, set SMTP password here
80 |
81 | # Connect to the SMTP server using SSL
82 | SMTP_SSL = 0
83 |
84 | # Set this to add a bonus header to all emails (start with '\n').
85 | BONUS_HEADER = ''
86 | # Example: BONUS_HEADER = '\nApproved: joe@bob.org'
87 |
88 | # Set this to override From addresses. Keys are feed URLs, values are new titles.
89 | OVERRIDE_FROM = {}
90 |
91 | # Set this to override From email addresses. Keys are feed URLs, values are new emails.
92 | OVERRIDE_EMAIL = {}
93 |
94 | # Set this to default From email addresses. Keys are feed URLs, values are new email addresses.
95 | DEFAULT_EMAIL = {}
96 |
97 | # Only use the email from address rather than friendly name plus email address
98 | NO_FRIENDLY_NAME = 0
99 |
100 | # Set this to override the timeout (in seconds) for feed server response
101 | FEED_TIMEOUT = 60
102 |
103 | # Optional CSS styling
104 | USE_CSS_STYLING = 0
105 | STYLE_SHEET='h1 {font: 18pt Georgia, "Times New Roman";} body {font: 12pt Arial;} a:link {font: 12pt Arial; font-weight: bold; color: #0000cc} blockquote {font-family: monospace; } .header { background: #e0ecff; border-bottom: solid 4px #c3d9ff; padding: 5px; margin-top: 0px; color: red;} .header a { font-size: 20px; text-decoration: none; } .footer { background: #c3d9ff; border-top: solid 4px #c3d9ff; padding: 5px; margin-bottom: 0px; } #entry {border: solid 4px #c3d9ff; } #body { margin-left: 5px; margin-right: 5px; }'
106 |
107 | # If you have an HTTP Proxy set this in the format 'http://your.proxy.here:8080/'
108 | PROXY=""
109 |
110 | # To most correctly encode emails with international characters, we iterate through the list below and use the first character set that works
111 | # Eventually (and theoretically) ISO-8859-1 and UTF-8 are our catch-all failsafes
112 | CHARSET_LIST='US-ASCII', 'BIG5', 'ISO-2022-JP', 'ISO-8859-1', 'UTF-8'
113 |
114 | from email.MIMEText import MIMEText
115 | from email.Header import Header
116 | from email.Utils import parseaddr, formataddr
117 |
118 | # Note: You can also override the send function.
119 |
120 | def send(sender, recipient, subject, body, contenttype, extraheaders=None, smtpserver=None):
121 |     """Send an email.
122 |
123 |     All arguments should be Unicode strings (plain ASCII works as well).
124 |
125 |     Only the real name part of sender and recipient addresses may contain
126 |     non-ASCII characters.
127 |
128 |     The email will be properly MIME encoded and delivered through SMTP to
129 |     localhost port 25. This is easy to change if you want something different.
130 |
131 |     The charset of the email will be the first one out of the list
132 |     that can represent all the characters occurring in the email.
133 |     """
134 |
135 |     # Header class is smart enough to try US-ASCII, then the charset we
136 |     # provide, then fall back to UTF-8.
137 |     header_charset = 'ISO-8859-1'
138 |
139 |     # We must choose the body charset manually
140 |     for body_charset in CHARSET_LIST:
141 |         try:
142 |             body.encode(body_charset)
143 |         except (UnicodeError, LookupError):
144 |             pass
145 |         else:
146 |             break
147 |
148 |     # Split real name (which is optional) and email address parts
149 |     sender_name, sender_addr = parseaddr(sender)
150 |     recipient_name, recipient_addr = parseaddr(recipient)
151 |
152 |     # We must always pass Unicode strings to Header, otherwise it will
153 |     # use RFC 2047 encoding even on plain ASCII strings.
154 |     sender_name = str(Header(unicode(sender_name), header_charset))
155 |     recipient_name = str(Header(unicode(recipient_name), header_charset))
156 |
157 |     # Make sure email addresses do not contain non-ASCII characters
158 |     sender_addr = sender_addr.encode('ascii')
159 |     recipient_addr = recipient_addr.encode('ascii')
160 |
161 |     # Create the message ('plain' stands for Content-Type: text/plain)
162 |     msg = MIMEText(body.encode(body_charset), contenttype, body_charset)
163 |     msg['To'] = formataddr((recipient_name, recipient_addr))
164 |     msg['Subject'] = Header(unicode(subject), header_charset)
165 |     for hdr in extraheaders.keys():
166 |         try:
167 |             msg[hdr] = Header(unicode(extraheaders[hdr], header_charset))
168 |         except:
169 |             msg[hdr] = Header(extraheaders[hdr])
170 |
171 |     fromhdr = formataddr((sender_name, sender_addr))
172 |     msg['From'] = fromhdr
173 |
174 |     msg_as_string = msg.as_string()
175 | #DEPRECATED     if QP_REQUIRED:
176 | #DEPRECATED         ins, outs = SIO(msg_as_string), SIO()
177 | #DEPRECATED         mimify.mimify(ins, outs)
178 | #DEPRECATED         msg_as_string = outs.getvalue()
179 |
180 |     if SMTP_SEND:
181 |         if not smtpserver:
182 |             import smtplib
183 |
184 |             try:
185 |                 if SMTP_SSL:
186 |                     smtpserver = smtplib.SMTP_SSL()
187 |                 else:
188 |                     smtpserver = smtplib.SMTP()
189 |                 smtpserver.connect(SMTP_SERVER)
190 |             except KeyboardInterrupt:
191 |                 raise
192 |             except Exception, e:
193 |                 print >>warn, ""
194 |                 print >>warn, ('Fatal error: could not connect to mail server "%s"' % SMTP_SERVER)
195 |                 print >>warn, ('Check your config.py file to confirm that SMTP_SERVER and other mail server settings are configured properly')
196 |                 if hasattr(e, 'reason'):
197 |                     print >>warn, "Reason:", e.reason
198 |                 sys.exit(1)
199 |
200 |             if AUTHREQUIRED:
201 |                 try:
202 |                     smtpserver.ehlo()
203 |                     if not SMTP_SSL: smtpserver.starttls()
204 |                     smtpserver.ehlo()
205 |                     smtpserver.login(SMTP_USER, SMTP_PASS)
206 |                 except KeyboardInterrupt:
207 |                     raise
208 |                 except Exception, e:
209 |                     print >>warn, ""
210 |                     print >>warn, ('Fatal error: could not authenticate with mail server "%s" as user "%s"' % (SMTP_SERVER, SMTP_USER))
211 |                     print >>warn, ('Check your config.py file to confirm that SMTP_SERVER and other mail server settings are configured properly')
212 |                     if hasattr(e, 'reason'):
213 |                         print >>warn, "Reason:", e.reason
214 |                     sys.exit(1)
215 |
216 |         smtpserver.sendmail(sender, recipient, msg_as_string)
217 |         return smtpserver
218 |
219 |     else:
220 |         try:
221 |             p = subprocess.Popen(["/usr/sbin/sendmail", recipient], stdin=subprocess.PIPE, stdout=subprocess.PIPE)
222 |             p.communicate(msg_as_string)
223 |             status = p.returncode
224 |             assert status != None, "just a sanity check"
225 |             if status != 0:
226 |                 print >>warn, ""
227 |                 print >>warn, ('Fatal error: sendmail exited with code %s' % status)
228 |                 sys.exit(1)
229 |         except:
230 |             print '''Error attempting to send email via sendmail. Possibly you need to configure your config.py to use a SMTP server? Please refer to the rss2email documentation or website (http://rss2email.infogami.com) for complete documentation of config.py. The options below may suffice for configuring email:
231 | # 1: Use SMTP_SERVER to send mail.
232 | # 0: Call /usr/sbin/sendmail to send mail.
233 | SMTP_SEND = 0
234 |
235 | SMTP_SERVER = "smtp.yourisp.net:25"
236 | AUTHREQUIRED = 0 # if you need to use SMTP AUTH set to 1
237 | SMTP_USER = 'username' # for SMTP AUTH, set SMTP username here
238 | SMTP_PASS = 'password' # for SMTP AUTH, set SMTP password here
239 | '''
240 |             sys.exit(1)
241 |         return None
242 |
243 | ## html2text options ##
244 |
245 | # Use Unicode characters instead of their ascii psuedo-replacements
246 | UNICODE_SNOB = 0
247 |
248 | # Put the links after each paragraph instead of at the end.
249 | LINKS_EACH_PARAGRAPH = 0
250 |
251 | # Wrap long lines at position. 0 for no wrapping. (Requires Python 2.3.)
252 | BODY_WIDTH = 0
253 |
254 | ### Load the Options ###
255 |
256 | # Read options from config file if present.
257 | import sys
258 | sys.path.insert(0,".")
259 | try:
260 |     from config import *
261 | except:
262 |     pass
263 |
264 | warn = sys.stderr
265 |
266 | if QP_REQUIRED:
267 |     print >>warn, "QP_REQUIRED has been deprecated in rss2email."
268 |
269 | ### Import Modules ###
270 |
271 | import cPickle as pickle, time, os, traceback, sys, types, subprocess
272 | hash = ()
273 | try:
274 |     import hashlib
275 |     hash = hashlib.md5
276 | except ImportError:
277 |     import md5
278 |     hash = md5.new
279 |
280 | unix = 0
281 | try:
282 |     import fcntl
283 |     # A pox on SunOS file locking methods
284 |     if (sys.platform.find('sunos') == -1):
285 |         unix = 1
286 | except:
287 |     pass
288 |
289 | import socket; socket_errors = []
290 | for e in ['error', 'gaierror']:
291 |     if hasattr(socket, e): socket_errors.append(getattr(socket, e))
292 |
293 | #DEPRECATED import mimify
294 | #DEPRECATED from StringIO import StringIO as SIO
295 | #DEPRECATED mimify.CHARSET = 'utf-8'
296 |
297 | import feedparser
298 | feedparser.USER_AGENT = "rss2email/"+__version__+ " +http://www.allthingsrss.com/rss2email/"
299 | feedparser.SANITIZE_HTML = 0
300 |
301 | import html2text as h2t
302 |
303 | h2t.UNICODE_SNOB = UNICODE_SNOB
304 | h2t.LINKS_EACH_PARAGRAPH = LINKS_EACH_PARAGRAPH
305 | h2t.BODY_WIDTH = BODY_WIDTH
306 | html2text = h2t.html2text
307 |
308 | from types import *
309 |
310 | ### Utility Functions ###
311 |
312 | import threading
313 | class TimeoutError(Exception): pass
314 |
315 | class InputError(Exception): pass
316 |
317 | def timelimit(timeout, function):
318 | #    def internal(function):
319 |     def internal2(*args, **kw):
320 |         """
321 |         from http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/473878
322 |         """
323 |         class Calculator(threading.Thread):
324 |             def __init__(self):
325 |                 threading.Thread.__init__(self)
326 |                 self.result = None
327 |                 self.error = None
328 |
329 |             def run(self):
330 |                 try:
331 |                     self.result = function(*args, **kw)
332 |                 except:
333 |                     self.error = sys.exc_info()
334 |
335 |         c = Calculator()
336 |         c.setDaemon(True) # don't hold up exiting
337 |         c.start()
338 |         c.join(timeout)
339 |         if c.isAlive():
340 |             raise TimeoutError
341 |         if c.error:
342 |             raise c.error[0], c.error[1]
343 |         return c.result
344 |     return internal2
345 | #    return internal
346 |
347 |
348 | def isstr(f): return isinstance(f, type('')) or isinstance(f, type(u''))
349 | def ishtml(t): return type(t) is type(())
350 | def contains(a,b): return a.find(b) != -1
351 | def unu(s): # I / freakin' hate / that unicode
352 |     if type(s) is types.UnicodeType: return s.encode('utf-8')
353 |     else: return s
354 |
355 | ### Parsing Utilities ###
356 |
357 | def getContent(entry, HTMLOK=0):
358 |     """Select the best content from an entry, deHTMLizing if necessary.
359 |     If raw HTML is best, an ('HTML', best) tuple is returned. """
360 |
361 |     # How this works:
362 |     #  * We have a bunch of potential contents.
363 |     #  * We go thru looking for our first choice.
364 |     #    (HTML or text, depending on HTMLOK)
365 |     #  * If that doesn't work, we go thru looking for our second choice.
366 |     #  * If that still doesn't work, we just take the first one.
367 |     #
368 |     # Possible future improvement:
369 |     #  * Instead of just taking the first one
370 |     #    pick the one in the "best" language.
371 |     #  * HACK: hardcoded HTMLOK, should take a tuple of media types
372 |
373 |     conts = entry.get('content', [])
374 |
375 |     if entry.get('summary_detail', {}):
376 |         conts += [entry.summary_detail]
377 |
378 |     if conts:
379 |         if HTMLOK:
380 |             for c in conts:
381 |                 if contains(c.type, 'html'): return ('HTML', c.value)
382 |
383 |         if not HTMLOK: # Only need to convert to text if HTML isn't OK
384 |             for c in conts:
385 |                 if contains(c.type, 'html'):
386 |                     cleanerhtml = BeautifulSoup.BeautifulSoup(c.value)
387 |                     return html2text(unicode(cleanerhtml))
388 |
389 |         for c in conts:
390 |             if c.type == 'text/plain': return c.value
391 |
392 |         return conts[0].value
393 |
394 |     return ""
395 |
396 | def getID(entry):
397 |     """Get best ID from an entry.
398 |     NEEDS UNIT TESTS"""
399 |     if TRUST_GUID:
400 |         if 'id' in entry and entry.id:
401 |             # Newer versions of feedparser could return a dictionary
402 |             if type(entry.id) is DictType:
403 |                 return entry.id.values()[0]
404 |
405 |             return entry.id
406 |
407 |     content = getContent(entry)
408 |     if content and content != "\n": return hash(unu(content)).hexdigest()
409 |     if 'link' in entry: return entry.link
410 |     if 'title' in entry: return hash(unu(entry.title)).hexdigest()
411 |
412 | def getName(fullfeed, entry):
413 |     """Get the best name.
414 |     NEEDS UNIT TESTS"""
415 |
416 |     if NO_FRIENDLY_NAME: return ''
417 |
418 |     feedinfo = fullfeed.feed
419 |     if hasattr(fullfeed, "url") and fullfeed.url in OVERRIDE_FROM.keys():
420 |         return OVERRIDE_FROM[fullfeed.url]
421 |
422 |     name = feedinfo.get('title', '')
423 |
424 |     if 'name' in entry.get('author_detail', []): # normally {} but py2.1
425 |         if entry.author_detail.name:
426 |             if name: name += ": "
427 |             det=entry.author_detail.name
428 |             try:
429 |                 name += entry.author_detail.name
430 |             except UnicodeDecodeError:
431 |                 name += unicode(entry.author_detail.name, 'utf-8')
432 |
433 |     elif 'name' in feedinfo.get('author_detail', []):
434 |         if feedinfo.author_detail.name:
435 |             if name: name += ", "
436 |             name += feedinfo.author_detail.name
437 |
438 |     return name
439 |
440 | def validateEmail(email, planb):
441 |     """Do a basic quality check on email address, but return planb if email doesn't appear to be well-formed"""
442 |     email_parts = email.split('@')
443 |     if (len(email_parts) != 2) or not email_parts[0] or not email_parts[1]:
444 |         return planb
445 |     return email
446 |
447 | def getEmail(r, entry):
448 |     """Get the best email_address. If the best guess isn't well-formed (something@something.com), use DEFAULT_FROM instead.
449 |     NEEDS UNIT TESTS"""
450 |
451 |     feed = r.feed
452 |
453 |     if FORCE_FROM: return DEFAULT_FROM
454 |
455 |     if hasattr(r, "url") and r.url in OVERRIDE_EMAIL.keys():
456 |         return validateEmail(OVERRIDE_EMAIL[r.url], DEFAULT_FROM)
457 |
458 |     if 'email' in entry.get('author_detail', []):
459 |         return validateEmail(entry.author_detail.email, DEFAULT_FROM)
460 |
461 |     if 'email' in feed.get('author_detail', []):
462 |         return validateEmail(feed.author_detail.email, DEFAULT_FROM)
463 |
464 |     if USE_PUBLISHER_EMAIL:
465 |         if 'email' in feed.get('publisher_detail', []):
466 |             return validateEmail(feed.publisher_detail.email, DEFAULT_FROM)
467 |
468 |         if feed.get("errorreportsto", ''):
469 |             return validateEmail(feed.errorreportsto, DEFAULT_FROM)
470 |
471 |     if hasattr(r, "url") and r.url in DEFAULT_EMAIL.keys():
472 |         return DEFAULT_EMAIL[r.url]
473 |     return DEFAULT_FROM
474 |
475 | def getTags(entry):
476 |     """If the entry has any tags, build a tagline and return as a string. Otherwise returns empty string"""
477 |     tagline = ""
478 |     if 'tags' in entry:
479 |         tags = entry.get('tags')
480 |         taglist = []
481 |         if tags:
482 |             for tag in tags:
483 |                 if tag.has_key('term'): taglist.append(tag['term'])
484 |         if taglist:
485 |             tagline = ",".join(taglist)
486 |
487 |     return tagline
488 |
489 |
490 | ### Simple Database of Feeds ###
491 |
492 | class Feed:
493 |     def __init__(self, url, to):
494 |         self.url, self.etag, self.modified, self.seen = url, None, None, {}
495 |         self.active = True
496 |         self.to = to
497 |
498 | def load(lock=1):
499 |     if not os.path.exists(feedfile):
500 |         print 'Feedfile "%s" does not exist.  If you\'re using r2e for the first time, you' % feedfile
501 |         print "have to run 'r2e new' first."
502 |         sys.exit(1)
503 |     try:
504 |         feedfileObject = open(feedfile, 'r')
505 |     except IOError, e:
506 |         print "Feedfile could not be opened: %s" % e
507 |         sys.exit(1)
508 |     feeds = pickle.load(feedfileObject)
509 |
510 |     if lock:
511 |         locktype = 0
512 |         if unix:
513 |             locktype = fcntl.LOCK_EX
514 |             fcntl.flock(feedfileObject.fileno(), locktype)
515 |         #HACK: to deal with lock caching
516 |         feedfileObject = open(feedfile, 'r')
517 |         feeds = pickle.load(feedfileObject)
518 |         if unix:
519 |             fcntl.flock(feedfileObject.fileno(), locktype)
520 |     if feeds:
521 |         for feed in feeds[1:]:
522 |             if not hasattr(feed, 'active'):
523 |                 feed.active = True
524 |
525 |     return feeds, feedfileObject
526 |
527 | def unlock(feeds, feedfileObject):
528 |     if not unix:
529 |         pickle.dump(feeds, open(feedfile, 'w'))
530 |     else:
531 |         fd = open(feedfile+'.tmp', 'w')
532 |         pickle.dump(feeds, fd)
533 |         fd.flush()
534 |         os.fsync(fd.fileno())
535 |         fd.close()
536 |         os.rename(feedfile+'.tmp', feedfile)
537 |         fcntl.flock(feedfileObject.fileno(), fcntl.LOCK_UN)
538 |
539 | #@timelimit(FEED_TIMEOUT)
540 | def parse(url, etag, modified):
541 |     if PROXY == '':
542 |         return feedparser.parse(url, etag, modified)
543 |     else:
544 |         proxy = urllib2.ProxyHandler( {"http":PROXY} )
545 |         return feedparser.parse(url, etag, modified, handlers = [proxy])
546 |
547 |
548 | ### Program Functions ###
549 |
550 | def add(*args):
551 |     if len(args) == 2 and contains(args[1], '@') and not contains(args[1], '://'):
552 |         urls, to = [args[0]], args[1]
553 |     else:
554 |         urls, to = args, None
555 |
556 |     feeds, feedfileObject = load()
557 |     if (feeds and not isstr(feeds[0]) and to is None) or (not len(feeds) and to is None):
558 |         print "No email address has been defined. Please run 'r2e email emailaddress' or"
559 |         print "'r2e add url emailaddress'."
560 |         sys.exit(1)
561 |     for url in urls: feeds.append(Feed(url, to))
562 |     unlock(feeds, feedfileObject)
563 |
564 | def run(num=None):
565 |     feeds, feedfileObject = load()
566 |     smtpserver = None
567 |     try:
568 |         # We store the default to address as the first item in the feeds list.
569 |         # Here we take it out and save it for later.
570 |         default_to = ""
571 |         if feeds and isstr(feeds[0]): default_to = feeds[0]; ifeeds = feeds[1:]
572 |         else: ifeeds = feeds
573 |
574 |         if num: ifeeds = [feeds[num]]
575 |         feednum = 0
576 |
577 |         for f in ifeeds:
578 |             try:
579 |                 feednum += 1
580 |                 if not f.active: continue
581 |
582 |                 if VERBOSE: print >>warn, 'I: Processing [%d] "%s"' % (feednum, f.url)
583 |                 r = {}
584 |                 try:
585 |                     r = timelimit(FEED_TIMEOUT, parse)(f.url, f.etag, f.modified)
586 |                 except TimeoutError:
587 |                     print >>warn, 'W: feed [%d] "%s" timed out' % (feednum, f.url)
588 |                     continue
589 |
590 |                 # Handle various status conditions, as required
591 |                 if 'status' in r:
592 |                     if r.status == 301:
593 |                         print >>warn, "W: feed moved; updating", f.url, "to", r['url']
594 |                         f.url = r['url']
595 |                     elif r.status == 410:
596 |                         print >>warn, "W: feed gone; deleting", f.url
597 |                         feeds.remove(f)
598 |                         continue
599 |
600 |                 http_status = r.get('status', 200)
601 |                 if VERBOSE > 1: print >>warn, "I: http status", http_status
602 |                 http_headers = r.get('headers', {
603 |                     'content-type': 'application/rss+xml',
604 |                     'content-length':'1'})
605 |                 exc_type = r.get("bozo_exception", Exception()).__class__
606 |                 if http_status != 304 and not r.entries and not r.get('version', ''):
607 |                     if http_status not in [200, 302]:
608 |                         print >>warn, "W: error %d [%d] %s" % (http_status, feednum, f.url)
609 |
610 |                     elif contains(http_headers.get('content-type', 'rss'), 'html'):
611 |                         print >>warn, "W: looks like HTML [%d] %s" % (feednum, f.url)
612 |
613 |                     elif http_headers.get('content-length', '1') == '0':
614 |                         print >>warn, "W: empty page [%d] %s" % (feednum, f.url)
615 |
616 |                     elif hasattr(socket, 'timeout') and exc_type == socket.timeout:
617 |                         print >>warn, "W: timed out on [%d] %s" % (feednum, f.url)
618 |
619 |                     elif exc_type == IOError:
620 |                         print >>warn, 'W: "%s" [%d] %s' % (r.bozo_exception, feednum, f.url)
621 |
622 |                     elif hasattr(feedparser, 'zlib') and exc_type == feedparser.zlib.error:
623 |                         print >>warn, "W: broken compression [%d] %s" % (feednum, f.url)
624 |
625 |                     elif exc_type in socket_errors:
626 |                         exc_reason = r.bozo_exception.args[1]
627 |                         print >>warn, "W: %s [%d] %s" % (exc_reason, feednum, f.url)
628 |
629 |                     elif exc_type == urllib2.URLError:
630 |                         if r.bozo_exception.reason.__class__ in socket_errors:
631 |                             exc_reason = r.bozo_exception.reason.args[1]
632 |                         else:
633 |                             exc_reason = r.bozo_exception.reason
634 |                         print >>warn, "W: %s [%d] %s" % (exc_reason, feednum, f.url)
635 |
636 |                     elif exc_type == AttributeError:
637 |                         print >>warn, "W: %s [%d] %s" % (r.bozo_exception, feednum, f.url)
638 |
639 |                     elif exc_type == KeyboardInterrupt:
640 |                         raise r.bozo_exception
641 |
642 |                     elif r.bozo:
643 |                         print >>warn, 'E: error in [%d] "%s" feed (%s)' % (feednum, f.url, r.get("bozo_exception", "can't process"))
644 |
645 |                     else:
646 |                         print >>warn, "=== rss2email encountered a problem with this feed ==="
647 |                         print >>warn, "=== See the rss2email FAQ at http://www.allthingsrss.com/rss2email/ for assistance ==="
648 |                         print >>warn, "=== If this occurs repeatedly, send this to lindsey@allthingsrss.com ==="
649 |                         print >>warn, "E:", r.get("bozo_exception", "can't process"), f.url
650 |                         print >>warn, r
651 |                         print >>warn, "rss2email", __version__
652 |                         print >>warn, "feedparser", feedparser.__version__
653 |                         print >>warn, "html2text", h2t.__version__
654 |                         print >>warn, "Python", sys.version
655 |                         print >>warn, "=== END HERE ==="
656 |                     continue
657 |
658 |                 r.entries.reverse()
659 |
660 |                 for entry in r.entries:
661 |                     id = getID(entry)
662 |
663 |                     # If TRUST_GUID isn't set, we get back hashes of the content.
664 |                     # Instead of letting these run wild, we put them in context
665 |                     # by associating them with the actual ID (if it exists).
666 |
667 |                     frameid = entry.get('id')
668 |                     if not(frameid): frameid = id
669 |                     if type(frameid) is DictType:
670 |                         frameid = frameid.values()[0]
671 |
672 |                     # If this item's ID is in our database
673 |                     # then it's already been sent
674 |                     # and we don't need to do anything more.
675 |
676 |                     if frameid in f.seen:
677 |                         if f.seen[frameid] == id: continue
678 |
679 |                     if not (f.to or default_to):
680 |                         print "No default email address defined. Please run 'r2e email emailaddress'"
681 |                         print "Ignoring feed %s" % f.url
682 |                         break
683 |
684 |                     if 'title_detail' in entry and entry.title_detail:
685 |                         title = entry.title_detail.value
686 |                         if contains(entry.title_detail.type, 'html'):
687 |                             title = html2text(title)
688 |                     else:
689 |                         title = getContent(entry)[:70]
690 |
691 |                     title = title.replace("\n", " ").strip()
692 |
693 |                     datetime = time.gmtime()
694 |
695 |                     if DATE_HEADER:
696 |                         for datetype in DATE_HEADER_ORDER:
697 |                             kind = datetype+"_parsed"
698 |                             if kind in entry and entry[kind]: datetime = entry[kind]
699 |
700 |                     link = entry.get('link', "")
701 |
702 |                     from_addr = getEmail(r, entry)
703 |
704 |                     name = h2t.unescape(getName(r, entry))
705 |                     fromhdr = formataddr((name, from_addr,))
706 |                     tohdr = (f.to or default_to)
707 |                     subjecthdr = title
708 |                     datehdr = time.strftime("%a, %d %b %Y %H:%M:%S -0000", datetime)
709 |                     useragenthdr = "rss2email"
710 |
711 |                     # Add post tags, if available
712 |                     tagline = getTags(entry)
713 |
714 |                     extraheaders = {'Date': datehdr, 'User-Agent': useragenthdr, 'X-RSS-Feed': f.url, 'X-RSS-ID': id, 'X-RSS-URL': link, 'X-RSS-TAGS' : tagline}
715 |                     if BONUS_HEADER != '':
716 |                         for hdr in BONUS_HEADER.strip().splitlines():
717 |                             pos = hdr.strip().find(':')
718 |                             if pos > 0:
719 |                                 extraheaders[hdr[:pos]] = hdr[pos+1:].strip()
720 |                             else:
721 |                                 print >>warn, "W: malformed BONUS HEADER", BONUS_HEADER
722 |
723 |                     entrycontent = getContent(entry, HTMLOK=HTML_MAIL)
724 |                     contenttype = 'plain'
725 |                     content = ''
726 |                     if USE_CSS_STYLING and HTML_MAIL:
727 |                         contenttype = 'html'
728 |                         content = "\n"
729 |                         content += '\n'
730 |                         content += '\n'
731 |                         content += '\n'
732 |                         content += '\n'
735 |                         if ishtml(entrycontent):
736 |                             body = entrycontent[1].strip()
737 |                         else:
738 |                             body = entrycontent.strip()
739 |                         if body != '':
740 |                             content += '\n'
741 |                             content += '\n' + body + ' '+subjecthdr+'\n'
758 |                         content += "\n\n"
759 |                     else:
760 |                         if ishtml(entrycontent):
761 |                             contenttype = 'html'
762 |                             content = "\n"
763 |                             content = ("\n\n" +
764 |                                 ''+subjecthdr+'\n\n' +
765 |                                 entrycontent[1].strip() + # drop type tag (HACK: bad abstraction)
766 |                                 'URL: '+link+'' )
767 |
768 |                             if hasattr(entry,'enclosures'):
769 |                                 for enclosure in entry.enclosures:
770 |                                     if enclosure.url != "":
771 |                                         content += ('Enclosure: '+enclosure.url+"\n")
772 |                             if 'links' in entry:
773 |                                 for extralink in entry.links:
774 |                                     if ('rel' in extralink) and extralink['rel'] == u'via':
775 |                                         content += 'Via: '+extralink['title']+'\n'
776 |
777 |                             content += ("\n")
778 |                         else:
779 |                             content = entrycontent.strip() + "\n\nURL: "+link
780 |                             if hasattr(entry,'enclosures'):
781 |                                 for enclosure in entry.enclosures:
782 |                                     if enclosure.url != "":
783 |                                         content += ('\nEnclosure: ' + enclosure.url + "\n")
784 |                             if 'links' in entry:
785 |                                 for extralink in entry.links:
786 |                                     if ('rel' in extralink) and extralink['rel'] == u'via':
787 |                                         content += 'Via: '+extralink['title']+'\n'
788 |
789 |                     smtpserver = send(fromhdr, tohdr, subjecthdr, content, contenttype, extraheaders, smtpserver)
790 |
791 |                     f.seen[frameid] = id
792 |
793 |                 f.etag, f.modified = r.get('etag', None), r.get('modified', None)
794 |             except (KeyboardInterrupt, SystemExit):
795 |                 raise
796 |             except:
797 |                 print >>warn, "=== rss2email encountered a problem with this feed ==="
798 |                 print >>warn, "=== See the rss2email FAQ at http://www.allthingsrss.com/rss2email/ for assistance ==="
799 |                 print >>warn, "=== If this occurs repeatedly, send this to lindsey@allthingsrss.com ==="
800 |                 print >>warn, "E: could not parse", f.url
801 |                 traceback.print_exc(file=warn)
802 |                 print >>warn, "rss2email", __version__
803 |                 print >>warn, "feedparser", feedparser.__version__
804 |                 print >>warn, "html2text", h2t.__version__
805 |                 print >>warn, "Python", sys.version
806 |                 print >>warn, "=== END HERE ==="
807 |                 continue
808 |
809 |     finally:
810 |         unlock(feeds, feedfileObject)
811 |         if smtpserver:
812 |             smtpserver.quit()
813 |
814 | def list():
815 |     feeds, feedfileObject = load(lock=0)
816 |     default_to = ""
817 |
818 |     if feeds and isstr(feeds[0]):
819 |         default_to = feeds[0]; ifeeds = feeds[1:]; i=1
820 |         print "default email:", default_to
821 |     else: ifeeds = feeds; i = 0
822 |     for f in ifeeds:
823 |         active = ('[ ]', '[*]')[f.active]
824 |         print `i`+':',active, f.url, '('+(f.to or ('default: '+default_to))+')'
825 |         if not (f.to or default_to):
826 |             print "    W: Please define a default address with 'r2e email emailaddress'"
827 |         i+= 1
828 |
829 | def opmlexport():
830 |     import xml.sax.saxutils
831 |     feeds, feedfileObject = load(lock=0)
832 |
833 |     if feeds:
834 |         print '\n\n\n '
839 |
840 | def opmlimport(importfile):
841 |     importfileObject = None
842 |     print 'Importing feeds from', importfile
843 |     if not os.path.exists(importfile):
844 |         print 'OPML import file "%s" does not exist.' % importfile
845 |     try:
846 |         importfileObject = open(importfile, 'r')
847 |     except IOError, e:
848 |         print "OPML import file could not be opened: %s" % e
849 |         sys.exit(1)
850 |     try:
851 |         import xml.dom.minidom
852 |         dom = xml.dom.minidom.parse(importfileObject)
853 |         newfeeds = dom.getElementsByTagName('outline')
854 |     except:
855 |         print 'E: Unable to parse OPML file'
856 |         sys.exit(1)
857 |
858 |     feeds, feedfileObject = load(lock=1)
859 |
860 |     import xml.sax.saxutils
861 |
862 |     for f in newfeeds:
863 |         if f.hasAttribute('xmlUrl'):
864 |             feedurl = f.getAttribute('xmlUrl')
865 |             print 'Adding %s' % xml.sax.saxutils.unescape(feedurl)
866 |             feeds.append(Feed(feedurl, None))
867 |
868 |     unlock(feeds, feedfileObject)
869 |
870 | def delete(n):
871 |     feeds, feedfileObject = load()
872 |     if (n == 0) and (feeds and isstr(feeds[0])):
873 |         print >>warn, "W: ID has to be equal to or higher than 1"
874 |     elif n >= len(feeds):
875 |         print >>warn, "W: no such feed"
876 |     else:
877 |         print >>warn, "W: deleting feed %s" % feeds[n].url
878 |         feeds = feeds[:n] + feeds[n+1:]
879 |         if n != len(feeds):
880 |             print >>warn, "W: feed IDs have changed, list before deleting again"
881 |     unlock(feeds, feedfileObject)
882 |
883 | def toggleactive(n, active):
884 |     feeds, feedfileObject = load()
885 |     if (n == 0) and (feeds and isstr(feeds[0])):
886 |         print >>warn, "W: ID has to be equal to or higher than 1"
887 |     elif n >= len(feeds):
888 |         print >>warn, "W: no such feed"
889 |     else:
890 |         action = ('Pausing', 'Unpausing')[active]
891 |         print >>warn, "%s feed %s" % (action, feeds[n].url)
892 |         feeds[n].active = active
893 |     unlock(feeds, feedfileObject)
894 |
895 | def reset():
896 |     feeds, feedfileObject = load()
897 |     if feeds and isstr(feeds[0]):
898 |         ifeeds = feeds[1:]
899 |     else: ifeeds = feeds
900 |     for f in ifeeds:
901 |         if VERBOSE: print "Resetting %d already seen items" % len(f.seen)
902 |         f.seen = {}
903 |         f.etag = None
904 |         f.modified = None
905 |
906 |     unlock(feeds, feedfileObject)
907 |
908 | def email(addr):
909 |     feeds, feedfileObject = load()
910 |     if feeds and isstr(feeds[0]): feeds[0] = addr
911 |     else: feeds = [addr] + feeds
912 |     unlock(feeds, feedfileObject)
913 |
914 | if __name__ == '__main__':
915 |     args = sys.argv
916 |     try:
917 |         if len(args) < 3: raise InputError, "insufficient args"
918 |         feedfile, action, args = args[1], args[2], args[3:]
919 |
920 |         if action == "run":
921 |             if args and args[0] == "--no-send":
922 |                 def send(sender, recipient, subject, body, contenttype, extraheaders=None, smtpserver=None):
923 |                     if VERBOSE: print 'Not sending:', unu(subject)
924 |
925 |             if args and args[-1].isdigit(): run(int(args[-1]))
926 |             else: run()
927 |
928 |         elif action == "email":
929 |             if not args:
930 |                 raise InputError, "Action '%s' requires an argument" % action
931 |             else:
932 |                 email(args[0])
933 |
934 |         elif action == "add": add(*args)
935 |
936 |         elif action == "new":
937 |             if len(args) == 1: d = [args[0]]
938 |             else: d = []
939 |             pickle.dump(d, open(feedfile, 'w'))
940 |
941 |         elif action == "list": list()
942 |
943 |         elif action in ("help", "--help", "-h"): print __doc__
944 |
945 |         elif action == "delete":
946 |             if not args:
947 |                 raise InputError, "Action '%s' requires an argument" % action
948 |             elif args[0].isdigit():
949 |                 delete(int(args[0]))
950 |             else:
951 |                 raise InputError, "Action '%s' requires a number as its argument" % action
952 |
953 |         elif action in ("pause", "unpause"):
954 |             if not args:
955 |                 raise InputError, "Action '%s' requires an argument" % action
956 |             elif args[0].isdigit():
957 |                 active = (action == "unpause")
958 |                 toggleactive(int(args[0]), active)
959 |             else:
960 |                 raise InputError, "Action '%s' requires a number as its argument" % action
961 |
962 |         elif action == "reset": reset()
963 |
964 |         elif action == "opmlexport": opmlexport()
965 |
966 |         elif action == "opmlimport":
967 |             if not args:
968 |                 raise InputError, "OPML import '%s' requires a filename argument" % action
969 |             opmlimport(args[0])
970 |
971 |         else:
972 |             raise InputError, "Invalid action"
973 |
974 |     except InputError, e:
975 |         print "E:", e
976 |         print
977 |         print __doc__
978 |
979 |
--------------------------------------------------------------------------------
/BeautifulSoup.py:
--------------------------------------------------------------------------------
1 | """Beautiful Soup
2 | Elixir and Tonic
3 | "The Screen-Scraper's Friend"
4 | http://www.crummy.com/software/BeautifulSoup/
5 |
6 | Beautiful Soup parses a (possibly invalid) XML or HTML document into a
7 | tree representation. It provides methods and Pythonic idioms that make
8 | it easy to navigate, search, and modify the tree.
9 |
10 | A well-formed XML/HTML document yields a well-formed data
11 | structure. An ill-formed XML/HTML document yields a correspondingly
12 | ill-formed data structure. If your document is only locally
13 | well-formed, you can use this library to find and process the
14 | well-formed part of it.
15 |
16 | Beautiful Soup works with Python 2.2 and up. It has no external
17 | dependencies, but you'll have more success at converting data to UTF-8
18 | if you also install these three packages:
19 |
20 | * chardet, for auto-detecting character encodings
21 |   http://chardet.feedparser.org/
22 | * cjkcodecs and iconv_codec, which add more encodings to the ones supported
23 |   by stock Python.
24 |   http://cjkpython.i18n.org/
25 |
26 | Beautiful Soup defines classes for two main parsing strategies:
27 |
28 |  * BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific
29 |    language that kind of looks like XML.
30 |
31 |  * BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid
32 |    or invalid. This class has web browser-like heuristics for
33 |    obtaining a sensible parse tree in the face of common HTML errors.
34 |
35 | Beautiful Soup also defines a class (UnicodeDammit) for autodetecting
36 | the encoding of an HTML or XML document, and converting it to
37 | Unicode. Much of this code is taken from Mark Pilgrim's Universal Feed Parser.
38 |
39 | For more than you ever wanted to know about Beautiful Soup, see the
40 | documentation:
41 | http://www.crummy.com/software/BeautifulSoup/documentation.html
42 |
43 | Here, have some legalese:
44 |
45 | Copyright (c) 2004-2010, Leonard Richardson
46 |
47 | All rights reserved.
48 |
49 | Redistribution and use in source and binary forms, with or without
50 | modification, are permitted provided that the following conditions are
51 | met:
52 |
53 |   * Redistributions of source code must retain the above copyright
54 |     notice, this list of conditions and the following disclaimer.
55 |
56 |   * Redistributions in binary form must reproduce the above
57 |     copyright notice, this list of conditions and the following
58 |     disclaimer in the documentation and/or other materials provided
59 |     with the distribution.
60 |
61 |   * Neither the name of the the Beautiful Soup Consortium and All
62 |     Night Kosher Bakery nor the names of its contributors may be
63 |     used to endorse or promote products derived from this software
64 |     without specific prior written permission.
65 |
66 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
67 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
68 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
69 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
70 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
71 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
72 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
73 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
74 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
75 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
76 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE, DAMMIT.
77 |
78 | """
79 | from __future__ import generators
80 |
81 | __author__ = "Leonard Richardson (leonardr@segfault.org)"
82 | __version__ = "3.2.0"
83 | __copyright__ = "Copyright (c) 2004-2010 Leonard Richardson"
84 | __license__ = "New-style BSD"
85 |
86 | from sgmllib import SGMLParser, SGMLParseError
87 | import codecs
88 | import markupbase
89 | import types
90 | import re
91 | import sgmllib
92 | try:
93 |     from htmlentitydefs import name2codepoint
94 | except ImportError:
95 |     name2codepoint = {}
96 | try:
97 |     set
98 | except NameError:
99 |     from sets import Set as set
100 |
101 | #These hacks make Beautiful Soup able to parse XML with namespaces
102 | sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
103 | markupbase._declname_match = re.compile(r'[a-zA-Z][-_.:a-zA-Z0-9]*\s*').match
104 |
105 | DEFAULT_OUTPUT_ENCODING = "utf-8"
106 |
107 | def _match_css_class(str):
108 |     """Build a RE to match the given CSS class."""
109 |     return re.compile(r"(^|.*\s)%s($|\s)" % str)
110 |
111 | # First, the classes that represent markup elements.
112 |
113 | class PageElement(object):
114 |     """Contains the navigational information for some part of the page
115 |     (either a tag or a piece of text)"""
116 |
117 |     def setup(self, parent=None, previous=None):
118 |         """Sets up the initial relations between this element and
119 |         other elements."""
120 |         self.parent = parent
121 |         self.previous = previous
122 |         self.next = None
123 |         self.previousSibling = None
124 |         self.nextSibling = None
125 |         if self.parent and self.parent.contents:
126 |             self.previousSibling = self.parent.contents[-1]
127 |             self.previousSibling.nextSibling = self
128 |
129 |     def replaceWith(self, replaceWith):
130 |         oldParent = self.parent
131 |         myIndex = self.parent.index(self)
132 |         if hasattr(replaceWith, "parent")\
133 |            and replaceWith.parent is self.parent:
134 |             # We're replacing this element with one of its siblings.
135 |             index = replaceWith.parent.index(replaceWith)
136 |             if index and index < myIndex:
137 |                 # Furthermore, it comes before this element. That
138 |                 # means that when we extract it, the index of this
139 |                 # element will change.
140 | myIndex = myIndex - 1 141 | self.extract() 142 | oldParent.insert(myIndex, replaceWith) 143 | 144 | def replaceWithChildren(self): 145 | myParent = self.parent 146 | myIndex = self.parent.index(self) 147 | self.extract() 148 | reversedChildren = list(self.contents) 149 | reversedChildren.reverse() 150 | for child in reversedChildren: 151 | myParent.insert(myIndex, child) 152 | 153 | def extract(self): 154 | """Destructively rips this element out of the tree.""" 155 | if self.parent: 156 | try: 157 | del self.parent.contents[self.parent.index(self)] 158 | except ValueError: 159 | pass 160 | 161 | #Find the two elements that would be next to each other if 162 | #this element (and any children) hadn't been parsed. Connect 163 | #the two. 164 | lastChild = self._lastRecursiveChild() 165 | nextElement = lastChild.next 166 | 167 | if self.previous: 168 | self.previous.next = nextElement 169 | if nextElement: 170 | nextElement.previous = self.previous 171 | self.previous = None 172 | lastChild.next = None 173 | 174 | self.parent = None 175 | if self.previousSibling: 176 | self.previousSibling.nextSibling = self.nextSibling 177 | if self.nextSibling: 178 | self.nextSibling.previousSibling = self.previousSibling 179 | self.previousSibling = self.nextSibling = None 180 | return self 181 | 182 | def _lastRecursiveChild(self): 183 | "Finds the last element beneath this object to be parsed." 184 | lastChild = self 185 | while hasattr(lastChild, 'contents') and lastChild.contents: 186 | lastChild = lastChild.contents[-1] 187 | return lastChild 188 | 189 | def insert(self, position, newChild): 190 | if isinstance(newChild, basestring) \ 191 | and not isinstance(newChild, NavigableString): 192 | newChild = NavigableString(newChild) 193 | 194 | position = min(position, len(self.contents)) 195 | if hasattr(newChild, 'parent') and newChild.parent is not None: 196 | # We're 'inserting' an element that's already one 197 | # of this object's children. 198 | if newChild.parent is self: 199 | index = self.index(newChild) 200 | if index > position: 201 | # Furthermore we're moving it further down the 202 | # list of this object's children. That means that 203 | # when we extract this element, our target index 204 | # will jump down one. 205 | position = position - 1 206 | newChild.extract() 207 | 208 | newChild.parent = self 209 | previousChild = None 210 | if position == 0: 211 | newChild.previousSibling = None 212 | newChild.previous = self 213 | else: 214 | previousChild = self.contents[position-1] 215 | newChild.previousSibling = previousChild 216 | newChild.previousSibling.nextSibling = newChild 217 | newChild.previous = previousChild._lastRecursiveChild() 218 | if newChild.previous: 219 | newChild.previous.next = newChild 220 | 221 | newChildsLastElement = newChild._lastRecursiveChild() 222 | 223 | if position >= len(self.contents): 224 | newChild.nextSibling = None 225 | 226 | parent = self 227 | parentsNextSibling = None 228 | while not parentsNextSibling: 229 | parentsNextSibling = parent.nextSibling 230 | parent = parent.parent 231 | if not parent: # This is the last element in the document. 
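# (No ancestor has a next sibling, so nothing follows the newly
# inserted child in document order; its next pointer stays None.)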
232 | break 233 | if parentsNextSibling: 234 | newChildsLastElement.next = parentsNextSibling 235 | else: 236 | newChildsLastElement.next = None 237 | else: 238 | nextChild = self.contents[position] 239 | newChild.nextSibling = nextChild 240 | if newChild.nextSibling: 241 | newChild.nextSibling.previousSibling = newChild 242 | newChildsLastElement.next = nextChild 243 | 244 | if newChildsLastElement.next: 245 | newChildsLastElement.next.previous = newChildsLastElement 246 | self.contents.insert(position, newChild) 247 | 248 | def append(self, tag): 249 | """Appends the given tag to the contents of this tag.""" 250 | self.insert(len(self.contents), tag) 251 | 252 | def findNext(self, name=None, attrs={}, text=None, **kwargs): 253 | """Returns the first item that matches the given criteria and 254 | appears after this Tag in the document.""" 255 | return self._findOne(self.findAllNext, name, attrs, text, **kwargs) 256 | 257 | def findAllNext(self, name=None, attrs={}, text=None, limit=None, 258 | **kwargs): 259 | """Returns all items that match the given criteria and appear 260 | after this Tag in the document.""" 261 | return self._findAll(name, attrs, text, limit, self.nextGenerator, 262 | **kwargs) 263 | 264 | def findNextSibling(self, name=None, attrs={}, text=None, **kwargs): 265 | """Returns the closest sibling to this Tag that matches the 266 | given criteria and appears after this Tag in the document.""" 267 | return self._findOne(self.findNextSiblings, name, attrs, text, 268 | **kwargs) 269 | 270 | def findNextSiblings(self, name=None, attrs={}, text=None, limit=None, 271 | **kwargs): 272 | """Returns the siblings of this Tag that match the given 273 | criteria and appear after this Tag in the document.""" 274 | return self._findAll(name, attrs, text, limit, 275 | self.nextSiblingGenerator, **kwargs) 276 | fetchNextSiblings = findNextSiblings # Compatibility with pre-3.x 277 | 278 | def findPrevious(self, name=None, attrs={}, text=None, **kwargs): 279 | """Returns the first item that matches the given criteria and 280 | appears before this Tag in the document.""" 281 | return self._findOne(self.findAllPrevious, name, attrs, text, **kwargs) 282 | 283 | def findAllPrevious(self, name=None, attrs={}, text=None, limit=None, 284 | **kwargs): 285 | """Returns all items that match the given criteria and appear 286 | before this Tag in the document.""" 287 | return self._findAll(name, attrs, text, limit, self.previousGenerator, 288 | **kwargs) 289 | fetchPrevious = findAllPrevious # Compatibility with pre-3.x 290 | 291 | def findPreviousSibling(self, name=None, attrs={}, text=None, **kwargs): 292 | """Returns the closest sibling to this Tag that matches the 293 | given criteria and appears before this Tag in the document.""" 294 | return self._findOne(self.findPreviousSiblings, name, attrs, text, 295 | **kwargs) 296 | 297 | def findPreviousSiblings(self, name=None, attrs={}, text=None, 298 | limit=None, **kwargs): 299 | """Returns the siblings of this Tag that match the given 300 | criteria and appear before this Tag in the document.""" 301 | return self._findAll(name, attrs, text, limit, 302 | self.previousSiblingGenerator, **kwargs) 303 | fetchPreviousSiblings = findPreviousSiblings # Compatibility with pre-3.x 304 | 305 | def findParent(self, name=None, attrs={}, **kwargs): 306 | """Returns the closest parent of this Tag that matches the given 307 | criteria.""" 308 | # NOTE: We can't use _findOne because findParents takes a different 309 | # set of arguments. 
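# Illustrative use of findParent/findParents (markup invented for the
# example):
#
#   >>> soup = BeautifulSoup('<div id="a"><div id="b"><b>x</b></div></div>')
#   >>> soup.b.findParent('div')['id']
#   u'b'
#   >>> [d['id'] for d in soup.b.findParents('div')]
#   [u'b', u'a']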
310 | r = None 311 | l = self.findParents(name, attrs, 1) 312 | if l: 313 | r = l[0] 314 | return r 315 | 316 | def findParents(self, name=None, attrs={}, limit=None, **kwargs): 317 | """Returns the parents of this Tag that match the given 318 | criteria.""" 319 | 320 | return self._findAll(name, attrs, None, limit, self.parentGenerator, 321 | **kwargs) 322 | fetchParents = findParents # Compatibility with pre-3.x 323 | 324 | #These methods do the real heavy lifting. 325 | 326 | def _findOne(self, method, name, attrs, text, **kwargs): 327 | r = None 328 | l = method(name, attrs, text, 1, **kwargs) 329 | if l: 330 | r = l[0] 331 | return r 332 | 333 | def _findAll(self, name, attrs, text, limit, generator, **kwargs): 334 | "Iterates over a generator looking for things that match." 335 | 336 | if isinstance(name, SoupStrainer): 337 | strainer = name 338 | # (Possibly) special case some findAll*(...) searches 339 | elif text is None and not limit and not attrs and not kwargs: 340 | # findAll*(True) 341 | if name is True: 342 | return [element for element in generator() 343 | if isinstance(element, Tag)] 344 | # findAll*('tag-name') 345 | elif isinstance(name, basestring): 346 | return [element for element in generator() 347 | if isinstance(element, Tag) and 348 | element.name == name] 349 | else: 350 | strainer = SoupStrainer(name, attrs, text, **kwargs) 351 | # Build a SoupStrainer 352 | else: 353 | strainer = SoupStrainer(name, attrs, text, **kwargs) 354 | results = ResultSet(strainer) 355 | g = generator() 356 | while True: 357 | try: 358 | i = g.next() 359 | except StopIteration: 360 | break 361 | if i: 362 | found = strainer.search(i) 363 | if found: 364 | results.append(found) 365 | if limit and len(results) >= limit: 366 | break 367 | return results 368 | 369 | #These Generators can be used to navigate starting from both 370 | #NavigableStrings and Tags. 371 | def nextGenerator(self): 372 | i = self 373 | while i is not None: 374 | i = i.next 375 | yield i 376 | 377 | def nextSiblingGenerator(self): 378 | i = self 379 | while i is not None: 380 | i = i.nextSibling 381 | yield i 382 | 383 | def previousGenerator(self): 384 | i = self 385 | while i is not None: 386 | i = i.previous 387 | yield i 388 | 389 | def previousSiblingGenerator(self): 390 | i = self 391 | while i is not None: 392 | i = i.previousSibling 393 | yield i 394 | 395 | def parentGenerator(self): 396 | i = self 397 | while i is not None: 398 | i = i.parent 399 | yield i 400 | 401 | # Utility methods 402 | def substituteEncoding(self, str, encoding=None): 403 | encoding = encoding or "utf-8" 404 | return str.replace("%SOUP-ENCODING%", encoding) 405 | 406 | def toEncoding(self, s, encoding=None): 407 | """Encodes an object to a string in some encoding, or to Unicode. 408 | .""" 409 | if isinstance(s, unicode): 410 | if encoding: 411 | s = s.encode(encoding) 412 | elif isinstance(s, str): 413 | if encoding: 414 | s = s.encode(encoding) 415 | else: 416 | s = unicode(s) 417 | else: 418 | if encoding: 419 | s = self.toEncoding(str(s), encoding) 420 | else: 421 | s = unicode(s) 422 | return s 423 | 424 | class NavigableString(unicode, PageElement): 425 | 426 | def __new__(cls, value): 427 | """Create a new NavigableString. 428 | 429 | When unpickling a NavigableString, this method is called with 430 | the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be 431 | passed in to the superclass's __new__ or the superclass won't know 432 | how to handle non-ASCII characters. 
433 | """ 434 | if isinstance(value, unicode): 435 | return unicode.__new__(cls, value) 436 | return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING) 437 | 438 | def __getnewargs__(self): 439 | return (NavigableString.__str__(self),) 440 | 441 | def __getattr__(self, attr): 442 | """text.string gives you text. This is for backwards 443 | compatibility for Navigable*String, but for CData* it lets you 444 | get the string without the CData wrapper.""" 445 | if attr == 'string': 446 | return self 447 | else: 448 | raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr) 449 | 450 | def __unicode__(self): 451 | return str(self).decode(DEFAULT_OUTPUT_ENCODING) 452 | 453 | def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING): 454 | if encoding: 455 | return self.encode(encoding) 456 | else: 457 | return self 458 | 459 | class CData(NavigableString): 460 | 461 | def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING): 462 | return "" % NavigableString.__str__(self, encoding) 463 | 464 | class ProcessingInstruction(NavigableString): 465 | def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING): 466 | output = self 467 | if "%SOUP-ENCODING%" in output: 468 | output = self.substituteEncoding(output, encoding) 469 | return "%s?>" % self.toEncoding(output, encoding) 470 | 471 | class Comment(NavigableString): 472 | def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING): 473 | return "" % NavigableString.__str__(self, encoding) 474 | 475 | class Declaration(NavigableString): 476 | def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING): 477 | return "" % NavigableString.__str__(self, encoding) 478 | 479 | class Tag(PageElement): 480 | 481 | """Represents a found HTML tag with its attributes and contents.""" 482 | 483 | def _invert(h): 484 | "Cheap function to invert a hash." 485 | i = {} 486 | for k,v in h.items(): 487 | i[v] = k 488 | return i 489 | 490 | XML_ENTITIES_TO_SPECIAL_CHARS = { "apos" : "'", 491 | "quot" : '"', 492 | "amp" : "&", 493 | "lt" : "<", 494 | "gt" : ">" } 495 | 496 | XML_SPECIAL_CHARS_TO_ENTITIES = _invert(XML_ENTITIES_TO_SPECIAL_CHARS) 497 | 498 | def _convertEntities(self, match): 499 | """Used in a call to re.sub to replace HTML, XML, and numeric 500 | entities with the appropriate Unicode characters. If HTML 501 | entities are being converted, any unrecognized entities are 502 | escaped.""" 503 | x = match.group(1) 504 | if self.convertHTMLEntities and x in name2codepoint: 505 | return unichr(name2codepoint[x]) 506 | elif x in self.XML_ENTITIES_TO_SPECIAL_CHARS: 507 | if self.convertXMLEntities: 508 | return self.XML_ENTITIES_TO_SPECIAL_CHARS[x] 509 | else: 510 | return u'&%s;' % x 511 | elif len(x) > 0 and x[0] == '#': 512 | # Handle numeric entities 513 | if len(x) > 1 and x[1] == 'x': 514 | return unichr(int(x[2:], 16)) 515 | else: 516 | return unichr(int(x[1:])) 517 | 518 | elif self.escapeUnrecognizedEntities: 519 | return u'&%s;' % x 520 | else: 521 | return u'&%s;' % x 522 | 523 | def __init__(self, parser, name, attrs=None, parent=None, 524 | previous=None): 525 | "Basic constructor." 
526 | 527 | # We don't actually store the parser object: that lets extracted 528 | # chunks be garbage-collected 529 | self.parserClass = parser.__class__ 530 | self.isSelfClosing = parser.isSelfClosingTag(name) 531 | self.name = name 532 | if attrs is None: 533 | attrs = [] 534 | elif isinstance(attrs, dict): 535 | attrs = attrs.items() 536 | self.attrs = attrs 537 | self.contents = [] 538 | self.setup(parent, previous) 539 | self.hidden = False 540 | self.containsSubstitutions = False 541 | self.convertHTMLEntities = parser.convertHTMLEntities 542 | self.convertXMLEntities = parser.convertXMLEntities 543 | self.escapeUnrecognizedEntities = parser.escapeUnrecognizedEntities 544 | 545 | # Convert any HTML, XML, or numeric entities in the attribute values. 546 | convert = lambda(k, val): (k, 547 | re.sub("&(#\d+|#x[0-9a-fA-F]+|\w+);", 548 | self._convertEntities, 549 | val)) 550 | self.attrs = map(convert, self.attrs) 551 | 552 | def getString(self): 553 | if (len(self.contents) == 1 554 | and isinstance(self.contents[0], NavigableString)): 555 | return self.contents[0] 556 | 557 | def setString(self, string): 558 | """Replace the contents of the tag with a string""" 559 | self.clear() 560 | self.append(string) 561 | 562 | string = property(getString, setString) 563 | 564 | def getText(self, separator=u""): 565 | if not len(self.contents): 566 | return u"" 567 | stopNode = self._lastRecursiveChild().next 568 | strings = [] 569 | current = self.contents[0] 570 | while current is not stopNode: 571 | if isinstance(current, NavigableString): 572 | strings.append(current.strip()) 573 | current = current.next 574 | return separator.join(strings) 575 | 576 | text = property(getText) 577 | 578 | def get(self, key, default=None): 579 | """Returns the value of the 'key' attribute for the tag, or 580 | the value given for 'default' if it doesn't have that 581 | attribute.""" 582 | return self._getAttrMap().get(key, default) 583 | 584 | def clear(self): 585 | """Extract all children.""" 586 | for child in self.contents[:]: 587 | child.extract() 588 | 589 | def index(self, element): 590 | for i, child in enumerate(self.contents): 591 | if child is element: 592 | return i 593 | raise ValueError("Tag.index: element not in tag") 594 | 595 | def has_key(self, key): 596 | return self._getAttrMap().has_key(key) 597 | 598 | def __getitem__(self, key): 599 | """tag[key] returns the value of the 'key' attribute for the tag, 600 | and throws an exception if it's not there.""" 601 | return self._getAttrMap()[key] 602 | 603 | def __iter__(self): 604 | "Iterating over a tag iterates over its contents." 605 | return iter(self.contents) 606 | 607 | def __len__(self): 608 | "The length of a tag is the length of its list of contents." 609 | return len(self.contents) 610 | 611 | def __contains__(self, x): 612 | return x in self.contents 613 | 614 | def __nonzero__(self): 615 | "A tag is non-None even if it has no contents." 616 | return True 617 | 618 | def __setitem__(self, key, value): 619 | """Setting tag[key] sets the value of the 'key' attribute for the 620 | tag.""" 621 | self._getAttrMap() 622 | self.attrMap[key] = value 623 | found = False 624 | for i in range(0, len(self.attrs)): 625 | if self.attrs[i][0] == key: 626 | self.attrs[i] = (key, value) 627 | found = True 628 | if not found: 629 | self.attrs.append((key, value)) 630 | self._getAttrMap()[key] = value 631 | 632 | def __delitem__(self, key): 633 | "Deleting tag[key] deletes all 'key' attributes for the tag." 
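# Attribute access in practice (markup invented for the example):
#
#   >>> soup = BeautifulSoup('<a href="http://example.invalid/" id="x">hi</a>')
#   >>> soup.a['href']
#   u'http://example.invalid/'
#   >>> soup.a.get('title', 'n/a')     # missing attributes fall back to default
#   'n/a'
#   >>> del soup.a['id']               # drops every 'id' pair from attrs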
634 | for item in self.attrs: 635 | if item[0] == key: 636 | self.attrs.remove(item) 637 | #We don't break because bad HTML can define the same 638 | #attribute multiple times. 639 | self._getAttrMap() 640 | if self.attrMap.has_key(key): 641 | del self.attrMap[key] 642 | 643 | def __call__(self, *args, **kwargs): 644 | """Calling a tag like a function is the same as calling its 645 | findAll() method. Eg. tag('a') returns a list of all the A tags 646 | found within this tag.""" 647 | return apply(self.findAll, args, kwargs) 648 | 649 | def __getattr__(self, tag): 650 | #print "Getattr %s.%s" % (self.__class__, tag) 651 | if len(tag) > 3 and tag.rfind('Tag') == len(tag)-3: 652 | return self.find(tag[:-3]) 653 | elif tag.find('__') != 0: 654 | return self.find(tag) 655 | raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__, tag) 656 | 657 | def __eq__(self, other): 658 | """Returns true iff this tag has the same name, the same attributes, 659 | and the same contents (recursively) as the given tag. 660 | 661 | NOTE: right now this will return false if two tags have the 662 | same attributes in a different order. Should this be fixed?""" 663 | if other is self: 664 | return True 665 | if not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or self.name != other.name or self.attrs != other.attrs or len(self) != len(other): 666 | return False 667 | for i in range(0, len(self.contents)): 668 | if self.contents[i] != other.contents[i]: 669 | return False 670 | return True 671 | 672 | def __ne__(self, other): 673 | """Returns true iff this tag is not identical to the other tag, 674 | as defined in __eq__.""" 675 | return not self == other 676 | 677 | def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING): 678 | """Renders this tag as a string.""" 679 | return self.__str__(encoding) 680 | 681 | def __unicode__(self): 682 | return self.__str__(None) 683 | 684 | BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|" 685 | + "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)" 686 | + ")") 687 | 688 | def _sub_entity(self, x): 689 | """Used with a regular expression to substitute the 690 | appropriate XML entity for an XML special character.""" 691 | return "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";" 692 | 693 | def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING, 694 | prettyPrint=False, indentLevel=0): 695 | """Returns a string or Unicode representation of this tag and 696 | its contents. To get Unicode, pass None for encoding. 697 | 698 | NOTE: since Python's HTML parser consumes whitespace, this 699 | method is not certain to reproduce the whitespace present in 700 | the original string.""" 701 | 702 | encodedName = self.toEncoding(self.name, encoding) 703 | 704 | attrs = [] 705 | if self.attrs: 706 | for key, val in self.attrs: 707 | fmt = '%s="%s"' 708 | if isinstance(val, basestring): 709 | if self.containsSubstitutions and '%SOUP-ENCODING%' in val: 710 | val = self.substituteEncoding(val, encoding) 711 | 712 | # The attribute value either: 713 | # 714 | # * Contains no embedded double quotes or single quotes. 715 | # No problem: we enclose it in double quotes. 716 | # * Contains embedded single quotes. No problem: 717 | # double quotes work here too. 718 | # * Contains embedded double quotes. No problem: 719 | # we enclose it in single quotes. 720 | # * Embeds both single _and_ double quotes. This 721 | # can't happen naturally, but it can happen if 722 | # you modify an attribute value after parsing 723 | # the document. 
Now we have a bit of a 724 | # problem. We solve it by enclosing the 725 | # attribute in single quotes, and escaping any 726 | # embedded single quotes to XML entities. 727 | if '"' in val: 728 | fmt = "%s='%s'" 729 | if "'" in val: 730 | # TODO: replace with apos when 731 | # appropriate. 732 | val = val.replace("'", "&squot;") 733 | 734 | # Now we're okay w/r/t quotes. But the attribute 735 | # value might also contain angle brackets, or 736 | # ampersands that aren't part of entities. We need 737 | # to escape those to XML entities too. 738 | val = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, val) 739 | 740 | attrs.append(fmt % (self.toEncoding(key, encoding), 741 | self.toEncoding(val, encoding))) 742 | close = '' 743 | closeTag = '' 744 | if self.isSelfClosing: 745 | close = ' /' 746 | else: 747 | closeTag = '%s>' % encodedName 748 | 749 | indentTag, indentContents = 0, 0 750 | if prettyPrint: 751 | indentTag = indentLevel 752 | space = (' ' * (indentTag-1)) 753 | indentContents = indentTag + 1 754 | contents = self.renderContents(encoding, prettyPrint, indentContents) 755 | if self.hidden: 756 | s = contents 757 | else: 758 | s = [] 759 | attributeString = '' 760 | if attrs: 761 | attributeString = ' ' + ' '.join(attrs) 762 | if prettyPrint: 763 | s.append(space) 764 | s.append('<%s%s%s>' % (encodedName, attributeString, close)) 765 | if prettyPrint: 766 | s.append("\n") 767 | s.append(contents) 768 | if prettyPrint and contents and contents[-1] != "\n": 769 | s.append("\n") 770 | if prettyPrint and closeTag: 771 | s.append(space) 772 | s.append(closeTag) 773 | if prettyPrint and closeTag and self.nextSibling: 774 | s.append("\n") 775 | s = ''.join(s) 776 | return s 777 | 778 | def decompose(self): 779 | """Recursively destroys the contents of this tree.""" 780 | self.extract() 781 | if len(self.contents) == 0: 782 | return 783 | current = self.contents[0] 784 | while current is not None: 785 | next = current.next 786 | if isinstance(current, Tag): 787 | del current.contents[:] 788 | current.parent = None 789 | current.previous = None 790 | current.previousSibling = None 791 | current.next = None 792 | current.nextSibling = None 793 | current = next 794 | 795 | def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING): 796 | return self.__str__(encoding, True) 797 | 798 | def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING, 799 | prettyPrint=False, indentLevel=0): 800 | """Renders the contents of this tag as a string in the given 801 | encoding. If encoding is None, returns a Unicode string..""" 802 | s=[] 803 | for c in self: 804 | text = None 805 | if isinstance(c, NavigableString): 806 | text = c.__str__(encoding) 807 | elif isinstance(c, Tag): 808 | s.append(c.__str__(encoding, prettyPrint, indentLevel)) 809 | if text and prettyPrint: 810 | text = text.strip() 811 | if text: 812 | if prettyPrint: 813 | s.append(" " * (indentLevel-1)) 814 | s.append(text) 815 | if prettyPrint: 816 | s.append("\n") 817 | return ''.join(s) 818 | 819 | #Soup methods 820 | 821 | def find(self, name=None, attrs={}, recursive=True, text=None, 822 | **kwargs): 823 | """Return only the first child of this Tag matching the given 824 | criteria.""" 825 | r = None 826 | l = self.findAll(name, attrs, recursive, text, 1, **kwargs) 827 | if l: 828 | r = l[0] 829 | return r 830 | findChild = find 831 | 832 | def findAll(self, name=None, attrs={}, recursive=True, text=None, 833 | limit=None, **kwargs): 834 | """Extracts a list of Tag objects that match the given 835 | criteria. 
You can specify the name of the Tag and any 836 | attributes you want the Tag to have. 837 | 838 | The value of a key-value pair in the 'attrs' map can be a 839 | string, a list of strings, a regular expression object, or a 840 | callable that takes a string and returns whether or not the 841 | string matches for some custom definition of 'matches'. The 842 | same is true of the tag name.""" 843 | generator = self.recursiveChildGenerator 844 | if not recursive: 845 | generator = self.childGenerator 846 | return self._findAll(name, attrs, text, limit, generator, **kwargs) 847 | findChildren = findAll 848 | 849 | # Pre-3.x compatibility methods 850 | first = find 851 | fetch = findAll 852 | 853 | def fetchText(self, text=None, recursive=True, limit=None): 854 | return self.findAll(text=text, recursive=recursive, limit=limit) 855 | 856 | def firstText(self, text=None, recursive=True): 857 | return self.find(text=text, recursive=recursive) 858 | 859 | #Private methods 860 | 861 | def _getAttrMap(self): 862 | """Initializes a map representation of this tag's attributes, 863 | if not already initialized.""" 864 | if not getattr(self, 'attrMap'): 865 | self.attrMap = {} 866 | for (key, value) in self.attrs: 867 | self.attrMap[key] = value 868 | return self.attrMap 869 | 870 | #Generator methods 871 | def childGenerator(self): 872 | # Just use the iterator from the contents 873 | return iter(self.contents) 874 | 875 | def recursiveChildGenerator(self): 876 | if not len(self.contents): 877 | raise StopIteration 878 | stopNode = self._lastRecursiveChild().next 879 | current = self.contents[0] 880 | while current is not stopNode: 881 | yield current 882 | current = current.next 883 | 884 | 885 | # Next, a couple classes to represent queries and their results. 
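# A usage sketch of the search API defined above, ahead of the query
# classes that implement it (document and URL are invented for the
# example; run it with this module on the import path):
#
#   import re
#   from BeautifulSoup import BeautifulSoup, SoupStrainer
#
#   doc = '<div><a href="http://a.invalid/1">one</a><a name="x">two</a></div>'
#   soup = BeautifulSoup(doc)
#   # Keyword arguments become attribute matchers; a value may be a string,
#   # a regexp, a callable, a list, or True (meaning "has this attribute").
#   print soup.findAll('a', href=re.compile('^http:'))
#   print soup.find('a', attrs={'name': 'x'})  # 'name' must go via attrs
#   # A SoupStrainer passed as parseOnlyThese keeps only matching tags in
#   # the tree, which saves memory on large documents.
#   links = BeautifulSoup(doc, parseOnlyThese=SoupStrainer('a'))
#   print [a['href'] for a in links.findAll('a', href=True)]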
886 | class SoupStrainer: 887 | """Encapsulates a number of ways of matching a markup element (tag or 888 | text).""" 889 | 890 | def __init__(self, name=None, attrs={}, text=None, **kwargs): 891 | self.name = name 892 | if isinstance(attrs, basestring): 893 | kwargs['class'] = _match_css_class(attrs) 894 | attrs = None 895 | if kwargs: 896 | if attrs: 897 | attrs = attrs.copy() 898 | attrs.update(kwargs) 899 | else: 900 | attrs = kwargs 901 | self.attrs = attrs 902 | self.text = text 903 | 904 | def __str__(self): 905 | if self.text: 906 | return self.text 907 | else: 908 | return "%s|%s" % (self.name, self.attrs) 909 | 910 | def searchTag(self, markupName=None, markupAttrs={}): 911 | found = None 912 | markup = None 913 | if isinstance(markupName, Tag): 914 | markup = markupName 915 | markupAttrs = markup 916 | callFunctionWithTagData = callable(self.name) \ 917 | and not isinstance(markupName, Tag) 918 | 919 | if (not self.name) \ 920 | or callFunctionWithTagData \ 921 | or (markup and self._matches(markup, self.name)) \ 922 | or (not markup and self._matches(markupName, self.name)): 923 | if callFunctionWithTagData: 924 | match = self.name(markupName, markupAttrs) 925 | else: 926 | match = True 927 | markupAttrMap = None 928 | for attr, matchAgainst in self.attrs.items(): 929 | if not markupAttrMap: 930 | if hasattr(markupAttrs, 'get'): 931 | markupAttrMap = markupAttrs 932 | else: 933 | markupAttrMap = {} 934 | for k,v in markupAttrs: 935 | markupAttrMap[k] = v 936 | attrValue = markupAttrMap.get(attr) 937 | if not self._matches(attrValue, matchAgainst): 938 | match = False 939 | break 940 | if match: 941 | if markup: 942 | found = markup 943 | else: 944 | found = markupName 945 | return found 946 | 947 | def search(self, markup): 948 | #print 'looking for %s in %s' % (self, markup) 949 | found = None 950 | # If given a list of items, scan it for a text element that 951 | # matches. 952 | if hasattr(markup, "__iter__") \ 953 | and not isinstance(markup, Tag): 954 | for element in markup: 955 | if isinstance(element, NavigableString) \ 956 | and self.search(element): 957 | found = element 958 | break 959 | # If it's a Tag, make sure its name or attributes match. 960 | # Don't bother with Tags if we're searching for text. 961 | elif isinstance(markup, Tag): 962 | if not self.text: 963 | found = self.searchTag(markup) 964 | # If it's text, make sure the text matches. 965 | elif isinstance(markup, NavigableString) or \ 966 | isinstance(markup, basestring): 967 | if self._matches(markup, self.text): 968 | found = markup 969 | else: 970 | raise Exception, "I don't know how to match against a %s" \ 971 | % markup.__class__ 972 | return found 973 | 974 | def _matches(self, markup, matchAgainst): 975 | #print "Matching %s against %s" % (markup, matchAgainst) 976 | result = False 977 | if matchAgainst is True: 978 | result = markup is not None 979 | elif callable(matchAgainst): 980 | result = matchAgainst(markup) 981 | else: 982 | #Custom match methods take the tag as an argument, but all 983 | #other ways of matching match the tag name as a string. 984 | if isinstance(markup, Tag): 985 | markup = markup.name 986 | if markup and not isinstance(markup, basestring): 987 | markup = unicode(markup) 988 | #Now we know that chunk is either a string, or None. 989 | if hasattr(matchAgainst, 'match'): 990 | # It's a regexp object. 
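# (search(), not match(): the regexp may hit anywhere in the tag
# name or attribute value being tested.)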
991 |                 result = markup and matchAgainst.search(markup)
992 |             elif hasattr(matchAgainst, '__iter__'): # list-like
993 |                 result = markup in matchAgainst
994 |             elif hasattr(matchAgainst, 'items'):
995 |                 result = markup.has_key(matchAgainst)
996 |             elif matchAgainst and isinstance(markup, basestring):
997 |                 if isinstance(markup, unicode):
998 |                     matchAgainst = unicode(matchAgainst)
999 |                 else:
1000 |                     matchAgainst = str(matchAgainst)
1001 | 
1002 |             if not result:
1003 |                 result = matchAgainst == markup
1004 |         return result
1005 | 
1006 | class ResultSet(list):
1007 |     """A ResultSet is just a list that keeps track of the SoupStrainer
1008 |     that created it."""
1009 |     def __init__(self, source):
1010 |         list.__init__([])
1011 |         self.source = source
1012 | 
1013 | # Now, some helper functions.
1014 | 
1015 | def buildTagMap(default, *args):
1016 |     """Turns a list of maps, lists, or scalars into a single map.
1017 |     Used to build the SELF_CLOSING_TAGS, NESTABLE_TAGS, and
1018 |     NESTING_RESET_TAGS maps out of lists and partial maps."""
1019 |     built = {}
1020 |     for portion in args:
1021 |         if hasattr(portion, 'items'):
1022 |             #It's a map. Merge it.
1023 |             for k,v in portion.items():
1024 |                 built[k] = v
1025 |         elif hasattr(portion, '__iter__'): # is a list
1026 |             #It's a list. Map each item to the default.
1027 |             for k in portion:
1028 |                 built[k] = default
1029 |         else:
1030 |             #It's a scalar. Map it to the default.
1031 |             built[portion] = default
1032 |     return built
1033 | 
1034 | # Now, the parser classes.
1035 | 
1036 | class BeautifulStoneSoup(Tag, SGMLParser):
1037 | 
1038 |     """This class contains the basic parser and search code. It defines
1039 |     a parser that knows nothing about tag behavior except for the
1040 |     following:
1041 | 
1042 |     You can't close a tag without closing all the tags it encloses.
1043 |     That is, "<foo><bar></foo>" actually means
1044 |     "<foo><bar></bar></foo>".
1045 | 
1046 |     [Another possible explanation is "<foo><bar /></foo>", but since
1047 |     this class defines no SELF_CLOSING_TAGS, it will never use that
1048 |     explanation.]
1049 | 
1050 |     This class is useful for parsing XML or made-up markup languages,
1051 |     or when BeautifulSoup makes an assumption counter to what you were
1052 |     expecting."""
1053 | 
1054 |     SELF_CLOSING_TAGS = {}
1055 |     NESTABLE_TAGS = {}
1056 |     RESET_NESTING_TAGS = {}
1057 |     QUOTE_TAGS = {}
1058 |     PRESERVE_WHITESPACE_TAGS = []
1059 | 
1060 |     MARKUP_MASSAGE = [(re.compile('(<[^<>]*)/>'),
1061 |                        lambda x: x.group(1) + ' />'),
1062 |                       (re.compile('<!\s+([^<>]*)>'),
1063 |                        lambda x: '<!' + x.group(1) + '>')
1064 |                       ]
1065 | 
1066 |     ROOT_TAG_NAME = u'[document]'
1067 | 
1068 |     HTML_ENTITIES = "html"
1069 |     XML_ENTITIES = "xml"
1070 |     XHTML_ENTITIES = "xhtml"
1071 |     # TODO: This only exists for backwards-compatibility
1072 |     ALL_ENTITIES = XHTML_ENTITIES
1073 | 
1074 |     # Used when determining whether a text node is all whitespace and
1075 |     # can be replaced with a single space. A text node that contains
1076 |     # fancy Unicode spaces (usually non-breaking) should be left
1077 |     # alone.
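# Parsing XML with the class above (markup invented for the example):
#
#   >>> from BeautifulSoup import BeautifulStoneSoup
#   >>> soup = BeautifulStoneSoup('<book><title lang="en">Soup</title></book>')
#   >>> soup.book.title.string, soup.book.title['lang']
#   (u'Soup', u'en')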
1078 | STRIP_ASCII_SPACES = { 9: None, 10: None, 12: None, 13: None, 32: None, } 1079 | 1080 | def __init__(self, markup="", parseOnlyThese=None, fromEncoding=None, 1081 | markupMassage=True, smartQuotesTo=XML_ENTITIES, 1082 | convertEntities=None, selfClosingTags=None, isHTML=False): 1083 | """The Soup object is initialized as the 'root tag', and the 1084 | provided markup (which can be a string or a file-like object) 1085 | is fed into the underlying parser. 1086 | 1087 | sgmllib will process most bad HTML, and the BeautifulSoup 1088 | class has some tricks for dealing with some HTML that kills 1089 | sgmllib, but Beautiful Soup can nonetheless choke or lose data 1090 | if your data uses self-closing tags or declarations 1091 | incorrectly. 1092 | 1093 | By default, Beautiful Soup uses regexes to sanitize input, 1094 | avoiding the vast majority of these problems. If the problems 1095 | don't apply to you, pass in False for markupMassage, and 1096 | you'll get better performance. 1097 | 1098 | The default parser massage techniques fix the two most common 1099 | instances of invalid HTML that choke sgmllib: 1100 | 1101 |
(No space between name of closing tag and tag close) 1102 | (Extraneous whitespace in declaration) 1103 | 1104 | You can pass in a custom list of (RE object, replace method) 1105 | tuples to get Beautiful Soup to scrub your input the way you 1106 | want.""" 1107 | 1108 | self.parseOnlyThese = parseOnlyThese 1109 | self.fromEncoding = fromEncoding 1110 | self.smartQuotesTo = smartQuotesTo 1111 | self.convertEntities = convertEntities 1112 | # Set the rules for how we'll deal with the entities we 1113 | # encounter 1114 | if self.convertEntities: 1115 | # It doesn't make sense to convert encoded characters to 1116 | # entities even while you're converting entities to Unicode. 1117 | # Just convert it all to Unicode. 1118 | self.smartQuotesTo = None 1119 | if convertEntities == self.HTML_ENTITIES: 1120 | self.convertXMLEntities = False 1121 | self.convertHTMLEntities = True 1122 | self.escapeUnrecognizedEntities = True 1123 | elif convertEntities == self.XHTML_ENTITIES: 1124 | self.convertXMLEntities = True 1125 | self.convertHTMLEntities = True 1126 | self.escapeUnrecognizedEntities = False 1127 | elif convertEntities == self.XML_ENTITIES: 1128 | self.convertXMLEntities = True 1129 | self.convertHTMLEntities = False 1130 | self.escapeUnrecognizedEntities = False 1131 | else: 1132 | self.convertXMLEntities = False 1133 | self.convertHTMLEntities = False 1134 | self.escapeUnrecognizedEntities = False 1135 | 1136 | self.instanceSelfClosingTags = buildTagMap(None, selfClosingTags) 1137 | SGMLParser.__init__(self) 1138 | 1139 | if hasattr(markup, 'read'): # It's a file-type object. 1140 | markup = markup.read() 1141 | self.markup = markup 1142 | self.markupMassage = markupMassage 1143 | try: 1144 | self._feed(isHTML=isHTML) 1145 | except StopParsing: 1146 | pass 1147 | self.markup = None # The markup can now be GCed 1148 | 1149 | def convert_charref(self, name): 1150 | """This method fixes a bug in Python's SGMLParser.""" 1151 | try: 1152 | n = int(name) 1153 | except ValueError: 1154 | return 1155 | if not 0 <= n <= 127 : # ASCII ends at 127, not 255 1156 | return 1157 | return self.convert_codepoint(n) 1158 | 1159 | def _feed(self, inDocumentEncoding=None, isHTML=False): 1160 | # Convert the document to Unicode. 1161 | markup = self.markup 1162 | if isinstance(markup, unicode): 1163 | if not hasattr(self, 'originalEncoding'): 1164 | self.originalEncoding = None 1165 | else: 1166 | dammit = UnicodeDammit\ 1167 | (markup, [self.fromEncoding, inDocumentEncoding], 1168 | smartQuotesTo=self.smartQuotesTo, isHTML=isHTML) 1169 | markup = dammit.unicode 1170 | self.originalEncoding = dammit.originalEncoding 1171 | self.declaredHTMLEncoding = dammit.declaredHTMLEncoding 1172 | if markup: 1173 | if self.markupMassage: 1174 | if not hasattr(self.markupMassage, "__iter__"): 1175 | self.markupMassage = self.MARKUP_MASSAGE 1176 | for fix, m in self.markupMassage: 1177 | markup = fix.sub(m, markup) 1178 | # TODO: We get rid of markupMassage so that the 1179 | # soup object can be deepcopied later on. Some 1180 | # Python installations can't copy regexes. If anyone 1181 | # was relying on the existence of markupMassage, this 1182 | # might cause problems. 1183 | del(self.markupMassage) 1184 | self.reset() 1185 | 1186 | SGMLParser.feed(self, markup) 1187 | # Close out any unfinished strings and close all the open tags. 
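# (A fragment like '<b>text' leaves 'text' buffered in currentData and
# 'b' on the tagStack; the two lines below flush and pop both.)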
1188 | self.endData() 1189 | while self.currentTag.name != self.ROOT_TAG_NAME: 1190 | self.popTag() 1191 | 1192 | def __getattr__(self, methodName): 1193 | """This method routes method call requests to either the SGMLParser 1194 | superclass or the Tag superclass, depending on the method name.""" 1195 | #print "__getattr__ called on %s.%s" % (self.__class__, methodName) 1196 | 1197 | if methodName.startswith('start_') or methodName.startswith('end_') \ 1198 | or methodName.startswith('do_'): 1199 | return SGMLParser.__getattr__(self, methodName) 1200 | elif not methodName.startswith('__'): 1201 | return Tag.__getattr__(self, methodName) 1202 | else: 1203 | raise AttributeError 1204 | 1205 | def isSelfClosingTag(self, name): 1206 | """Returns true iff the given string is the name of a 1207 | self-closing tag according to this parser.""" 1208 | return self.SELF_CLOSING_TAGS.has_key(name) \ 1209 | or self.instanceSelfClosingTags.has_key(name) 1210 | 1211 | def reset(self): 1212 | Tag.__init__(self, self, self.ROOT_TAG_NAME) 1213 | self.hidden = 1 1214 | SGMLParser.reset(self) 1215 | self.currentData = [] 1216 | self.currentTag = None 1217 | self.tagStack = [] 1218 | self.quoteStack = [] 1219 | self.pushTag(self) 1220 | 1221 | def popTag(self): 1222 | tag = self.tagStack.pop() 1223 | 1224 | #print "Pop", tag.name 1225 | if self.tagStack: 1226 | self.currentTag = self.tagStack[-1] 1227 | return self.currentTag 1228 | 1229 | def pushTag(self, tag): 1230 | #print "Push", tag.name 1231 | if self.currentTag: 1232 | self.currentTag.contents.append(tag) 1233 | self.tagStack.append(tag) 1234 | self.currentTag = self.tagStack[-1] 1235 | 1236 | def endData(self, containerClass=NavigableString): 1237 | if self.currentData: 1238 | currentData = u''.join(self.currentData) 1239 | if (currentData.translate(self.STRIP_ASCII_SPACES) == '' and 1240 | not set([tag.name for tag in self.tagStack]).intersection( 1241 | self.PRESERVE_WHITESPACE_TAGS)): 1242 | if '\n' in currentData: 1243 | currentData = '\n' 1244 | else: 1245 | currentData = ' ' 1246 | self.currentData = [] 1247 | if self.parseOnlyThese and len(self.tagStack) <= 1 and \ 1248 | (not self.parseOnlyThese.text or \ 1249 | not self.parseOnlyThese.search(currentData)): 1250 | return 1251 | o = containerClass(currentData) 1252 | o.setup(self.currentTag, self.previous) 1253 | if self.previous: 1254 | self.previous.next = o 1255 | self.previous = o 1256 | self.currentTag.contents.append(o) 1257 | 1258 | 1259 | def _popToTag(self, name, inclusivePop=True): 1260 | """Pops the tag stack up to and including the most recent 1261 | instance of the given tag. 
If inclusivePop is false, pops the tag
1262 |         stack up to but *not* including the most recent instance of
1263 |         the given tag."""
1264 |         #print "Popping to %s" % name
1265 |         if name == self.ROOT_TAG_NAME:
1266 |             return
1267 | 
1268 |         numPops = 0
1269 |         mostRecentTag = None
1270 |         for i in range(len(self.tagStack)-1, 0, -1):
1271 |             if name == self.tagStack[i].name:
1272 |                 numPops = len(self.tagStack)-i
1273 |                 break
1274 |         if not inclusivePop:
1275 |             numPops = numPops - 1
1276 | 
1277 |         for i in range(0, numPops):
1278 |             mostRecentTag = self.popTag()
1279 |         return mostRecentTag
1280 | 
1281 |     def _smartPop(self, name):
1282 | 
1283 |         """We need to pop up to the previous tag of this type, unless
1284 |         one of this tag's nesting reset triggers comes between this
1285 |         tag and the previous tag of this type, OR unless this tag is a
1286 |         generic nesting trigger and another generic nesting trigger
1287 |         comes between this tag and the previous tag of this type.
1288 | 
1289 |         Examples:
1290 |          <p>Foo<b>Bar *<p>* should pop to 'p', not 'b'.
1291 |          <p>Foo<table>Bar *<p>* should pop to 'table', not 'p'.
1292 |          <p>Foo<table><tr>Bar *<p>* should pop to 'tr', not 'p'.
1293 | 
1294 |          <li><ul><li> *<li>* should pop to 'ul', not the first 'li'.
1295 |          <tr><table><tr>
* * should pop to 'table', not the first 'tr' 1296 | * * should pop to 'tr', not the first 'td' 1297 | """ 1298 | 1299 | nestingResetTriggers = self.NESTABLE_TAGS.get(name) 1300 | isNestable = nestingResetTriggers != None 1301 | isResetNesting = self.RESET_NESTING_TAGS.has_key(name) 1302 | popTo = None 1303 | inclusive = True 1304 | for i in range(len(self.tagStack)-1, 0, -1): 1305 | p = self.tagStack[i] 1306 | if (not p or p.name == name) and not isNestable: 1307 | #Non-nestable tags get popped to the top or to their 1308 | #last occurance. 1309 | popTo = name 1310 | break 1311 | if (nestingResetTriggers is not None 1312 | and p.name in nestingResetTriggers) \ 1313 | or (nestingResetTriggers is None and isResetNesting 1314 | and self.RESET_NESTING_TAGS.has_key(p.name)): 1315 | 1316 | #If we encounter one of the nesting reset triggers 1317 | #peculiar to this tag, or we encounter another tag 1318 | #that causes nesting to reset, pop up to but not 1319 | #including that tag. 1320 | popTo = p.name 1321 | inclusive = False 1322 | break 1323 | p = p.parent 1324 | if popTo: 1325 | self._popToTag(popTo, inclusive) 1326 | 1327 | def unknown_starttag(self, name, attrs, selfClosing=0): 1328 | #print "Start tag %s: %s" % (name, attrs) 1329 | if self.quoteStack: 1330 | #This is not a real tag. 1331 | #print "<%s> is not real!" % name 1332 | attrs = ''.join([' %s="%s"' % (x, y) for x, y in attrs]) 1333 | self.handle_data('<%s%s>' % (name, attrs)) 1334 | return 1335 | self.endData() 1336 | 1337 | if not self.isSelfClosingTag(name) and not selfClosing: 1338 | self._smartPop(name) 1339 | 1340 | if self.parseOnlyThese and len(self.tagStack) <= 1 \ 1341 | and (self.parseOnlyThese.text or not self.parseOnlyThese.searchTag(name, attrs)): 1342 | return 1343 | 1344 | tag = Tag(self, name, attrs, self.currentTag, self.previous) 1345 | if self.previous: 1346 | self.previous.next = tag 1347 | self.previous = tag 1348 | self.pushTag(tag) 1349 | if selfClosing or self.isSelfClosingTag(name): 1350 | self.popTag() 1351 | if name in self.QUOTE_TAGS: 1352 | #print "Beginning quote (%s)" % name 1353 | self.quoteStack.append(name) 1354 | self.literal = 1 1355 | return tag 1356 | 1357 | def unknown_endtag(self, name): 1358 | #print "End tag %s" % name 1359 | if self.quoteStack and self.quoteStack[-1] != name: 1360 | #This is not a real end tag. 1361 | #print "%s> is not real!" % name 1362 | self.handle_data('%s>' % name) 1363 | return 1364 | self.endData() 1365 | self._popToTag(name) 1366 | if self.quoteStack and self.quoteStack[-1] == name: 1367 | self.quoteStack.pop() 1368 | self.literal = (len(self.quoteStack) > 0) 1369 | 1370 | def handle_data(self, data): 1371 | self.currentData.append(data) 1372 | 1373 | def _toStringSubclass(self, text, subclass): 1374 | """Adds a certain piece of text to the tree as a NavigableString 1375 | subclass.""" 1376 | self.endData() 1377 | self.handle_data(text) 1378 | self.endData(subclass) 1379 | 1380 | def handle_pi(self, text): 1381 | """Handle a processing instruction as a ProcessingInstruction 1382 | object, possibly one with a %SOUP-ENCODING% slot into which an 1383 | encoding will be plugged later.""" 1384 | if text[:3] == "xml": 1385 | text = u"xml version='1.0' encoding='%SOUP-ENCODING%'" 1386 | self._toStringSubclass(text, ProcessingInstruction) 1387 | 1388 | def handle_comment(self, text): 1389 | "Handle comments as Comment objects." 1390 | self._toStringSubclass(text, Comment) 1391 | 1392 | def handle_charref(self, ref): 1393 | "Handle character references as data." 
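# Effect of the convertEntities modes on character and entity references
# (input strings invented for the example):
#
#   >>> from BeautifulSoup import BeautifulStoneSoup as BSS
#   >>> str(BSS('&#38; &amp;', convertEntities=BSS.HTML_ENTITIES))
#   '& &'
#   >>> str(BSS('&#38; &amp;'))    # default: references pass through as data
#   '&#38; &amp;'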
1394 |         if self.convertEntities:
1395 |             data = unichr(int(ref))
1396 |         else:
1397 |             data = '&#%s;' % ref
1398 |         self.handle_data(data)
1399 | 
1400 |     def handle_entityref(self, ref):
1401 |         """Handle entity references as data, possibly converting known
1402 |         HTML and/or XML entity references to the corresponding Unicode
1403 |         characters."""
1404 |         data = None
1405 |         if self.convertHTMLEntities:
1406 |             try:
1407 |                 data = unichr(name2codepoint[ref])
1408 |             except KeyError:
1409 |                 pass
1410 | 
1411 |         if not data and self.convertXMLEntities:
1412 |             data = self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref)
1413 | 
1414 |         if not data and self.convertHTMLEntities and \
1415 |             not self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref):
1416 |                 # TODO: We've got a problem here. We're told this is
1417 |                 # an entity reference, but it's not an XML entity
1418 |                 # reference or an HTML entity reference. Nonetheless,
1419 |                 # the logical thing to do is to pass it through as an
1420 |                 # unrecognized entity reference.
1421 |                 #
1422 |                 # Except: when the input is "&carol;" this function
1423 |                 # will be called with input "carol". When the input is
1424 |                 # "AT&T", this function will be called with input
1425 |                 # "T". We have no way of knowing whether a semicolon
1426 |                 # was present originally, so we don't know whether
1427 |                 # this is an unknown entity or just a misplaced
1428 |                 # ampersand.
1429 |                 #
1430 |                 # The more common case is a misplaced ampersand, so I
1431 |                 # escape the ampersand and omit the trailing semicolon.
1432 |                 data = "&amp;%s" % ref
1433 |         if not data:
1434 |             # This case is different from the one above, because we
1435 |             # haven't already gone through a supposedly comprehensive
1436 |             # mapping of entities to Unicode characters. We might not
1437 |             # have gone through any mapping at all. So the chances are
1438 |             # very high that this is a real entity, and not a
1439 |             # misplaced ampersand.
1440 |             data = "&%s;" % ref
1441 |         self.handle_data(data)
1442 | 
1443 |     def handle_decl(self, data):
1444 |         "Handle DOCTYPEs and the like as Declaration objects."
1445 |         self._toStringSubclass(data, Declaration)
1446 | 
1447 |     def parse_declaration(self, i):
1448 |         """Treat a bogus SGML declaration as raw data. Treat a CDATA
1449 |         declaration as a CData object."""
1450 |         j = None
1451 |         if self.rawdata[i:i+9] == '<![CDATA[':
1452 |             k = self.rawdata.find(']]>', i)
1453 |             if k == -1:
1454 |                 k = len(self.rawdata)
1455 |             data = self.rawdata[i+9:k]
1456 |             j = k+3
1457 |             self._toStringSubclass(data, CData)
1458 |         else:
1459 |             try:
1460 |                 j = SGMLParser.parse_declaration(self, i)
1461 |             except SGMLParseError:
1462 |                 toHandle = self.rawdata[i:]
1463 |                 self.handle_data(toHandle)
1464 |                 j = i + len(toHandle)
1465 |         return j
1466 | 
1467 | class BeautifulSoup(BeautifulStoneSoup):
1468 | 
1469 |     """This parser knows the following facts about HTML:
1470 | 
1471 |     * Some tags have no closing tag and should be interpreted as being
1472 |       closed as soon as they are encountered.
1473 | 
1474 |     * The text inside some tags (ie. 'script') may contain tags which
1475 |       are not really part of the document and which should be parsed
1476 |       as text, not tags. If you want to parse the text as tags, you can
1477 |       always fetch it and parse it explicitly.
1478 | 
1479 |     * Tag nesting rules:
1480 | 
1481 |       Most tags can't be nested at all. For instance, the occurance of
1482 |       a <p> tag should implicitly close the previous <p> tag.
1483 | 
1484 |        <p>Para1<p>Para2
1485 |         should be transformed into:
1486 |        <p>Para1</p><p>Para2
1487 | 
1488 |       Some tags can be nested arbitrarily. For instance, the occurance
1489 |       of a <blockquote> tag should _not_ implicitly close the previous
1490 |       <blockquote> tag.
1491 | 
1492 |        Alice said: <blockquote>Bob said: <blockquote>Blah
1493 |         should NOT be transformed into:
1494 |        Alice said: <blockquote>Bob said: </blockquote><blockquote>Blah
1495 | 
1496 |       Some tags can be nested, but the nesting is reset by the
1497 |       interposition of other tags. For instance, a <tr> tag should
1498 |       implicitly close the previous <tr> tag within the same <table>,
1499 |       but not close a <tr> tag in another table.
1500 | 
1501 |        <table><tr>Blah<tr>Blah
1502 |         should be transformed into:
1503 |        <table><tr>Blah</tr><tr>Blah
1504 |        but,
1505 |        <tr>Blah<table><tr>Blah
1506 |         should NOT be transformed into
1507 |        <tr>Blah<table></tr><tr>
Blah 1508 | 1509 | Differing assumptions about tag nesting rules are a major source 1510 | of problems with the BeautifulSoup class. If BeautifulSoup is not 1511 | treating as nestable a tag your page author treats as nestable, 1512 | try ICantBelieveItsBeautifulSoup, MinimalSoup, or 1513 | BeautifulStoneSoup before writing your own subclass.""" 1514 | 1515 | def __init__(self, *args, **kwargs): 1516 | if not kwargs.has_key('smartQuotesTo'): 1517 | kwargs['smartQuotesTo'] = self.HTML_ENTITIES 1518 | kwargs['isHTML'] = True 1519 | BeautifulStoneSoup.__init__(self, *args, **kwargs) 1520 | 1521 | SELF_CLOSING_TAGS = buildTagMap(None, 1522 | ('br' , 'hr', 'input', 'img', 'meta', 1523 | 'spacer', 'link', 'frame', 'base', 'col')) 1524 | 1525 | PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea']) 1526 | 1527 | QUOTE_TAGS = {'script' : None, 'textarea' : None} 1528 | 1529 | #According to the HTML standard, each of these inline tags can 1530 | #contain another tag of the same type. Furthermore, it's common 1531 | #to actually use these tags this way. 1532 | NESTABLE_INLINE_TAGS = ('span', 'font', 'q', 'object', 'bdo', 'sub', 'sup', 1533 | 'center') 1534 | 1535 | #According to the HTML standard, these block tags can contain 1536 | #another tag of the same type. Furthermore, it's common 1537 | #to actually use these tags this way. 1538 | NESTABLE_BLOCK_TAGS = ('blockquote', 'div', 'fieldset', 'ins', 'del') 1539 | 1540 | #Lists can contain other lists, but there are restrictions. 1541 | NESTABLE_LIST_TAGS = { 'ol' : [], 1542 | 'ul' : [], 1543 | 'li' : ['ul', 'ol'], 1544 | 'dl' : [], 1545 | 'dd' : ['dl'], 1546 | 'dt' : ['dl'] } 1547 | 1548 | #Tables can contain other tables, but there are restrictions. 1549 | NESTABLE_TABLE_TAGS = {'table' : [], 1550 | 'tr' : ['table', 'tbody', 'tfoot', 'thead'], 1551 | 'td' : ['tr'], 1552 | 'th' : ['tr'], 1553 | 'thead' : ['table'], 1554 | 'tbody' : ['table'], 1555 | 'tfoot' : ['table'], 1556 | } 1557 | 1558 | NON_NESTABLE_BLOCK_TAGS = ('address', 'form', 'p', 'pre') 1559 | 1560 | #If one of these tags is encountered, all tags up to the next tag of 1561 | #this type are popped. 1562 | RESET_NESTING_TAGS = buildTagMap(None, NESTABLE_BLOCK_TAGS, 'noscript', 1563 | NON_NESTABLE_BLOCK_TAGS, 1564 | NESTABLE_LIST_TAGS, 1565 | NESTABLE_TABLE_TAGS) 1566 | 1567 | NESTABLE_TAGS = buildTagMap([], NESTABLE_INLINE_TAGS, NESTABLE_BLOCK_TAGS, 1568 | NESTABLE_LIST_TAGS, NESTABLE_TABLE_TAGS) 1569 | 1570 | # Used to detect the charset in a META tag; see start_meta 1571 | CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M) 1572 | 1573 | def start_meta(self, attrs): 1574 | """Beautiful Soup can detect a charset included in a META tag, 1575 | try to convert the document to that charset, and re-parse the 1576 | document from the beginning.""" 1577 | httpEquiv = None 1578 | contentType = None 1579 | contentTypeIndex = None 1580 | tagNeedsEncodingSubstitution = False 1581 | 1582 | for i in range(0, len(attrs)): 1583 | key, value = attrs[i] 1584 | key = key.lower() 1585 | if key == 'http-equiv': 1586 | httpEquiv = value 1587 | elif key == 'content': 1588 | contentType = value 1589 | contentTypeIndex = i 1590 | 1591 | if httpEquiv and contentType: # It's an interesting meta tag. 
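# (e.g. <meta http-equiv="Content-Type" content="text/html; charset=utf-8">;
# on a first pass the declared charset triggers a re-parse of the whole
# document via _feed, which is then cut short by StopParsing.)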
1592 | match = self.CHARSET_RE.search(contentType) 1593 | if match: 1594 | if (self.declaredHTMLEncoding is not None or 1595 | self.originalEncoding == self.fromEncoding): 1596 | # An HTML encoding was sniffed while converting 1597 | # the document to Unicode, or an HTML encoding was 1598 | # sniffed during a previous pass through the 1599 | # document, or an encoding was specified 1600 | # explicitly and it worked. Rewrite the meta tag. 1601 | def rewrite(match): 1602 | return match.group(1) + "%SOUP-ENCODING%" 1603 | newAttr = self.CHARSET_RE.sub(rewrite, contentType) 1604 | attrs[contentTypeIndex] = (attrs[contentTypeIndex][0], 1605 | newAttr) 1606 | tagNeedsEncodingSubstitution = True 1607 | else: 1608 | # This is our first pass through the document. 1609 | # Go through it again with the encoding information. 1610 | newCharset = match.group(3) 1611 | if newCharset and newCharset != self.originalEncoding: 1612 | self.declaredHTMLEncoding = newCharset 1613 | self._feed(self.declaredHTMLEncoding) 1614 | raise StopParsing 1615 | pass 1616 | tag = self.unknown_starttag("meta", attrs) 1617 | if tag and tagNeedsEncodingSubstitution: 1618 | tag.containsSubstitutions = True 1619 | 1620 | class StopParsing(Exception): 1621 | pass 1622 | 1623 | class ICantBelieveItsBeautifulSoup(BeautifulSoup): 1624 | 1625 | """The BeautifulSoup class is oriented towards skipping over 1626 | common HTML errors like unclosed tags. However, sometimes it makes 1627 | errors of its own. For instance, consider this fragment: 1628 | 1629 | FooBar 1630 | 1631 | This is perfectly valid (if bizarre) HTML. However, the 1632 | BeautifulSoup class will implicitly close the first b tag when it 1633 | encounters the second 'b'. It will think the author wrote 1634 | "FooBar", and didn't close the first 'b' tag, because 1635 | there's no real-world reason to bold something that's already 1636 | bold. When it encounters '' it will close two more 'b' 1637 | tags, for a grand total of three tags closed instead of two. This 1638 | can throw off the rest of your document structure. The same is 1639 | true of a number of other tags, listed below. 1640 | 1641 | It's much more common for someone to forget to close a 'b' tag 1642 | than to actually use nested 'b' tags, and the BeautifulSoup class 1643 | handles the common case. This class handles the not-co-common 1644 | case: where you can't believe someone wrote what they did, but 1645 | it's valid HTML and BeautifulSoup screwed up by assuming it 1646 | wouldn't be.""" 1647 | 1648 | I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = \ 1649 | ('em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong', 1650 | 'cite', 'code', 'dfn', 'kbd', 'samp', 'strong', 'var', 'b', 1651 | 'big') 1652 | 1653 | I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = ('noscript',) 1654 | 1655 | NESTABLE_TAGS = buildTagMap([], BeautifulSoup.NESTABLE_TAGS, 1656 | I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS, 1657 | I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS) 1658 | 1659 | class MinimalSoup(BeautifulSoup): 1660 | """The MinimalSoup class is for parsing HTML that contains 1661 | pathologically bad markup. It makes no assumptions about tag 1662 | nesting, but it does know which tags are self-closing, that 1663 |