├── .gitignore
├── AUTHORS
├── HISTORICAL
├── LICENSE
├── MANIFEST.in
├── README.md
├── README.rst
├── bin
    ├── query
    └── readability
├── boilerpipy
    ├── __init__.py
    ├── common.py
    ├── compat.py
    ├── error.py
    └── expressions.py
└── setup.py


/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | *~
3 | build
4 | boilerpipy.egg-info
5 | dist


--------------------------------------------------------------------------------
/AUTHORS:
--------------------------------------------------------------------------------
1 | List of Authors:
2 | =================
3 | 1. Sharmila Gopirajan - sharmila.gopirajan@gmail.com (Original Author)
4 | 2. Harshavardhana - harsha@harshavardhana.net (Current Maintainer/Author)
5 | 


--------------------------------------------------------------------------------
/HISTORICAL:
--------------------------------------------------------------------------------
 1 | Hi Sharmila,
 2 | 
 3 | I am reaching out to ask about moving the maintainership of your readability
 4 | project to a newly revamped code base on github.com, written by me.
 5 | 
 6 | The project i am referring to is --> https://code.google.com/p/decruft/
 7 | 
 8 | I am listing you as the Original Author in the AUTHORS file and README.md,
 9 | but I request your consent to move forward with maintaining this project
10 | in the long run and to change the copyright to suit my own needs.
11 | 
12 | I have substantially improved upon your project: it now supports the new
13 | readability.js codebase and has been completely restructured. I am also
14 | adding new techniques to improve the accuracy of readability extraction
15 | across various corpora of websites.
16 | 
17 | This repo will also incorporate changes from the boilerpipe project -
18 | https://code.google.com/p/boilerpipe/ which is written in Java.
19 | I am planning to port the whole project to Python, which will
20 | succeed your work.
21 | 
22 | Let me know if this is acceptable for ownership change and also
23 | Copyright change, feel free to visit
24 | https://github.com/harshavardhana/boilerpipy and poke around.
25 | 
26 | Thank you :-)
27 | 
28 | -Harsha


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 | 
  2 |                                  Apache License
  3 |                            Version 2.0, January 2004
  4 |                         http://www.apache.org/licenses/
  5 | 
  6 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  7 | 
  8 |    1. Definitions.
  9 | 
 10 |       "License" shall mean the terms and conditions for use, reproduction,
 11 |       and distribution as defined by Sections 1 through 9 of this document.
 12 | 
 13 |       "Licensor" shall mean the copyright owner or entity authorized by
 14 |       the copyright owner that is granting the License.
 15 | 
 16 |       "Legal Entity" shall mean the union of the acting entity and all
 17 |       other entities that control, are controlled by, or are under common
 18 |       control with that entity. For the purposes of this definition,
 19 |       "control" means (i) the power, direct or indirect, to cause the
 20 |       direction or management of such entity, whether by contract or
 21 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 22 |       outstanding shares, or (iii) beneficial ownership of such entity.
 23 | 
 24 |       "You" (or "Your") shall mean an individual or Legal Entity
 25 |       exercising permissions granted by this License.
 26 | 
 27 |       "Source" form shall mean the preferred form for making modifications,
 28 |       including but not limited to software source code, documentation
 29 |       source, and configuration files.
 30 | 
 31 |       "Object" form shall mean any form resulting from mechanical
 32 |       transformation or translation of a Source form, including but
 33 |       not limited to compiled object code, generated documentation,
 34 |       and conversions to other media types.
 35 | 
 36 |       "Work" shall mean the work of authorship, whether in Source or
 37 |       Object form, made available under the License, as indicated by a
 38 |       copyright notice that is included in or attached to the work
 39 |       (an example is provided in the Appendix below).
 40 | 
 41 |       "Derivative Works" shall mean any work, whether in Source or Object
 42 |       form, that is based on (or derived from) the Work and for which the
 43 |       editorial revisions, annotations, elaborations, or other modifications
 44 |       represent, as a whole, an original work of authorship. For the purposes
 45 |       of this License, Derivative Works shall not include works that remain
 46 |       separable from, or merely link (or bind by name) to the interfaces of,
 47 |       the Work and Derivative Works thereof.
 48 | 
 49 |       "Contribution" shall mean any work of authorship, including
 50 |       the original version of the Work and any modifications or additions
 51 |       to that Work or Derivative Works thereof, that is intentionally
 52 |       submitted to Licensor for inclusion in the Work by the copyright owner
 53 |       or by an individual or Legal Entity authorized to submit on behalf of
 54 |       the copyright owner. For the purposes of this definition, "submitted"
 55 |       means any form of electronic, verbal, or written communication sent
 56 |       to the Licensor or its representatives, including but not limited to
 57 |       communication on electronic mailing lists, source code control systems,
 58 |       and issue tracking systems that are managed by, or on behalf of, the
 59 |       Licensor for the purpose of discussing and improving the Work, but
 60 |       excluding communication that is conspicuously marked or otherwise
 61 |       designated in writing by the copyright owner as "Not a Contribution."
 62 | 
 63 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 64 |       on behalf of whom a Contribution has been received by Licensor and
 65 |       subsequently incorporated within the Work.
 66 | 
 67 |    2. Grant of Copyright License. Subject to the terms and conditions of
 68 |       this License, each Contributor hereby grants to You a perpetual,
 69 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 70 |       copyright license to reproduce, prepare Derivative Works of,
 71 |       publicly display, publicly perform, sublicense, and distribute the
 72 |       Work and such Derivative Works in Source or Object form.
 73 | 
 74 |    3. Grant of Patent License. Subject to the terms and conditions of
 75 |       this License, each Contributor hereby grants to You a perpetual,
 76 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 77 |       (except as stated in this section) patent license to make, have made,
 78 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 79 |       where such license applies only to those patent claims licensable
 80 |       by such Contributor that are necessarily infringed by their
 81 |       Contribution(s) alone or by combination of their Contribution(s)
 82 |       with the Work to which such Contribution(s) was submitted. If You
 83 |       institute patent litigation against any entity (including a
 84 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 85 |       or a Contribution incorporated within the Work constitutes direct
 86 |       or contributory patent infringement, then any patent licenses
 87 |       granted to You under this License for that Work shall terminate
 88 |       as of the date such litigation is filed.
 89 | 
 90 |    4. Redistribution. You may reproduce and distribute copies of the
 91 |       Work or Derivative Works thereof in any medium, with or without
 92 |       modifications, and in Source or Object form, provided that You
 93 |       meet the following conditions:
 94 | 
 95 |       (a) You must give any other recipients of the Work or
 96 |           Derivative Works a copy of this License; and
 97 | 
 98 |       (b) You must cause any modified files to carry prominent notices
 99 |           stating that You changed the files; and
100 | 
101 |       (c) You must retain, in the Source form of any Derivative Works
102 |           that You distribute, all copyright, patent, trademark, and
103 |           attribution notices from the Source form of the Work,
104 |           excluding those notices that do not pertain to any part of
105 |           the Derivative Works; and
106 | 
107 |       (d) If the Work includes a "NOTICE" text file as part of its
108 |           distribution, then any Derivative Works that You distribute must
109 |           include a readable copy of the attribution notices contained
110 |           within such NOTICE file, excluding those notices that do not
111 |           pertain to any part of the Derivative Works, in at least one
112 |           of the following places: within a NOTICE text file distributed
113 |           as part of the Derivative Works; within the Source form or
114 |           documentation, if provided along with the Derivative Works; or,
115 |           within a display generated by the Derivative Works, if and
116 |           wherever such third-party notices normally appear. The contents
117 |           of the NOTICE file are for informational purposes only and
118 |           do not modify the License. You may add Your own attribution
119 |           notices within Derivative Works that You distribute, alongside
120 |           or as an addendum to the NOTICE text from the Work, provided
121 |           that such additional attribution notices cannot be construed
122 |           as modifying the License.
123 | 
124 |       You may add Your own copyright statement to Your modifications and
125 |       may provide additional or different license terms and conditions
126 |       for use, reproduction, or distribution of Your modifications, or
127 |       for any such Derivative Works as a whole, provided Your use,
128 |       reproduction, and distribution of the Work otherwise complies with
129 |       the conditions stated in this License.
130 | 
131 |    5. Submission of Contributions. Unless You explicitly state otherwise,
132 |       any Contribution intentionally submitted for inclusion in the Work
133 |       by You to the Licensor shall be under the terms and conditions of
134 |       this License, without any additional terms or conditions.
135 |       Notwithstanding the above, nothing herein shall supersede or modify
136 |       the terms of any separate license agreement you may have executed
137 |       with Licensor regarding such Contributions.
138 | 
139 |    6. Trademarks. This License does not grant permission to use the trade
140 |       names, trademarks, service marks, or product names of the Licensor,
141 |       except as required for reasonable and customary use in describing the
142 |       origin of the Work and reproducing the content of the NOTICE file.
143 | 
144 |    7. Disclaimer of Warranty. Unless required by applicable law or
145 |       agreed to in writing, Licensor provides the Work (and each
146 |       Contributor provides its Contributions) on an "AS IS" BASIS,
147 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
148 |       implied, including, without limitation, any warranties or conditions
149 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
150 |       PARTICULAR PURPOSE. You are solely responsible for determining the
151 |       appropriateness of using or redistributing the Work and assume any
152 |       risks associated with Your exercise of permissions under this License.
153 | 
154 |    8. Limitation of Liability. In no event and under no legal theory,
155 |       whether in tort (including negligence), contract, or otherwise,
156 |       unless required by applicable law (such as deliberate and grossly
157 |       negligent acts) or agreed to in writing, shall any Contributor be
158 |       liable to You for damages, including any direct, indirect, special,
159 |       incidental, or consequential damages of any character arising as a
160 |       result of this License or out of the use or inability to use the
161 |       Work (including but not limited to damages for loss of goodwill,
162 |       work stoppage, computer failure or malfunction, or any and all
163 |       other commercial damages or losses), even if such Contributor
164 |       has been advised of the possibility of such damages.
165 | 
166 |    9. Accepting Warranty or Additional Liability. While redistributing
167 |       the Work or Derivative Works thereof, You may choose to offer,
168 |       and charge a fee for, acceptance of support, warranty, indemnity,
169 |       or other liability obligations and/or rights consistent with this
170 |       License. However, in accepting such obligations, You may act only
171 |       on Your own behalf and on Your sole responsibility, not on behalf
172 |       of any other Contributor, and only if You agree to indemnify,
173 |       defend, and hold each Contributor harmless for any liability
174 |       incurred by, or claims asserted against, such Contributor by reason
175 |       of your accepting any such warranty or additional liability.
176 | 
177 |    END OF TERMS AND CONDITIONS
178 | 
179 |    APPENDIX: How to apply the Apache License to your work.
180 | 
181 |       To apply the Apache License to your work, attach the following
182 |       boilerplate notice, with the fields enclosed by brackets "[]"
183 |       replaced with your own identifying information. (Don't include
184 |       the brackets!)  The text should be enclosed in the appropriate
185 |       comment syntax for the file format. We also recommend that a
186 |       file or class name and description of purpose be included on the
187 |       same "printed page" as the copyright notice for easier
188 |       identification within third-party archives.
189 | 
190 |    Copyright [yyyy] [name of copyright owner]
191 | 
192 |    Licensed under the Apache License, Version 2.0 (the "License");
193 |    you may not use this file except in compliance with the License.
194 |    You may obtain a copy of the License at
195 | 
196 |        http://www.apache.org/licenses/LICENSE-2.0
197 | 
198 |    Unless required by applicable law or agreed to in writing, software
199 |    distributed under the License is distributed on an "AS IS" BASIS,
200 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
201 |    See the License for the specific language governing permissions and
202 |    limitations under the License.
203 | 


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include LICENSE
2 | include README.md
3 | include AUTHORS


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | Boilerpipy
 2 | ======
 3 | 
 4 | Readability/Boilerpipe extractor in Python; the closest port of Arc90's latest readability.js (1.7.1)
 5 | 
 6 |     Copyright (C) 2012 Harshavardhana - Current Maintainer/Author
 7 | 
 8 | Licensed under the Apache License, Version 2.0 (the "License");
 9 | you may not use this file except in compliance with the License.
10 | You may obtain a copy of the License at
11 | 
12 |     http://www.apache.org/licenses/LICENSE-2.0
13 | 
14 | Unless required by applicable law or agreed to in writing, software
15 | distributed under the License is distributed on an "AS IS" BASIS,
16 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 | See the License for the specific language governing permissions and
18 | limitations under the License.
19 | 
20 | [![Downloads](https://img.shields.io/pypi/dm/boilerpipy.svg)](https://pypi.python.org/pypi/boilerpipy/)
21 | [![Version](https://img.shields.io/pypi/v/boilerpipy.svg)](https://pypi.python.org/pypi/boilerpipy/)
22 | [![License](https://img.shields.io/pypi/l/boilerpipy.svg)](https://pypi.python.org/pypi/boilerpipy/)
23 | 
24 | 


--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
 1 | Boilerpipy
 2 | ==========
 3 | 
 4 | Readability/Boilerpipe extractor in Python; the closest port of Arc90's latest readability.js (1.7.1)
 5 | 
 6 |     Copyright (C) 2012 Harshavardhana - Current Maintainer/Author
 7 | 
 8 | Licensed under the Apache License, Version 2.0 (the "License");
 9 | you may not use this file except in compliance with the License.
10 | You may obtain a copy of the License at
11 | 
12 |     http://www.apache.org/licenses/LICENSE-2.0
13 | 
14 | Unless required by applicable law or agreed to in writing, software
15 | distributed under the License is distributed on an "AS IS" BASIS,
16 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 | See the License for the specific language governing permissions and
18 | limitations under the License.
19 | 


--------------------------------------------------------------------------------
/bin/query:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | import sys
 3 | import logging
 4 | import chardet
 5 | 
 6 | 
 7 | from optparse import OptionParser
 8 | 
 9 | from boilerpipy import (Extractor, isvalidhtml,
10 |                         compat_urllib_request)
11 | 
def main():
    """Fetch a URL and print the result of querying the page for a tag.

    Command-line options:
      -u/--url    URL to fetch (required)
      -q/--query  value passed to ``Extractor`` as ``tag`` (required)
      -d          enable debug logging

    Exits with status 1, after printing the usage text, when either required
    option is missing, and with status 255 when ``isvalidhtml`` rejects the
    URL.  Failures while reading or querying the page are reported on stdout,
    and the opened URL handle is always closed.
    """
    parser = OptionParser(usage="%prog: [options] [file]")
    parser.add_option('-u', '--url', help="use URL instead of a local file")
    parser.add_option('-q', '--query', help="query should be a string")
    parser.add_option('-d', help="enable debug", action="store_true",
                      default=False, dest="debug")
    (options, args) = parser.parse_args()

    if not (options.url and options.query):
        parser.print_help()
        sys.exit(1)

    loglevel = logging.DEBUG if options.debug else logging.INFO

    # print(...) with a single argument behaves the same on Python 2 and 3,
    # which keeps this script usable on both, like bin/readability.
    if not isvalidhtml(options.url):
        print("Unrecognized URL, please provide a content-type of text/html")
        sys.exit(255)

    url = compat_urllib_request.urlopen(options.url)
    try:
        content = url.read()
        try:
            enc = chardet.detect(content)['encoding']
            content = content.decode(enc)
        except (LookupError, TypeError, ValueError):
            # Best effort only: when the encoding is unknown (None), not
            # installed, or the bytes do not decode, hand the raw content on.
            pass
        out = Extractor(content, tag=options.query, loglevel=loglevel).query()
        if out is None:
            # A bare ``raise`` has no active exception to re-raise here, so
            # signal the failure explicitly.
            raise ValueError("query returned no result")

        print(out)

    except Exception as err:
        print("Error in printing the extracted html () %s" % err)

    finally:
        url.close()
52 | 
53 | if __name__ == '__main__':
54 |     main()
55 | 


--------------------------------------------------------------------------------
/bin/readability:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | import sys
 3 | import logging
 4 | import chardet
 5 | 
 6 | from optparse import OptionParser
 7 | 
 8 | from boilerpipy import (Extractor, isvalidhtml,
 9 |                         compat_urllib_request)
10 | 
def main():
    """Fetch a URL and print the extracted article markup as UTF-8.

    Command-line options:
      -u/--url  URL to fetch (required)
      -d        enable debug logging

    Exits with status 1, after printing the usage text, when no URL is given,
    and with status 255 when ``isvalidhtml`` rejects the URL.  Failures while
    reading or extracting the page are reported on stdout, and the opened URL
    handle is always closed.
    """
    parser = OptionParser(usage="%prog: [options] [file]")
    parser.add_option('-u', '--url', help="use URL instead of a local file")
    parser.add_option('-d', help="enable debug", action="store_true",
                      default=False, dest="debug")
    (options, args) = parser.parse_args()

    if not options.url:
        parser.print_help()
        sys.exit(1)

    loglevel = logging.DEBUG if options.debug else logging.INFO

    if not isvalidhtml(options.url):
        print("Unrecognized URL, please provide a content-type of text/html")
        sys.exit(255)

    url = compat_urllib_request.urlopen(options.url)
    try:
        content = url.read()
        try:
            enc = chardet.detect(content)['encoding']
            content = content.decode(enc)
        except (LookupError, TypeError, ValueError):
            # Best effort only: when the encoding is unknown (None), not
            # installed, or the bytes do not decode, hand the raw content on.
            pass
        out = Extractor(content, loglevel=loglevel).extracted()
        if out is None:
            # A bare ``raise`` has no active exception to re-raise here, so
            # signal the failure explicitly.
            raise ValueError("extraction returned no result")

        # Printing a bytes object on Python 3 would show its repr (b'...'),
        # so write the encoded text to the binary stdout buffer when there is
        # one; Python 2's sys.stdout accepts the encoded str directly.
        encoded = out.encode('utf-8', 'ignore')
        stream = getattr(sys.stdout, 'buffer', sys.stdout)
        stream.write(encoded + b'\n')

    except Exception as err:
        print("Error in printing the extracted html () %s" % err)

    finally:
        url.close()
50 | 
51 | if __name__ == '__main__':
52 |     main()
53 | 


--------------------------------------------------------------------------------
/boilerpipy/__init__.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | import re
  3 | import sys
  4 | 
  5 | from collections import defaultdict
  6 | 
  7 | from lxml.etree import (tostring, tounicode, ParserError,
  8 |                         iterwalk, XMLSyntaxError)
  9 | from lxml.html.clean import Cleaner
 10 | import lxml.html as html
 11 | 
 12 | from .expressions import *
 13 | from .common import *
 14 | from .error import *
 15 | from .compat import *
 16 | 
 17 | import logging
# Log line layout: timestamp (padded to 15 columns) followed by the message.
FORMAT = '%(asctime)-15s %(message)s'
# Configures the root logger at import time with the layout above.
logging.basicConfig(format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S')
# Module-wide logger used by Extractor and setLogLevel().
logger = logging.getLogger('extractor')
# Default level; setLogLevel() (called by Extractor when it is given a
# ``loglevel`` option) changes it.
logger.setLevel(logging.INFO)
__version__ = "0.2.2"
__license__ = "Apache 2.0"
 25 | 
 26 | def setLogLevel(level):
 27 |     if not isinstance(level, int):
 28 |         raise ValueError
 29 |     logger.setLevel(level)
 30 | 
 31 | class Extractor:
 32 |     def __init__(self, input, notify=None, tag=None, **options):
 33 |         self.input = input
 34 |         self.options = defaultdict(lambda: None)
 35 |         for k, v in list(options.items()):
 36 |             self.options[k] = v
 37 |         self.notify = notify or logger.debug
 38 |         self.html = None
 39 |         self.TEXT_LENGTH_THRESHOLD = 25
 40 |         self.RETRY_LENGTH = 250
 41 |         if 'loglevel' in options:
 42 |             setLogLevel(self.options['loglevel'])
 43 |         self.tag = tag
 44 | 
 45 |     def normalize_html(self, force=False):
 46 |         # Use lxml 'Cleaner' class to normalize html to a feasible value
 47 |         if force or self.html is None:
 48 |             cleaner = Cleaner(scripts=True, javascript=True, comments=True,
 49 |                               style=True, links=True, meta=False,
 50 |                               add_nofollow=False, page_structure=False,
 51 |                               processing_instructions=True, embedded=False,
 52 |                               frames=False, forms=False, annoying_tags=False,
 53 |                               remove_tags=None, remove_unknown_tags=False,
 54 |                               safe_attrs_only=False)
 55 |             if isinstance(self.input, COMPAT_STR):
 56 |                 # Work around: ValueError: Unicode strings with encoding
 57 |                 # declaration are not supported by lxml
 58 |                 self.input = self.input.encode('utf-8')
 59 |             self.input = self.input.replace(b'\r', b'')
 60 |             self.html = parse(cleaner.clean_html(self.input),
 61 |                               self.options['url'], notify=self.notify)
 62 |         return self.html
 63 | 
 64 |     def content(self):
 65 |         return get_body(self.normalize_html())
 66 | 
 67 |     def title(self):
 68 |         return get_title(self.normalize_html())
 69 | 
 70 |     def query(self):
 71 |         if self.tag is None:
 72 |             raise ValueError('Please provide tag value before calling this function')
 73 | 
 74 |         return get_queried_tags(self.normalize_html(), self.tag)
 75 | 
 76 |     def extracted(self):
 77 |         try:
 78 |             single_pass = True
 79 |             while True:
 80 |                 self.normalize_html(True)
 81 |                 [i.drop_tree() for i in self.tags(self.html, 'script',
 82 |                                                   'style', 'noscript')]
 83 | 
 84 |                 if single_pass:
 85 |                     self.remove_unlikely_nodes()
 86 | 
 87 |                 self.transform_misused_divs_into_paragraphs()
 88 |                 nodes = self.score_paragraphs(self.options.get('min_text_length',
 89 |                                                                self.TEXT_LENGTH_THRESHOLD))
 90 | 
 91 |                 best_node = self.select_best_node(nodes)
 92 |                 if best_node:
 93 |                     article = self.get_article(nodes, best_node)
 94 |                 else:
 95 |                     if single_pass:
 96 |                         single_pass = False
 97 |                         logger.info("Ended up stripping too much - going for a safer parsing scheme")
 98 |                         # try again
 99 |                         continue
100 |                     else:
101 |                         logger.info("Ruthless and simple parsing did not work. Returning unprocessed raw html")
102 |                         if self.html.find('body') is not None:
103 |                             article = self.html.find('body')
104 |                         else:
105 |                             article = self.html
106 | 
107 |                 content_scores = []
108 | 
109 |                 for x in nodes:
110 |                     if nodes[x]['content_score'] < 0:
111 |                         continue
112 |                     content_scores.append(nodes[x]['content_score'])
113 | 
114 |                 cleaned_article = self.sanitize(article, nodes)
115 |                 at_acceptable_length = len(cleaned_article) >= self.RETRY_LENGTH
116 | 
117 |                 if single_pass and not at_acceptable_length:
118 |                     single_pass = False
119 |                     continue # try again
120 |                 else:
121 |                     return cleaned_article
122 |         except ParserError as e:
123 |             logger.info('error getting summary: %s' % e)
124 |             return None
125 | 
126 |         except XMLSyntaxError as e:
127 |             logger.info('error getting summary: %s' % e)
128 |             return None
129 | 
130 |         except:
131 |             logger.info('unexpected error: %s' % sys.exc_info()[0])
132 |             return None
133 | 
134 |     def get_article(self, nodes, best_node):
135 |         # Now that we have the top node, look through its siblings for content that might also be related.
136 |         # Things like preambles, content split by ads that we removed, etc.
137 | 
138 |         sibling_score_threshold = max([10, best_node['content_score'] * 0.2])
139 |         output = parse("<div/>")
140 |         for sibling in best_node['elem'].getparent().getchildren():
141 |             append = False
142 |             if sibling is best_node['elem']:
143 |                 append = True
144 |             sibling_key = sibling
145 |             if sibling_key in nodes and nodes[sibling_key]['content_score'] >= sibling_score_threshold:
146 |                 append = True
147 | 
148 |             if sibling.tag == "p":
149 |                 link_density = self.get_link_density(sibling)
150 |                 node_content = sibling.text or ""
151 |                 node_length = len(node_content)
152 | 
153 |                 if node_length > 80 and link_density < 0.25:
154 |                     append = True
155 |                 elif node_length < 80 and link_density == 0 and re.search(r'\.( |$)', node_content):
156 |                     append = True
157 | 
158 |             if append:
159 |                 output.append(sibling)
160 |         if output is not None: output.append(best_node['elem'])
161 |         return output
162 | 
163 |     def select_best_node(self, nodes):
164 |         sorted_nodes = sorted(list(nodes.values()),
165 |                               key=lambda x: x['content_score'], reverse=True)
166 |         logger.debug("Top 5 nodes:")
167 |         for node in sorted_nodes[:5]:
168 |             elem = node['elem']
169 |             logger.debug("Node %s with score %s '%s...'" % (describe(elem),
170 |                                                             node['content_score'],
171 |                                                             snippet(elem)))
172 | 
173 |         if len(sorted_nodes) == 0:
174 |             return None
175 |         best_node = sorted_nodes[0]
176 |         logger.debug("Best node %s with score %s" % (describe(best_node['elem']),
177 |                                                      best_node['content_score']))
178 |         return best_node
179 | 
180 |     def get_link_density(self, elem):
181 |         link_length = len("".join([i.text_content() or "" for i in elem.findall(".//a")]))
182 |         text_length = len(elem.text_content())
183 |         return float(link_length) / max(text_length, 1)
184 | 
185 |     def score_paragraphs(self, min_text_length):
186 |         nodes = {}
187 |         logger.debug(str([describe(node) for node in self.tags(self.html,
188 |                                                                "div")]))
189 |         elems = self.tags(self.html, "div", "p", "td", 'li', "a")
190 | 
191 |         for elem in elems:
192 |             parent_node = elem.getparent()
193 |             grand_parent_node = parent_node.getparent()
194 |             elem_key = elem
195 |             parent_key = parent_node
196 |             grand_parent_key = grand_parent_node
197 | 
198 |             inner_text = elem.text_content()
199 | 
200 |             # If this paragraph is less than 25 characters, don't even count it.
201 |             if (not inner_text) or len(inner_text) < min_text_length:
202 |                 continue
203 | 
204 |             if parent_key not in nodes:
205 |                 nodes[parent_key] = self.score_node(parent_node)
206 |             if grand_parent_node is not None and grand_parent_key not in nodes:
207 |                 nodes[grand_parent_key] = self.score_node(grand_parent_node)
208 | 
209 |             content_score = 1
210 |             content_score += len(inner_text.split(','))
211 |             content_score += min([(len(inner_text) / 100), 3])
212 |             if elem not in nodes:
213 |                 nodes[elem_key] = self.score_node(elem)
214 |             nodes[elem_key]['content_score'] += content_score
215 |             nodes[parent_key]['content_score'] += content_score
216 |             if grand_parent_node is not None:
217 |                 nodes[grand_parent_key]['content_score'] += content_score / 2.0
218 | 
219 |         # Scale the final nodes score based on link density. Good content should have a
220 |         # relatively small link density (5% or less) and be mostly unaffected by this operation.
221 |         for elem, node in list(nodes.items()):
222 |             link_density = self.get_link_density(elem)
223 |             node['content_score'] *= (1 - link_density)
224 |             if node['content_score'] > 0:
225 |                 logger.debug("node %s scored %s"
226 |                              "(linkd: %s) '%s'" % (describe(elem),
227 |                                                    node['content_score'],
228 |                                                    link_density,
229 |                                                    snippet(elem, 30)))
230 | 
231 |         return nodes
232 | 
233 |     def class_weight(self, e):
234 |         weight = 0
235 |         if e.get('class', None):
236 |             if REGEXPS.get('negative').search(e.get('class')):
237 |                 weight -= 25
238 | 
239 |             if REGEXPS.get('positive').search(e.get('class')):
240 |                 weight += 25
241 | 
242 |         if e.get('id', None):
243 |             if REGEXPS.get('negative').search(e.get('id')):
244 |                 weight -= 25
245 | 
246 |             if REGEXPS.get('positive').search(e.get('id')):
247 |                 weight += 25
248 | 
249 |         return weight
250 | 
251 |     def score_node(self, elem):
252 |         content_score = self.class_weight(elem)
253 |         tag = elem.tag.lower()
254 |         if tag == "div":
255 |             content_score += 5
256 |         elif tag == "blockquote":
257 |             content_score += 3
258 |         elif tag == "form":
259 |             content_score -= 3
260 |         elif tag == "th":
261 |             content_score -= 5
262 |         return {'content_score': content_score, 'elem': elem}
263 | 
264 |     def remove_unlikely_nodes(self):
265 |         remove_list = []
266 |         context = iterwalk(self.html)
267 |         for action, elem in context:
268 |             s = "%s%s" % (elem.get('class', ''), elem.get('id', ''))
269 |             if REGEXPS['unlikelyNodes'].search(s) and (not REGEXPS['okMaybeItsANode'].search(s)) and elem.tag != 'body':
270 |                 logger.debug("Removing unlikely node - %s" % s)
271 |                 remove_list.append(elem)
272 |         [e.drop_tree() for e in remove_list if e.tag != 'html']
273 | 
274 |     def transform_misused_divs_into_paragraphs(self):
275 |         for elem in self.html.iter():
276 |             if elem.tag.lower() == "div":
277 |                 # transform <div>s that do not contain other block elements into <p>s
278 |                 if not REGEXPS['divToPElements'].search(COMPAT_STR(''.join(map(tounicode, list(elem))))):
279 |                     logger.debug("Altering div(#%s.%s) to p" % (elem.get('id', ''),
280 |                                                                 elem.get('class', '')))
281 |                     elem.tag = "p"
282 | 
283 |     def tags(self, node, *tag_names):
284 |         for tag_name in tag_names:
285 |             for e in node.findall('.//%s' %tag_name):
286 |                 yield e
287 | 
288 |     def sanitize(self, node, nodes):
289 |         for header in self.tags(node, "h1", "h2", "h3", "h4", "h5", "h6"):
290 |             if self.class_weight(header) < 0 or self.get_link_density(header) > 0.33: header.drop_tree()
291 | 
292 |         for elem in self.tags(node, "form"):
293 |             elem.drop_tree()
294 |         allowed = {}
295 |         # Conditionally clean <table>s, <ul>s, and <div>s
296 |         for el in self.tags(node, "table", "ul", "div"):
297 |             if el in allowed:
298 |                 continue
299 |             weight = self.class_weight(el)
300 |             el_key = el
301 |             if el_key in nodes:
302 |                 content_score = nodes[el_key]['content_score']
303 |             else:
304 |                 content_score = 0
305 | 
306 |             tag = el.tag
307 |             if weight + content_score < 0:
308 |                 el.drop_tree()
309 |                 logger.debug("Conditionally cleaned %s with weight %s and content score %s because score + content score was less than zero." %
310 |                     (describe(el), weight, content_score))
311 |             elif len(el.text_content().split(",")) < 10:
312 |                 counts = {}
313 |                 for kind in ['p', 'img', 'li', 'a', 'embed', 'input', 'iframe']:
314 |                     counts[kind] = len(el.findall('.//%s' % kind))
315 |                 counts["li"] -= 100
316 | 
317 |                 content_length = len(el.text_content()) # Count the text length excluding any surrounding whitespace
318 |                 link_density = self.get_link_density(el)
319 |                 parent_node = el.getparent()
320 |                 if parent_node is not None:
321 |                     if parent_node in nodes:
322 |                         content_score = nodes[parent_node]['content_score']
323 |                     else:
324 |                         content_score = 0
325 |                     pweight = self.class_weight(parent_node) + content_score
326 |                     pname = parent_node.tag
327 |                 else:
328 |                     pweight = 0
329 |                     pname = "no parent"
330 |                 to_remove = False
331 |                 reason = ""
332 | 
333 |                 if counts["p"] and counts["img"] > counts["p"]:
334 |                     reason = "too many images"
335 |                     to_remove = True
336 |                 elif counts["li"] > counts["p"] and tag != "ul" and tag != "ol":
337 |                     reason = "more <li>s than <p>s"
338 |                     to_remove = True
339 |                 elif counts["input"] > (counts["p"] / 3):
340 |                     reason = "less than 3x <p>s than <input>s"
341 |                     to_remove = True
342 |                 elif content_length < (self.options.get('min_text_length',
343 |                                                         self.TEXT_LENGTH_THRESHOLD)) and (counts["img"] == 0):
344 |                     reason = "too short a content length without a single image"
345 |                     to_remove = True
346 |                 elif weight < 25 and link_density > 0.2:
347 |                     reason = "too many links for its weight less than 25 (#{weight})"
348 |                     to_remove = True
349 |                 elif weight >= 25 and link_density > 0.5:
350 |                     reason = "too many links for its weight (#{weight})"
351 |                     to_remove = True
352 |                 elif el.tag.lower() == "embed":
353 |                     if not REGEXPS.get('videos').search(el.get('src')):
354 |                         to_remove = True
355 |                 elif el.tag.lower() == "iframe":
356 |                     if not REGEXPS.get('videos').search(el.get('src')):
357 |                         to_remove = True
358 |                 elif (counts["embed"] == 1 and content_length < 75) or counts["embed"] > 2:
359 |                     reason = "<embed>s with too short a content length, or too many <embed>s"
360 |                     to_remove = True
361 |                 elif (counts["iframe"] == 1 and content_length < 75) or counts["iframe"] > 2:
362 |                     reason = "<iframe>s with too short a content length, or too many <iframe>s"
363 |                     to_remove = True
364 |                 if to_remove:
365 |                     logger.debug("Conditionally cleaned %s#%s.%s with weight %s and content score %s because it has %s." %
366 |                                (el.tag, el.get('id', ''), el.get('class', ''),
367 |                                 weight, content_score, reason))
368 |                     logger.debug("pname %s pweight %s" % (pname, pweight))
369 |                     el.drop_tree()
370 |         return tounicode(node)
371 | 


--------------------------------------------------------------------------------
/boilerpipy/common.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | import sys
  3 | 
  4 | from lxml.etree import tostring
  5 | import lxml.html as html
  6 | 
  7 | from .expressions import HTMLSTRIP, CRUFTY_REGEXPS_HTML
  8 | from .compat import (compat_urllib_parse_urlparse,
  9 |                      compat_html_parser, COMPAT_STR,
 10 |                      compat_http_client)
 11 | from .error import Unparseable
 12 | 
 13 | try:
 14 |     from bs4 import UnicodeDammit
 15 | except:
 16 |     print("Please install beautifulsoup4 --> easy_install -U beautifulsoup4")
 17 |     sys.exit(1)
 18 | 
 19 | def create_doc(content, base_href):
 20 |     # Work around: ValueError: Unicode strings with encoding
 21 |     # declaration are not supported by lxml
 22 |     if isinstance(content, COMPAT_STR):
 23 |         content = content.encode('utf-8')
 24 | 
 25 |     html_parser = html.HTMLParser(recover=True, remove_comments=True,
 26 |                                   no_network=True, encoding="utf-8")
 27 |     html_doc = html.fromstring(content, parser=html_parser)
 28 | 
 29 |     if base_href:
 30 |         html_doc.make_links_absolute(base_href, resolve_base_href=True)
 31 |     else:
 32 |         html_doc.resolve_base_href()
 33 |     return html_doc
 34 | 
 35 | # Verify if the provided HTML has 'Content-Type' as HTML
 36 | def isvalidhtml(url):
 37 |     """
 38 |     Verify valid HTML content
 39 |     """
 40 | 
 41 |     if url is None:
 42 |         return False
 43 | 
 44 |     try:
 45 |         parsed = compat_urllib_parse_urlparse(url)
 46 |         h = compat_http_client.HTTPConnection(parsed.netloc)
 47 |         h.request('HEAD', parsed.path)
 48 |         response = h.getresponse()
 49 | 
 50 |         # Handle response status 301
 51 |         if response.status/100 == 3 and response.getheader('Location'):
 52 |             parsed = compat_urllib_parse_urlparse(response.getheader('Location'))
 53 |             h = compat_http_client.HTTPConnection(parsed.netloc)
 54 |             h.request('HEAD', parsed.path)
 55 |             response = h.getresponse()
 56 |             if response.status/100 == 3:
 57 |                 # Multiple re-directs throw away the HTML
 58 |                 return False
 59 | 
 60 |         # Make sure response is not None
 61 |         if response.getheader('content-type') is None:
 62 |             return False
 63 | 
 64 |         # Only html if valid Header
 65 |         if response.getheader('content-type').find('text/html') != -1:
 66 |             return True
 67 | 
 68 |         return False
 69 |     except Exception as err:
 70 |         print(("Header returned error: %s, skip not a valid HTML" % err))
 71 |         return False
 72 | 
 73 | # helpers for parsing
 74 | def normalize_spaces(s):
 75 |     """replace any sequence of whitespace
 76 |     characters with a single space"""
 77 |     return ' '.join(s.split())
 78 | 
 79 | def _clean_crufty_html(content):
 80 |     for regexps in CRUFTY_REGEXPS_HTML:
 81 |         content = regexps.sub(content)
 82 |     return content
 83 | 
 84 | def clean_attributes(raw_html):
 85 |     while HTMLSTRIP.search(raw_html):
 86 |         raw_html = HTMLSTRIP.sub('<\\1\\2>', raw_html)
 87 |     return raw_html
 88 | 
 89 | def describe(node):
 90 |     if not hasattr(node, 'tag'):
 91 |         return "[text]"
 92 |     return "%s#%s.%s" % (
 93 |         node.tag, node.get('id', ''), node.get('class', ''))
 94 | 
 95 | def snippet(node, n=40):
 96 |     """ return one-liner snippet of the text under the node """
 97 |     txt = node.text_content()
 98 |     txt = COMPAT_STR(' '.join(txt.split()))
 99 |     if len(txt) > n:
100 |         txt = txt[:n] + COMPAT_STR("...")
101 |     return txt
102 | 
def parse(raw_content, base_href=None, notify=lambda *args: None):
    """Decode, pre-clean and parse ``raw_content`` into an HTML tree.

    :param raw_content: markup in any encoding ``UnicodeDammit`` can handle.
    :param base_href: forwarded to ``create_doc`` for link resolution.
    :param notify: callable invoked as ``notify("parsing failed:", error)``
        before the failure is re-raised as ``Unparseable``.
    :returns: the document produced by ``create_doc``.
    :raises Unparseable: when the HTML parser signals ``HTMLParseError``.
        Other exceptions propagate unchanged.
    """
    # HTMLParseError was removed from html.parser in Python 3.5; naming it
    # directly in the ``except`` clause would raise AttributeError there.
    # An empty tuple catches nothing, which matches the absence of the
    # exception.
    html_parse_error = getattr(compat_html_parser, 'HTMLParseError', ())

    try:
        content = UnicodeDammit(raw_content, is_html=True).markup
        cleaned = _clean_crufty_html(content)
        return create_doc(cleaned, base_href)
    except html_parse_error as err:
        notify("parsing failed:", err)
        # Unparseable requires the originating error as its argument.
        raise Unparseable(err)
111 | 
def get_title(doc):
    """Return the whitespace-normalised ``<title>`` text of ``doc``.

    :param doc: parsed document exposing ``find``.
    :returns: the title, or ``None`` when there is no ``<title>`` element
        or it has no text.
    """
    # ``text`` is None for an empty <title></title>; converting that with
    # COMPAT_STR would produce the literal title "None".
    text = getattr(doc.find('.//title'), 'text', None)
    if not text:
        return None
    return normalize_spaces(COMPAT_STR(text))
117 | 
def get_body(doc):
    """Serialise the body of ``doc`` with scripts, links and styles removed.

    ``<script>``, ``<link>`` and ``<style>`` elements are dropped from
    ``doc`` in place.  The ``<body>`` element (or ``doc`` itself when
    ``doc.body`` is ``None``) is then serialised and passed through
    ``clean_attributes``.

    :returns: the cleaned markup as text; the uncleaned markup if the
        cleaning step raises ``HTMLParseError``.
    """
    for elem in doc.xpath('.//script | .//link | .//style'):
        elem.drop_tree()

    body = doc.body
    root = body if body is not None else doc
    # tostring() emits ASCII bytes by default (non-ASCII characters become
    # numeric character references).  Decoding keeps the text identical on
    # Python 2 and avoids the "b'...'" repr that str(bytes) yields on
    # Python 3.
    raw_html = tostring(root).decode('ascii')

    # HTMLParseError no longer exists on Python 3.5+; an empty tuple makes
    # the handler inert there instead of raising AttributeError.
    html_parse_error = getattr(compat_html_parser, 'HTMLParseError', ())
    try:
        return clean_attributes(raw_html)
    except html_parse_error:
        # No cleaned text exists at this point, so only the input is shown.
        print("cleansing broke html content: %s" % raw_html)
        return raw_html
133 | 
def get_queried_tags(doc, tag):
    """Return the serialised markup of every ``tag`` element in ``doc``.

    ``<script>``, ``<link>`` and ``<style>`` elements are dropped from
    ``doc`` in place before the search.

    :param doc: parsed document.
    :param tag: tag name to look up with ``doc.findall('.//<tag>')``.
    :returns: list of text strings, one per matching element, with
        surrounding whitespace stripped.
    """
    for elem in doc.xpath('.//script | .//link | .//style'):
        elem.drop_tree()

    # tostring() emits ASCII bytes by default; decoding them avoids the
    # "b'...'" repr that str(bytes) produces on Python 3.
    return [tostring(match).strip().decode('ascii')
            for match in doc.findall('.//%s' % tag)]
142 | 


--------------------------------------------------------------------------------
/boilerpipy/compat.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | import sys
 3 | import locale
 4 | 
 5 | try:
 6 |     import urllib.request as compat_urllib_request
 7 | except ImportError: # Python 2
 8 |     import urllib2 as compat_urllib_request
 9 | 
10 | try:
11 |     import urllib.error as compat_urllib_error
12 | except ImportError: # Python 2
13 |     import urllib2 as compat_urllib_error
14 | 
15 | try:
16 |     import urllib.parse as compat_urllib_parse
17 | except ImportError: # Python 2
18 |     import urllib as compat_urllib_parse
19 | 
20 | try:
21 |     from urllib.parse import urlparse as compat_urllib_parse_urlparse
22 | except ImportError: # Python 2
23 |     from urlparse2 import urlparse as compat_urllib_parse_urlparse
24 | 
25 | try:
26 |     import html.parser as compat_html_parser
27 | except ImportError: # Python 2
28 |     import HTMLParser as compat_html_parser
29 | 
30 | try:
31 |     import http.client as compat_http_client
32 | except ImportError: # Python 2
33 |     import httplib as compat_http_client
34 | 
35 | try:
36 |     COMPAT_STR = unicode # Python 2
37 | except NameError:
38 |     COMPAT_STR = str
39 | 
40 | try:
41 |     COMPAT_CHR = unichr # Python 2
42 | except NameError:
43 |     COMPAT_CHR = chr
44 | 
def preferredencoding():
    """Get preferred encoding.

    Returns the encoding reported by ``locale.getpreferredencoding()``
    when it can actually encode text, and ``'UTF-8'`` otherwise.
    """
    try:
        pref = locale.getpreferredencoding()
        # Probe the codec: an unknown or unusable name fails here.
        u'TEST'.encode(pref)
    except Exception:
        # The probe fails with LookupError/UnicodeError/TypeError, never
        # SyntaxError.  This helper must always return a usable encoding,
        # so any failure falls back to UTF-8.
        pref = 'UTF-8'
    return pref
57 | 
if sys.version_info < (3, 0):
    def compat_print(s):
        """Print ``s`` encoded with the preferred encoding.

        Characters the encoding cannot represent are written as XML
        character references.
        """
        print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
else:
    def compat_print(s):
        """Print the text string ``s``.

        :raises TypeError: if ``s`` is not a ``str``.
        """
        # ``unicode`` does not exist on Python 3; this branch only runs
        # there, so ``str`` is the text type to check against.
        if not isinstance(s, str):
            raise TypeError("compat_print() expects text, got %s" % type(s).__name__)
        print(s)
65 | 


--------------------------------------------------------------------------------
/boilerpipy/error.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | from lxml.etree import ParseError
 3 | 
 4 | class Unparseable(Exception):
 5 |     """
 6 |     Local exception handler
 7 |     """
 8 |     def __init__(self, error):
 9 |         # pylint fixes
10 |         super(Unparseable, self).__init__(error)
11 | 
12 |         if isinstance(error, type(ValueError)):
13 |             pass
14 |         if isinstance(error, type(ParseError)):
15 |             pass
16 |         if isinstance(error, type(AttributeError)):
17 |             pass
18 |         if isinstance(error, type(UnicodeError)):
19 |             pass
20 |         if isinstance(error, type(SyntaxError)):
21 |             pass
22 |         # Control shouldn't reach here
23 | 


--------------------------------------------------------------------------------
/boilerpipy/expressions.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | ###  LIST of all the regexes supported by readability
 3 | 
 4 | import re
 5 | 
 6 | ## Regex stolen from Arc90's readability.js
 7 | REGEXPS = {
 8 |     'unlikelyNodes': re.compile(r'ad_wrapper|adwrapper|combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter|facebook|pinterest', re.I),
 9 |     'okMaybeItsANode': re.compile(r'and|article|body|column|main|shadow', re.I),
10 |     'positive': re.compile(r'article|body|content|entry|hentry|main|page|pagination|post|text|blog|story', re.I),
11 |     'negative': re.compile(r'ad_wrapper|adwrapper|combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget|share|icon', re.I),
12 |     'extraneous': re.compile(r'print|archive|comment|discuss|e[\-]?email|share|reply|all|login|sign|single', re.I),
13 |     'divToPElements': re.compile(r'<(a|blockquote|dl|div|img|ol|p|pre|table|ul)', re.I),
14 |     'replaceBrs': re.compile(r'(<br[^>]*>[ \n\r\t]*){2,}', re.I),
15 |     'replaceFonts': re.compile(r'<(\/?)font[^>]*>', re.I),
16 |     'trim': re.compile(r'^\s+|\s+$/'),
17 |     'normalize': re.compile(r'\s{2,}/'),
18 |     'killBreaks': re.compile(r'(<br\s*\/?>(\s|&nbsp;?)*){1,}/'),
19 |     'videos': re.compile(r'http:\/\/(www\.)?(youtube|vimeo)\.com', re.I),
20 |     'skipFootnoteLink': re.compile(r'^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$', re.I),
21 | }
22 | 
23 | # Strip out HTML attributes - from Arc90's readability.js
BADATTRS = ['width', 'height', 'style', '[-a-z]*color', 'background[-a-z]*']
SINGLEQUOTED = "'[^']+'"
DOUBLEQUOTED = '"[^"]+"'
NONSPACE = '[^ "\'>]+'

# Alternations for the attribute name to drop and the value that follows it.
_BAD_ATTR_NAME = '(?:%s)' % '|'.join(BADATTRS)
_ATTR_VALUE = '(?:%s)' % '|'.join([NONSPACE, SINGLEQUOTED, DOUBLEQUOTED])

# Matches one tag that carries a bad attribute.  Group 1 is everything
# before the attribute and group 2 everything after it, so replacing with
# '<\1\2>' removes exactly that attribute.
HTMLSTRIP = re.compile(
    ''.join([
        '<',               # open tag
        '([^>]+) ',        # starting prefix
        _BAD_ATTR_NAME,    # bad attribute name
        ' *',
        '= *',
        _ATTR_VALUE,       # needed value
        '([^>]*)',         # starting postfix
        '>',               # end of tag
    ]),
    re.I)
35 | 
class Regexps:
    """
    A named clean-up rule: a compiled pattern plus its replacement.
    """
    def __init__(self, desc, regex, processexps):
        # desc: human-readable label for the rule
        # regex: compiled pattern to look for
        # processexps: replacement handed to ``regex.sub``
        self.desc, self.regex, self.processexps = desc, regex, processexps

    def sub(self, content):
        """
        Return ``content`` with every match of the rule replaced
        """
        replacement = self.processexps
        return self.regex.sub(replacement, content)
50 | 
51 | ### Cruft in html from Arc90's readability.js
CRUFTY_REGEXPS_HTML = (
    # Remove whole <script>...</script> sections.
    Regexps(desc='javascript',
            regex=re.compile(r'<script.*?</script[^>]*>', re.DOTALL | re.IGNORECASE),
            processexps=''),

    # Drop the extra quote(s) after an attribute value: ="v"" -> ="v"
    Regexps(desc='double double-quoted attributes',
            regex=re.compile(r'(="[^"]+")"+'),
            processexps=r'\1'),

    # Close a tag that runs straight into the next one: <a <b> -> <a><b>
    Regexps(desc='unclosed tags',
            regex=re.compile(r'(<[a-zA-Z]+[^>]*)(<[a-zA-Z]+[^<>]*>)'),
            processexps=r'\1>\2'),

    # Add the missing closing quote after a numeric value: w="10> -> w="10">
    Regexps(desc='unclosed (numerical) attribute values',
            regex=re.compile(r'(<[^>]*[a-zA-Z]+\s*=\s*"[0-9]+)( [a-zA-Z]+="\w+"|/?>)'),
            processexps=r'\1"\2'),
)
69 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup, find_packages
 2 | 
 3 | with open('README.rst') as file:
 4 |     long_description = file.read()
 5 | 
 6 | version = '0.2.5'
 7 | 
 8 | setup(
 9 |     name='boilerpipy',
10 |     version=version,
11 |     description='Readability/Boilerpipe extractor in Python',
12 |     author='Harshavardhana',
13 |     author_email='harsha@harshavardhana.net',
14 |     url='https://github.com/harshavardhana/boilerpipy.git',
15 |     license='Apache',
16 |     platforms=['any'],
17 |     classifiers=[
18 |         "Development Status :: 5 - Production/Stable",
19 |         "Intended Audience :: Developers",
20 |         "License :: OSI Approved :: Apache Software License",
21 |         "Programming Language :: Python",
22 |         "Topic :: Software Development :: Libraries :: Python Modules",
23 |     ],
24 |     packages=find_packages(),
25 |     scripts=['bin/readability', 'bin/query'],
26 |     install_requires=['lxml', 'beautifulsoup4', 'urlparse2'],
27 |     long_description=long_description,
28 | )
29 | 


--------------------------------------------------------------------------------