1234567890123456789012345</p>"
            "   </body>"
            "</html>"
        )
        doc = Document(sample)
        doc.summary()

    def test_correct_cleanup(self):
        sample = """
        <html>
            <body>
                <article>
                    <p>Lot of text here.</p>
                    <p>More text is written here, and contains punctuation and dots.</p>
                </article>
                <div id="comment1">
                    <p>The comment is also helpful, but it's
                    still not the correct item to be extracted.</p>
                    <p>It's even longer than the article itself!"</p>
                </div>
            </body>
        </html>
        """
        doc = Document(sample)
        s = doc.summary()
        assert "punctuation" in s
        assert "comment" not in s
    # Many spaces make some regexes run forever
    def test_many_repeated_spaces(self):
        long_space = " " * 1000000
        sample = "<html><body><p>foo" + long_space + "</p></body></html>"

        doc = Document(sample)
        s = doc.summary()

        assert "foo" in s

    def test_not_self_closing(self):
        sample = '<h2><a href="#"></a>foobar</h2>'
        doc = Document(sample)
        assert (
            '<body id="readabilityBody"><h2><a href="#"></a>foobar</h2></body>'
            == doc.summary()
        )

    def test_utf8_kanji(self):
        """Using the UTF-8 kanji sample, load article which is written in kanji"""
        sample = load_sample("utf-8-kanji.sample.html")
        doc = Document(sample)
        res = doc.summary()
        assert 0 < len(res) < 10000

    def test_author_present(self):
        sample = load_sample("the-hurricane-rubin-carter-denzel-washington.html")
        doc = Document(sample)
        assert 'Alex von Tunzelmann' == doc.author()

    def test_author_absent(self):
        sample = load_sample("si-game.sample.html")
        doc = Document(sample)
        assert '[no-author]' == doc.author()

    def test_keep_images_present(self):
        sample = load_sample("summary-keep-all-images.sample.html")

        doc = Document(sample)

        assert "<img" in doc.summary(keep_all_images=True)
    # The tests that follow are only partially recoverable in this copy; what
    # survives is the body text of a multi-language sample:
    #     主要文章标题 (main article title)
    #     这是主要内容的第一段。 (This is the first paragraph of the main content.)
    #     これはコンテンツの第2段落です。 (This is the second paragraph of the content.)
    #     이것은 콘텐츠의 세 번째 단락입니다. (This is the third paragraph of the content.)
    #     This is the fourth paragraph.

# --- readability.py (the module exercised by the tests above); the listing
#     resumes inside its REGEXES dict ---

    # 'replaceBrsRe': re.compile(r'(<br[^>]*>[ \n\r\t]*){2,}',re.I),
    # 'replaceFontsRe': re.compile(r'<(\/?)font[^>]*>',re.I),
    # 'trimRe': re.compile(r'^\s+|\s+$/'),
    # 'normalizeRe': re.compile(r'\s{2,}/'),
    # 'killBreaksRe': re.compile(r'(<br\s*\/?>(\s|&nbsp;?)*){1,}/'),
    "videoRe": re.compile(r"https?:\/\/(www\.)?(youtube|vimeo)\.com", re.I),
    # skipFootnoteLink: /^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i,
}
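
# Note (illustrative, not part of the original module): the class/id patterns in
# REGEXES are searched against an element's "class id" string further down, in
# remove_unlikely_candidates() and class_weight(), while "videoRe" recognises
# YouTube/Vimeo URLs, for example:
#
#     REGEXES["videoRe"].search("https://www.youtube.com/embed/xyz")  # -> match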


class Unparseable(ValueError):
    pass


def to_int(x):
    if not x:
        return None
    x = x.strip()
    if x.endswith("px"):
        return int(x[:-2])
    if x.endswith("em"):
        return int(x[:-2]) * 12
    return int(x)


def clean(text):
    # Many spaces make the following regexes run forever
    text = re.sub(r"\s{255,}", " " * 255, text)
    text = re.sub(r"\s*\n\s*", "\n", text)
    text = re.sub(r"\t|[ \t]{2,}", " ", text)
    return text.strip()


def text_length(i):
    return len(clean(i.text_content() or ""))


def compile_pattern(elements):
    if not elements:
        return None
    elif isinstance(elements, re.Pattern):
        return elements
    elif isinstance(elements, (str, bytes)):
        if isinstance(elements, bytes):
            elements = str(elements, "utf-8")
        elements = elements.split(",")
    if isinstance(elements, (list, tuple)):
        return re.compile("|".join([re.escape(x.strip()) for x in elements]), re.U)
    else:
        raise Exception(f"Unknown type for the pattern: {type(elements)}")
    # assume string or string like object
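
# A small sketch of what compile_pattern() above accepts (illustrative only, the
# example values are taken from the Document docstring below):
#
#     compile_pattern(re.compile("news|block"))  # a ready-made regex, used as-is
#     compile_pattern("news-item, block")        # comma-separated names -> one regex
#     compile_pattern(["mysidebar", "related"])  # list of names -> one regex
#
# The result is what positive_keywords/negative_keywords are compiled into; it is
# searched against class/id values and matched against "tag-<name>" strings in
# class_weight().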


class Document:
    """Class to build an etree document out of HTML."""

    def __init__(
        self,
        input,
        positive_keywords=None,
        negative_keywords=None,
        url=None,
        min_text_length=25,
        retry_length=250,
        xpath=False,
        handle_failures="discard",
    ):
        """Generate the document

        :param input: string of the html content.
        :param positive_keywords: regex, list or comma-separated string of patterns in classes and ids
        :param negative_keywords: regex, list or comma-separated string in classes and ids
        :param min_text_length: Tunable. Set to a higher value for more precise detection of longer texts.
        :param retry_length: Tunable. Set to a lower value for better detection of very small texts.
        :param xpath: If set to True, adds x="..." attribute to each HTML node,
            containing xpath path pointing to original document path (allows to
            reconstruct selected summary in original document).
        :param handle_failures: Parameter passed to `lxml` for handling failures while
            resolving links. Supported options: ["discard", "ignore", None].

        Examples:
            positive_keywords=["news-item", "block"]
            positive_keywords=["news-item, block"]
            positive_keywords=re.compile("news|block")
            negative_keywords=["mysidebar", "related", "ads"]

        The Document class is not re-enterable: create a new Document() for each
        HTML file you want to process.

        API methods:
            .title() -- full title
            .short_title() -- cleaned up title
            .content() -- full content
            .summary() -- cleaned up content
        """
        self.input = input
        self.html = None
        self.encoding = None
        self.positive_keywords = compile_pattern(positive_keywords)
        self.negative_keywords = compile_pattern(negative_keywords)
        self.url = url
        self.min_text_length = min_text_length
        self.retry_length = retry_length
        self.xpath = xpath
        self.handle_failures = handle_failures

    def _html(self, force=False):
        if force or self.html is None:
            self.html = self._parse(self.input)
            if self.xpath:
                root = self.html.getroottree()
                for i in self.html.getiterator():
                    # print root.getpath(i)
                    i.attrib["x"] = root.getpath(i)
        return self.html

    def _parse(self, input):
        if isinstance(input, (_ElementTree, HtmlElement)):
            doc = input
            self.encoding = "utf-8"
        else:
            doc, self.encoding = build_doc(input)
        doc = html_cleaner.clean_html(doc)
        base_href = self.url
        if base_href:
            # trying to guard against bad links
            try:
                # such support is added in lxml 3.3.0
                doc.make_links_absolute(
                    base_href,
                    resolve_base_href=True,
                    handle_failures=self.handle_failures,
                )
            except TypeError:  # make_links_absolute() got an unexpected keyword argument 'handle_failures'
                # then we have lxml < 3.3.0
                # please upgrade to lxml >= 3.3.0 if you're failing here!
                doc.make_links_absolute(base_href, resolve_base_href=True)
        else:
            doc.resolve_base_href(handle_failures=self.handle_failures)
        return doc

    def content(self):
        """Returns document body"""
        return get_body(self._html(True))

    def title(self):
        """Returns document title"""
        return get_title(self._html(True))

    def author(self):
        """Returns document author"""
        return get_author(self._html(True))

    def short_title(self):
        """Returns cleaned up document title"""
        return shorten_title(self._html(True))
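
    # A minimal usage sketch (illustrative only; the keyword values and the
    # html_string variable are assumptions, not part of the original module):
    #
    #     from readability import Document
    #
    #     doc = Document(html_string, positive_keywords=["article", "content"])
    #     print(doc.short_title())
    #     print(doc.summary())
    #
    # summary() mutates the parsed DOM (see its docstring below), so call
    # title()/short_title()/author() before it if you need both.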

    def get_clean_html(self):
        """
        An internal method, which can be overridden in subclasses, for example,
        to disable or to improve DOM-to-text conversion in .summary() method
        """
        return clean_attributes(tounicode(self.html, method="html"))

    def summary(self, html_partial=False, keep_all_images=False):
        """
        Given an HTML document, extract the text of the article.

        :param html_partial: return only the div of the document, don't wrap
            in html and body tags.
        :param keep_all_images: Keep all images in summary.

        Warning: it mutates the internal DOM representation of the HTML
        document, so it is better to call other API methods before this one.
        """
        try:
            ruthless = True
            while True:
                self._html(True)
                for i in self.tags(self.html, "script", "style"):
                    i.drop_tree()
                for i in self.tags(self.html, "body"):
                    i.set("id", "readabilityBody")
                if ruthless:
                    self.remove_unlikely_candidates()
                self.transform_misused_divs_into_paragraphs()
                candidates = self.score_paragraphs()

                best_candidate = self.select_best_candidate(candidates)

                if best_candidate:
                    article = self.get_article(
                        candidates, best_candidate, html_partial=html_partial
                    )
                else:
                    if ruthless:
                        log.info("ruthless removal did not work. ")
                        ruthless = False
                        log.debug(
                            "ended up stripping too much - "
                            "going for a safer _parse"
                        )
                        # try again
                        continue
                    else:
                        log.debug(
                            "Ruthless and lenient parsing did not work. "
                            "Returning raw html"
                        )
                        article = self.html.find("body")
                        if article is None:
                            article = self.html
                cleaned_article = self.sanitize(article, candidates, keep_all_images)

                article_length = len(cleaned_article or "")
                retry_length = self.retry_length
                of_acceptable_length = article_length >= retry_length
                if ruthless and not of_acceptable_length:
                    ruthless = False
                    # Loop through and try again.
                    continue
                else:
                    return cleaned_article
        except Exception as e:
            log.exception("error getting summary: ")
            raise Unparseable(str(e)).with_traceback(sys.exc_info()[2])
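
    # Illustrative note (not in the original source): by default get_article()
    # below wraps the extracted content in a fresh document, so summary()
    # returns markup of the form <html><body><div>...; with html_partial=True
    # only the bare <div> fragment is built and returned.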

    def get_article(self, candidates, best_candidate, html_partial=False):
        # Now that we have the top candidate, look through its siblings for
        # content that might also be related.
        # Things like preambles, content split by ads that we removed, etc.
        sibling_score_threshold = max([10, best_candidate["content_score"] * 0.2])
        # create a new html document with a html->body->div
        if html_partial:
            output = fragment_fromstring("<div/>")
        else:
            output = document_fromstring("<div/>")
        best_elem = best_candidate["elem"]
        parent = best_elem.getparent()
        siblings = parent.getchildren() if parent is not None else [best_elem]
        for sibling in siblings:
            # in lxml there is no concept of simple text
            # if isinstance(sibling, NavigableString): continue
            append = False
            if sibling is best_elem:
                append = True
            sibling_key = sibling  # HashableElement(sibling)
            if (
                sibling_key in candidates
                and candidates[sibling_key]["content_score"] >= sibling_score_threshold
            ):
                append = True

            if sibling.tag == "p":
                link_density = self.get_link_density(sibling)
                node_content = sibling.text or ""
                node_length = len(node_content)

                if node_length > 80 and link_density < 0.25:
                    append = True
                elif (
                    node_length <= 80
                    and link_density == 0
                    and re.search(r"\.( |$)", node_content)
                ):
                    append = True

            if append:
                # We don't want to append directly to output, but the div
                # in html->body->div
                if html_partial:
                    output.append(sibling)
                else:
                    output.getchildren()[0].getchildren()[0].append(sibling)
        # if output is not None:
        #     output.append(best_elem)
        return output

    def select_best_candidate(self, candidates):
        if not candidates:
            return None

        sorted_candidates = sorted(
            candidates.values(), key=lambda x: x["content_score"], reverse=True
        )
        for candidate in sorted_candidates[:5]:
            elem = candidate["elem"]
            log.debug("Top 5 : {:6.3f} {}".format(candidate["content_score"], describe(elem)))

        best_candidate = sorted_candidates[0]
        return best_candidate

    def get_link_density(self, elem):
        link_length = 0
        for i in elem.findall(".//a"):
            link_length += text_length(i)
        # if len(elem.findall(".//div") or elem.findall(".//p")):
        #     link_length = link_length
        total_length = text_length(elem)
        return float(link_length) / max(total_length, 1)
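
    # Worked example (illustrative figures, not from the original source): a
    # sibling <p> holding 200 characters of text, 50 of which sit inside <a>
    # tags, gets a link density of 50 / 200 = 0.25 from get_link_density().
    # With a best-candidate content_score of 100, get_article() keeps siblings
    # scoring at least max(10, 100 * 0.2) = 20. In score_paragraphs() below, a
    # 150-character paragraph containing two commas contributes
    # 1 + 3 + min(150 / 100, 3) = 5.5 to its parent's candidate score, and half
    # of that (2.75) to its grandparent's.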

    def score_paragraphs(self):
        MIN_LEN = self.min_text_length
        candidates = {}
        ordered = []
        for elem in self.tags(self._html(), "p", "pre", "td"):
            parent_node = elem.getparent()
            if parent_node is None:
                continue
            grand_parent_node = parent_node.getparent()

            inner_text = clean(elem.text_content() or "")
            inner_text_len = len(inner_text)

            # If this paragraph is less than 25 characters
            # don't even count it.
            if inner_text_len < MIN_LEN:
                continue

            if parent_node not in candidates:
                candidates[parent_node] = self.score_node(parent_node)
                ordered.append(parent_node)

            if grand_parent_node is not None and grand_parent_node not in candidates:
                candidates[grand_parent_node] = self.score_node(grand_parent_node)
                ordered.append(grand_parent_node)

            content_score = 1
            content_score += len(inner_text.split(","))
            content_score += min((inner_text_len / 100), 3)
            # if elem not in candidates:
            #     candidates[elem] = self.score_node(elem)

            # WTF? candidates[elem]['content_score'] += content_score
            candidates[parent_node]["content_score"] += content_score
            if grand_parent_node is not None:
                candidates[grand_parent_node]["content_score"] += content_score / 2.0

        # Scale the final candidates score based on link density. Good content
        # should have a relatively small link density (5% or less) and be
        # mostly unaffected by this operation.
        for elem in ordered:
            candidate = candidates[elem]
            ld = self.get_link_density(elem)
            score = candidate["content_score"]
            log.debug(
                "Branch %6.3f %s link density %.3f -> %6.3f"
                % (score, describe(elem), ld, score * (1 - ld))
            )
            candidate["content_score"] *= 1 - ld

        return candidates

    def class_weight(self, e):
        weight = 0
        for feature in [e.get("class", None), e.get("id", None)]:
            if feature:
                if REGEXES["negativeRe"].search(feature):
                    weight -= 25

                if REGEXES["positiveRe"].search(feature):
                    weight += 25

                if self.positive_keywords and self.positive_keywords.search(feature):
                    weight += 25

                if self.negative_keywords and self.negative_keywords.search(feature):
                    weight -= 25

        if self.positive_keywords and self.positive_keywords.match("tag-" + e.tag):
            weight += 25

        if self.negative_keywords and self.negative_keywords.match("tag-" + e.tag):
            weight -= 25

        return weight

    def score_node(self, elem):
        content_score = self.class_weight(elem)
        name = elem.tag.lower()
        if name in ["div", "article"]:
            content_score += 5
        elif name in ["pre", "td", "blockquote"]:
            content_score += 3
        elif name in ["address", "ol", "ul", "dl", "dd", "dt", "li", "form", "aside"]:
            content_score -= 3
        elif name in [
            "h1",
            "h2",
            "h3",
            "h4",
            "h5",
            "h6",
            "th",
            "header",
            "footer",
            "nav",
        ]:
            content_score -= 5
        return {"content_score": content_score, "elem": elem}

    def remove_unlikely_candidates(self):
        for elem in self.html.findall(".//*"):
            s = "{} {}".format(elem.get("class", ""), elem.get("id", ""))
            if len(s) < 2:
                continue
            if (
                REGEXES["unlikelyCandidatesRe"].search(s)
                and (not REGEXES["okMaybeItsACandidateRe"].search(s))
                and elem.tag not in ["html", "body"]
            ):
                log.debug("Removing unlikely candidate - %s" % describe(elem))
                elem.drop_tree()

    def transform_misused_divs_into_paragraphs(self):
        for elem in self.tags(self.html, "div"):
            # transform <div>s that do not contain other block elements into <p>s