├── .gitignore
├── README.md
├── extract_utils.py
└── htm_body_extractor.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.pyc
*.pyo
.idea

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
Web Page Body Extraction
===================
This is a Python implementation of the paper 《基于行块分布函数的通用网页正文抽取》 ("General Web Page Body Extraction Based on the Line-Block Distribution Function"). The paper targets body extraction for search engines and strips all tag elements; here I keep the tag elements in order to improve the reading experience.

#### Features:

* The body's tags are preserved
* Resource paths (images, hyperlinks, etc.) are rewritten to absolute URLs (even when the source page uses relative paths), so resources can still be resolved

#### Usage:

    from htm_body_extractor import BodyExtractor
    url = 'http://ballpo.com/detail/182560.html'
    be = BodyExtractor(url)
    be.execute()
    print be.body

#### Output:
>The agent admits that although Lazio striker Keita Balde Diao has just renewed his contract with the Biancocelesti, clubs from the Premier League still hold a strong interest in him.
>
>"Today, the clubs interested in Keita all know that if you want Lazio to let him go, you have to put a large sum of money on the table," the agent Ulisse Savini told TuttoMercatoWeb.com. "Nobody has called me, but we all know it very well: many clubs are interested in Keita, and that is no surprise. Besides Liverpool, who keep a close eye on him, there is also Manchester United."
>
>Finally, the agent explained that the 19-year-old former Barcelona player needs to obtain a Spanish passport before he can move to play in England, although that should not be a big problem.

#### TODO:

* Customizable styles
* Further improve extraction accuracy

--------------------------------------------------------------------------------
/extract_utils.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-

__author__ = 'liuzhijun'
import re
import urllib2
import gzip

try:
    from cStringIO import StringIO  # py2
except ImportError:
    from io import StringIO  # py3

# regular expression meta characters
meta_chars = [
    '+', '*', '?', '[', ']', '.', '{', '}', '(', ')'
]
meta_regex = '([' + '\\'.join(meta_chars) + '])'


def escape_regex_meta(text):
    """
    Escape regular expression meta characters in text so they match literally
    """
    return re.sub(meta_regex, lambda matchobj: '\\' + matchobj.group(), text)


def url_validate(url):
    regex = re.compile(
        r'^(?:http|ftp)s?://'  # http:// or https://
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'  # domain...
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|'  # ...or ipv4
        r'\[?[A-F0-9]*:[A-F0-9:]+\]?)'  # ...or ipv6
        r'(?::\d+)?'  # optional port
        r'(?:/?|[/?]\S+)$', re.IGNORECASE)
    return regex.match(url) is not None


def html_escape(text):
    """
    Decode common HTML entities back to plain characters
    """
    text = (text.replace("&quot;", "\"").replace("&ldquo;", "“").replace("&rdquo;", "”")
            .replace("&middot;", "·").replace("&rsquo;", "’").replace("&#8220;", "“")
            .replace("&#8221;", "”").replace("&mdash;", "——").replace("&hellip;", "…")
            .replace("&bull;", "·").replace("&#40;", "(").replace("&#41;", ")")
            .replace("&#183;", "·").replace("&amp;", "&").replace("&#8226;", "·")
            .replace("&lt;", "<").replace("&#60;", "<").replace("&gt;", ">")
            .replace("&#62;", ">").replace("&nbsp;", " ").replace("&#160;", " ")
            .replace("&tilde;", "~").replace("&#8212;", "—").replace("&copy;", "@")
            .replace("&#169;", "@").replace("&#9794;", ""))
    # normalize line endings (str.replace does not take a regex, so do it in two steps)
    text = text.replace("\r\n", "\n").replace("\r", "\n")
    return text


def get_html(url):
    assert url_validate(url), "invalid url"
    request = urllib2.Request(url)
    request.add_header("Accept-encoding", 'gzip')
    request.add_header("User-Agent", 'Mozilla/5.0 (Windows NT 6.2; WOW64) '
                                     'AppleWebKit/537.36 '
                                     '(KHTML, like Gecko) Chrome/34.0.1847.131')
    response = urllib2.urlopen(request)
    html = response.read()

    def encode(html):
        # decode to unicode and re-encode as utf-8; fall back to gbk for Chinese pages
        try:
            html = unicode(html, 'utf-8').encode('utf-8')
        except UnicodeDecodeError:
            html = unicode(html, 'gbk').encode('utf-8')
        return html

    if response.info().get("Content-Encoding") == 'gzip':
        buf = StringIO(html)
        f = gzip.GzipFile(fileobj=buf)
        html = f.read()
        f.close()
        buf.close()
    html = encode(html)
    return html
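

# A minimal usage sketch of the helpers above. The address below is only a placeholder,
# and network access is required for get_html() to actually return anything.
if __name__ == '__main__':
    demo_url = 'http://example.com/article.html'  # placeholder URL
    if url_validate(demo_url):
        page = get_html(demo_url)   # fetched, gunzipped if needed, re-encoded to utf-8
        page = html_escape(page)    # common HTML entities decoded to plain characters
        print page[:200]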
--------------------------------------------------------------------------------
/htm_body_extractor.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-

__author__ = 'liuzhijun'
import re
import os
from urlparse import urlparse
import extract_utils


class BodyExtractor(object):
    """
    url: the page address
    body: the extracted body content
    depth: the number of lines in a line block
    """

    def __init__(self, url):
        self.url = url
        self.domain = ''
        self.body = ''  # extracted body content
        self.depth = 3  # number of lines per line block
        self.html = ''
        self.plain_text = ''
        self.html_text = ''
        self.margin = 35  # prefix length used to locate the extracted text inside the tagged html; larger values match more precisely but run slower

    def execute(self):
        self._pre_process()
        self._extract()
        self._post_process()

    def _pre_process(self):
        html = extract_utils.get_html(self.url)
        self.html = html
        parsed_uri = urlparse(self.url)
        self.domain = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)

        plain_text, html_text = clean_html(self.html)
        self.html_text = html_text
        self.plain_text = plain_text

    def _post_process(self):
        """
        Rewrite relative resource links to full absolute URLs.
        Strip useless tag attributes such as class and style.
        """
        #TODO: strip useless tag attributes such as class

        def repl(match):
            s = match.group()
            # the matched path always starts with '/' and self.domain already ends with '/'
            return s.replace('="/', '="' + self.domain).replace("='/", "='" + self.domain)
        self.body = re.sub(r'(?:href=["\']/(.*?)["\'])|(?:src=["\']/(.*?)["\'])', repl, self.body)

    def _extract(self):
        lines = tuple(self.plain_text.split('\n'))
        # text length of each line (whitespace removed)
        len_per_lines = [len(re.sub(r'\s+', '', line)) for line in lines]

        # text length of each line block (a block is `depth` consecutive lines)
        len_per_blocks = []
        for i in range(len(len_per_lines) - self.depth + 1):
            word_len = sum([len_per_lines[j] for j in range(i, i + self.depth)])
            len_per_blocks.append(word_len)

        text_list = []
        text_begin_list = []
        text_end_list = []

        # collect every run of consecutive non-empty blocks as a candidate text region
        for i, value in enumerate(len_per_blocks):
            if value > 0:
                text_begin_list.append(i)
                tmp = lines[i]
                while i < len(len_per_blocks) and len_per_blocks[i] > 0:
                    i += 1
                    tmp += lines[i] + "\n"
                text_end_list.append(i)
                text_list.append(tmp)

        # the longest candidate region is taken as the body
        result = reduce(lambda str1, str2: str1 if len(str1) > len(str2) else str2, text_list)
        result = result.strip()
        i_start = self._start(result)
        i_end = self._end(result)
        if i_start == 0 or i_end == 0 or i_start > i_end:
            i_start = self._start(result, position=30) - 47
        if i_start < i_end:
            self.body = self.html_text[i_start:i_end]
        else:
            self.body = ''
        self.body = ''.join(self.body.splitlines())
        return self.body

    def _start(self, result, position=0):
        """
        Locate the start of the extracted plain text inside the tagged html,
        trying progressively shorter prefixes of `result`.
        """
        i_start = 0
        for i in range(self.margin)[::-1]:
            start = result[position:i + position]
            start = extract_utils.escape_regex_meta(start)
            p = re.compile(start, re.IGNORECASE)
            match = p.search(self.html_text)
            if match:
                s = match.group()
                i_start = self.html_text.index(s)
                break
        return i_start

    def _end(self, result):
        """
        Locate the end of the extracted plain text inside the tagged html,
        trying progressively shorter suffixes of `result`.
        """
        i_end = 0
        for i in range(1, self.margin)[::-1]:
            end = result[-i:]
            end = extract_utils.escape_regex_meta(end)
            p = re.compile(end, re.IGNORECASE)
            match = p.search(self.html_text)
            if match:
                s = match.group()
                i_end = self.html_text.index(s) + len(s)
                break
        return i_end
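

# A self-contained sketch (an assumed helper, not used by BodyExtractor) of the line-block
# idea that _extract() implements: strip whitespace from every line, sum the lengths of each
# group of `depth` consecutive lines, and keep the longest run of lines whose block sums stay
# above zero; that run is taken to be the body text.
def longest_line_block(plain_text, depth=3):
    lines = plain_text.split('\n')
    lens = [len(re.sub(r'\s+', '', line)) for line in lines]
    # block i covers lines i .. i + depth - 1
    blocks = [sum(lens[i:i + depth]) for i in range(len(lens) - depth + 1)]
    best, current = '', ''
    for i, block_len in enumerate(blocks):
        if block_len > 0:
            current += lines[i] + '\n'  # this line sits inside a non-empty block
        else:
            if len(current) > len(best):
                best = current
            current = ''
    return (current if len(current) > len(best) else best).strip()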


def clean_html(html):
    """
    Clean the html text and drop useless tags:
    1. "script", "style", comment and doctype blocks are removed entirely
    2. special characters are unescaped
    return: (plain_text, html_text): the pure text and the html text that keeps its tags
    """
    regex = re.compile(
        r'(?:<!DOCTYPE[\S\s]*?>)|'          # doctype
        r'(?:<head[\S\s]*?</head>)|'        # head
        r'(?:<!--[\S\s]*?-->)|'             # comment
        r'(?:<script[\S\s]*?</script>)|'    # js
        r'(?:<style[\S\s]*?</style>)', re.IGNORECASE)  # css

    html_text = regex.sub('', html)  # html text that keeps its tags
    plain_text = re.sub(r'<[\S\s]*?>', '', html_text)  # pure text with all tags removed
    html_text = extract_utils.html_escape(html_text)
    plain_text = extract_utils.html_escape(plain_text)
    return plain_text, html_text


if __name__ == "__main__":
    # url = "http://sports.sina.com.cn/j/2014-05-09/00227155725.shtml"
    # url = "http://sports.qq.com/a/20140509/011085.htm"
    # url = 'http://sports.sina.com.cn/j/2014-05-09/23267157241.shtml?from=hao123_sports_nq'
    # url = 'http://sports.ifeng.com/gnzq/zc/hengda/detail_2014_05/10/36245019_0.shtml'
    url = 'http://sports.sohu.com/20140509/n399370219.shtml'
    # url = 'http://sports.sina.com.cn/nba/2014-05-07/09207153447.shtml'
    # url = 'http://foofish.net/blog/73/stringio'
    # url = 'http://www.importnew.com/11309.html'
    # url = 'http://gd.qq.com/a/20140511/003265.htm?qq=0&ADUIN=253421576&ADSESSION=1399776075&ADTAG=CLIENT.QQ.5323_.0&ADPUBNO=26323'
    # url = 'http://gd.qq.com/a/20140511/009231.htm'
    url = 'http://sports.qq.com/a/20140510/018805.htm'
    # url = 'http://www.qwolf.com/?p=791'
    url = 'http://www.cnblogs.com/huxi/archive/2010/07/04/1771073.html'
    url = 'http://cn.uefa.com/memberassociations/news/newsid=2104522.html'
    url = 'http://cn.uefa.com/memberassociations/association=esp/news/newsid=2104513.html'
    # url = 'http://ballpo.com/detail/182560.html'  # OK
    url = 'http://news.arsenal.com.cn/html/a/3QEGT/'  # fairly ok
    # url = 'http://www.barca.cn/portal.php?mod=view&aid=1175'  # ok
    # url = 'http://www.usportnews.com/goal/pl/60288.html'  # ok
    # url = 'http://spurscn.com/forum.php?mod=viewthread&tid=3307'  # ok
    # url = 'http://www.mureds.com/thread-77077-1-1.html'  # ok
    # url = 'http://www.lfc.org.cn/Article/201309/20130905203950546.html'  # ok
    url = 'http://www.espnstar.com.cn/pub/international/2014/0422/323408.htm'  # ok
    # url = 'http://www.bvbfans.net/forum.php?mod=viewthread&tid=10403&extra=page%3D1'  # ok
    url = 'http://blog.sina.com.cn/s/blog_4e8581890102ep9u.html'
    # url = 'http://news.sina.com.cn/c/2014-05-13/110530125372.shtml'  # no
    url = 'http://www.oschina.net/news/51692/ubuntukylin-is-not-a-china-linux-system'  # ok
    url = 'http://joy2everyone.iteye.com/blog/930342'
    url = 'http://gd.qq.com/a/20140511/009231.htm'
    url = 'http://ballpo.com/detail/182560.html'
    te = BodyExtractor(url)
    te.execute()
    print te.body
    # print te.img
    # print te.title

--------------------------------------------------------------------------------
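
A small illustration of the absolute-path rewriting performed by `_post_process` (a sketch only; the domain and HTML fragment below are made up for the example):

    import re

    domain = 'http://ballpo.com/'
    body = '<img src="/static/logo.png"/><a href="/detail/182560.html">more</a>'

    def repl(match):
        # prepend the site domain to the matched root-relative path
        return match.group().replace('="/', '="' + domain)

    print re.sub(r'(?:href=["\']/(.*?)["\'])|(?:src=["\']/(.*?)["\'])', repl, body)
    # <img src="http://ballpo.com/static/logo.png"/><a href="http://ballpo.com/detail/182560.html">more</a>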