├── .gitignore ├── README.md └── extractor.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | *.egg-info/ 23 | .installed.cfg 24 | *.egg 25 | 26 | # PyInstaller 27 | # Usually these files are written by a python script from a template 28 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 29 | *.manifest 30 | *.spec 31 | 32 | # Installer logs 33 | pip-log.txt 34 | pip-delete-this-directory.txt 35 | 36 | # Unit test / coverage reports 37 | htmlcov/ 38 | .tox/ 39 | .coverage 40 | .coverage.* 41 | .cache 42 | nosetests.xml 43 | coverage.xml 44 | *,cover 45 | 46 | # Translations 47 | *.mo 48 | *.pot 49 | 50 | # Django stuff: 51 | *.log 52 | 53 | # Sphinx documentation 54 | docs/_build/ 55 | 56 | # PyBuilder 57 | target/ 58 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # cix-extractor-py 2 | 3 | [cx-extractor](https://code.google.com/p/cx-extractor/) 的 Python 版本,提取网页正文,添加了[提取正文图片](http://blog.rainy.im/2015/09/02/web-content-and-main-image-extractor/)的功能。 4 | 5 | 6 | -------------------------------------------------------------------------------- /extractor.py: -------------------------------------------------------------------------------- 1 | #! 
# -*- coding: utf-8 -*-
"""Extract the main text (and optionally image URLs) from a web page.

The page body is stripped of comments, scripts, styles and tags, split into
lines, and the run of lines around the densest ``blockSize``-line window is
returned as the page content.
"""

import re

try:
    import requests as req
except ImportError:
    # Only getRawPage() touches the network; the parsing helpers stay usable
    # (and importable) when requests is not installed.
    req = None

DBUG = 0

# Captures everything between the opening and closing <body> tags.
reBODY = re.compile(r'<body.*?>([\s\S]*?)<\/body>', re.I)
# HTML comments, including comments that span several lines.
reCOMM = r'<!--[\s\S]*?-->'
# Template for a paired element (formatted with the tag name) and its content.
reTRIM = r'<{0}.*?>([\s\S]*?)<\/{0}>'
# Any remaining tag plus horizontal whitespace. Newlines are left in place
# because processBlocks() splits the text on "\n".
reTAG = r'<[\s\S]*?>|[ \t\r\f\v]'

# Captures the value of the src attribute of an <img> tag (group 1).
reIMG = re.compile(r'<img\b[^>]*?\ssrc=[\'"]([^\'"]*)[\'"][^>]*>', re.I)


class Extractor():
    """Download a page and extract its main text block.

    Args:
        url: Address of the page to download.
        blockSize: Number of consecutive lines summed into one window when
            looking for the densest part of the page.
        timeout: Timeout passed to ``requests.get``.
        image: When true, ``<img>`` tags are replaced by ``{{src}}``
            placeholders before tags are stripped, so image addresses survive
            in the extracted text.
    """

    def __init__(self, url="", blockSize=3, timeout=5, image=False):
        self.url = url
        self.blockSize = blockSize
        self.timeout = timeout
        self.saveImage = image
        self.rawPage = ""
        self.body = ""
        self.ctexts = []    # lines of the tag-free body
        self.textLens = []  # length of every entry of ctexts
        self.cblocks = []   # summed length of each blockSize-line window
        self.start = self.end = 0

    def getRawPage(self):
        """Download ``self.url`` and return ``(status_code, text)``.

        Raises:
            ImportError: if the ``requests`` package is not installed.
            Exception: whatever ``requests.get`` raises is propagated as is.
        """
        if req is None:
            raise ImportError("the 'requests' package is required to download pages")

        resp = req.get(self.url, timeout=self.timeout)

        if DBUG:
            print(resp.encoding)

        # The response is always decoded as UTF-8, whatever the server declared.
        resp.encoding = "UTF-8"

        return resp.status_code, resp.text

    def processTags(self):
        """Strip comments, style/script elements, tags and spaces from ``self.body``."""
        text = re.sub(reCOMM, "", self.body)
        for tag in ("style", "script"):
            # Tag names are case-insensitive in HTML, so match <SCRIPT> as well.
            text = re.sub(reTRIM.format(tag), "", text, flags=re.I)
        self.body = re.sub(reTAG, "", text)

    def processBlocks(self):
        """Return the densest run of lines of ``self.body`` joined together.

        ``self.cblocks[i]`` holds the total length of lines
        ``i .. i + blockSize - 1``. Starting from the window with the largest
        total, the selection grows in both directions until a window is no
        longer than the shortest line of the page. ``self.start`` and
        ``self.end`` are set to the selected slice of ``self.ctexts``.

        Pages too short to form a single window are returned whole.
        """
        self.ctexts = self.body.split("\n")
        self.textLens = [len(text) for text in self.ctexts]
        lines = len(self.ctexts)

        window_count = max(lines - self.blockSize - 1, 0)
        self.cblocks = [
            sum(self.textLens[i:i + self.blockSize]) for i in range(window_count)
        ]

        if not self.cblocks:
            # Not enough lines for even one window: keep everything.
            self.start, self.end = 0, lines
            return "".join(self.ctexts)

        maxTextLen = max(self.cblocks)

        if DBUG:
            print(maxTextLen)

        threshold = min(self.textLens)  # loop-invariant, computed once
        self.start = self.end = self.cblocks.index(maxTextLen)
        while self.start > 0 and self.cblocks[self.start] > threshold:
            self.start -= 1
        # Bounded by the number of windows so cblocks is never indexed past
        # its end.
        while self.end < window_count and self.cblocks[self.end] > threshold:
            self.end += 1
        if self.end == window_count:
            # The dense run never dropped back to the threshold, so the
            # content reaches the end of the page: keep the trailing lines
            # too, mirroring how start stops at the first line.
            self.end = lines

        return "".join(self.ctexts[self.start:self.end])

    def processImages(self):
        """Replace every ``<img>`` tag in ``self.body`` by ``{{src}}``."""
        self.body = reIMG.sub(r'{{\1}}', self.body)

    def getContext(self):
        """Download the page and return its extracted main text.

        When the page has no ``<body>`` element the whole document is
        processed instead.
        """
        code, self.rawPage = self.getRawPage()

        if DBUG:
            print(code, self.rawPage)

        found = reBODY.search(self.rawPage)
        self.body = found.group(1) if found else self.rawPage

        if self.saveImage:
            self.processImages()
        self.processTags()
        return self.processBlocks()


if __name__ == '__main__':
    ext = Extractor(url="http://blog.rainy.im/2015/09/02/web-content-and-main-image-extractor/", blockSize=5, image=False)
    print(ext.getContext())