├── .gitignore
├── README.md
└── extractor.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 |
5 | # C extensions
6 | *.so
7 |
8 | # Distribution / packaging
9 | .Python
10 | env/
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | *.egg-info/
23 | .installed.cfg
24 | *.egg
25 |
26 | # PyInstaller
27 | # Usually these files are written by a python script from a template
28 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
29 | *.manifest
30 | *.spec
31 |
32 | # Installer logs
33 | pip-log.txt
34 | pip-delete-this-directory.txt
35 |
36 | # Unit test / coverage reports
37 | htmlcov/
38 | .tox/
39 | .coverage
40 | .coverage.*
41 | .cache
42 | nosetests.xml
43 | coverage.xml
44 | *,cover
45 |
46 | # Translations
47 | *.mo
48 | *.pot
49 |
50 | # Django stuff:
51 | *.log
52 |
53 | # Sphinx documentation
54 | docs/_build/
55 |
56 | # PyBuilder
57 | target/
58 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # cix-extractor-py
2 |
3 | [cx-extractor](https://code.google.com/p/cx-extractor/) 的 Python 版本,提取网页正文,添加了[提取正文图片](http://blog.rainy.im/2015/09/02/web-content-and-main-image-extractor/)的功能。
4 |
5 |
6 |
--------------------------------------------------------------------------------
/extractor.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 |
4 | import requests as req
5 | import re
6 |
# Set to a truthy value to print debugging information during extraction.
DBUG = 0

# NOTE(review): the literals for reBODY, reCOMM and reIMG were partially lost
# (their HTML-like parts were stripped); they are reconstructed here from the
# way the Extractor class uses them. Confirm against the upstream project.

# Captures everything between the opening and closing <body> tags.
reBODY = re.compile(r'<body[^>]*>([\s\S]*?)<\/body>', re.I)
# HTML comments, including comments that span several lines.
reCOMM = r'<!--[\s\S]*?-->'
# Template for a whole element; format with the tag name ("script", "style").
reTRIM = r'<{0}.*?>([\s\S]*?)<\/{0}>'
# Any tag, plus horizontal whitespace. "\n" is deliberately NOT matched so
# that line structure survives tag stripping.
reTAG = r'<[\s\S]*?>|[ \t\r\f\v]'

# An <img> tag; group 1 captures the value of its src attribute.
reIMG = re.compile(r'<img[^>]*?\bsrc=[\'"]([^\'"]*)[\'"][^>]*>')
class Extractor:
    """Extract the main text of a web page using line-block text density.

    The page body is stripped of markup, split into lines, and scored with a
    sliding window of ``blockSize`` lines. The run of lines around the
    densest window is returned as the page content.

    Args:
        url: Address of the page to download.
        blockSize: Number of consecutive lines summed into one density block.
        timeout: Timeout (seconds) passed to the HTTP request.
        image: When true, ``reIMG`` matches are replaced by ``{{...}}``
            placeholders (built from the pattern's first capture group)
            before the remaining tags are stripped.
    """

    def __init__(self, url="", blockSize=3, timeout=5, image=False):
        self.url = url
        self.blockSize = blockSize
        self.timeout = timeout
        self.saveImage = image
        self.rawPage = ""
        self.body = ""
        self.ctexts = []
        self.textLens = []
        self.cblocks = []
        self.start = 0
        self.end = 0

    def getRawPage(self):
        """Download ``self.url`` and return ``(status_code, text)``.

        Exceptions raised by the HTTP request propagate unchanged.
        """
        resp = req.get(self.url, timeout=self.timeout)

        if DBUG:
            print(resp.encoding)

        # NOTE(review): the response encoding is overridden unconditionally,
        # so pages that are not UTF-8 may decode incorrectly.
        resp.encoding = "UTF-8"

        return resp.status_code, resp.text

    def processTags(self):
        """Remove comments, style/script elements and all tags from ``self.body``."""
        self.body = re.sub(reCOMM, "", self.body)
        for tag in ("style", "script"):
            self.body = re.sub(reTRIM.format(tag), "", self.body)
        # reTAG also drops spaces and tabs but keeps "\n"; processBlocks
        # depends on those newlines to split the text into lines.
        self.body = re.sub(reTAG, "", self.body)

    def processBlocks(self):
        """Return the densest run of text lines found in ``self.body``.

        Side effects: sets ``ctexts`` (lines), ``textLens`` (line lengths),
        ``cblocks`` (per-window length sums) and ``start``/``end`` (the
        half-open line range that was selected).

        If the text is too short to form a single window, the whole text is
        returned instead of failing.
        """
        self.ctexts = self.body.split("\n")
        self.textLens = [len(text) for text in self.ctexts]
        lines = len(self.ctexts)

        windows = lines - self.blockSize - 1
        if windows <= 0:
            # Not enough lines for even one block: nothing to rank, so keep
            # everything rather than calling max() on an empty list.
            self.cblocks = []
            self.start, self.end = 0, lines
            return "".join(self.ctexts)

        # cblocks[i] is the total text length of lines i .. i+blockSize-1.
        self.cblocks = [
            sum(self.textLens[i:i + self.blockSize]) for i in range(windows)
        ]

        maxTextLen = max(self.cblocks)

        if DBUG:
            print(maxTextLen)

        # A window counts as "sparse" when it is no longer than the shortest
        # line; computed once instead of on every loop iteration.
        threshold = min(self.textLens)

        # Grow outwards from the densest window until a sparse one is hit.
        self.start = self.end = self.cblocks.index(maxTextLen)
        while self.start > 0 and self.cblocks[self.start] > threshold:
            self.start -= 1
        # Bounded by the number of windows: the previous bound allowed an
        # index one past the end of cblocks.
        while self.end < windows and self.cblocks[self.end] > threshold:
            self.end += 1
        if self.end == windows:
            # No sparse window follows the dense region, so the content runs
            # to the end of the document.
            self.end = lines

        return "".join(self.ctexts[self.start:self.end])

    def processImages(self):
        """Replace every ``reIMG`` match in ``self.body`` with ``{{group 1}}``."""
        self.body = reIMG.sub(r'{{\1}}', self.body)

    def getContext(self):
        """Download the page and return its extracted main text.

        If the page has no ``reBODY`` match, the whole page is processed.
        """
        code, self.rawPage = self.getRawPage()
        bodies = re.findall(reBODY, self.rawPage)
        # Fall back to the full page instead of raising IndexError when the
        # document has no recognisable body element.
        self.body = bodies[0] if bodies else self.rawPage

        if DBUG:
            print(code, self.rawPage)

        if self.saveImage:
            self.processImages()
        self.processTags()
        return self.processBlocks()
if __name__ == '__main__':
    # Demo run: download one article and print the text that was extracted.
    ext = Extractor(
        url="http://blog.rainy.im/2015/09/02/web-content-and-main-image-extractor/",
        blockSize=5,
        image=False,
    )
    content = ext.getContext()
    print(content)
83 |
--------------------------------------------------------------------------------